From efef0bc4f131d86fa2d1b35d013fe8ced437c5ba Mon Sep 17 00:00:00 2001 From: John Pangas Date: Fri, 12 Jan 2024 21:04:56 +0300 Subject: [PATCH 01/59] Create spamcomment model --- bugbug/models/__init__.py | 1 + bugbug/models/spamcomment.py | 150 +++++++++++++++++++++++++++++++++++ 2 files changed, 151 insertions(+) create mode 100644 bugbug/models/spamcomment.py diff --git a/bugbug/models/__init__.py b/bugbug/models/__init__.py index 075c885419..d3c94be889 100644 --- a/bugbug/models/__init__.py +++ b/bugbug/models/__init__.py @@ -28,6 +28,7 @@ "regressionrange": "bugbug.models.regressionrange.RegressionRangeModel", "regressor": "bugbug.models.regressor.RegressorModel", "spambug": "bugbug.models.spambug.SpamBugModel", + "spamcomment": "bugbug.models.spamcomment.SpamCommentModel", "stepstoreproduce": "bugbug.models.stepstoreproduce.StepsToReproduceModel", "testlabelselect": "bugbug.models.testselect.TestLabelSelectModel", "testgroupselect": "bugbug.models.testselect.TestGroupSelectModel", diff --git a/bugbug/models/spamcomment.py b/bugbug/models/spamcomment.py new file mode 100644 index 0000000000..c39f493ab1 --- /dev/null +++ b/bugbug/models/spamcomment.py @@ -0,0 +1,150 @@ +# -*- coding: utf-8 -*- +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this file, +# You can obtain one at http://mozilla.org/MPL/2.0/. + +import logging + +import xgboost +from imblearn.over_sampling import BorderlineSMOTE +from imblearn.pipeline import Pipeline as ImblearnPipeline +from sklearn.compose import ColumnTransformer +from sklearn.feature_extraction import DictVectorizer +from sklearn.pipeline import Pipeline + +from bugbug import bug_features, bugzilla, feature_cleanup, utils +from bugbug.model import BugModel + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +class SpamCommentModel(BugModel): + def __init__(self, lemmatization=True): + BugModel.__init__(self, lemmatization) + + self.calculate_importance = False + + feature_extractors = [ + bug_features.HasSTR(), + bug_features.HasRegressionRange(), + bug_features.Severity(), + bug_features.HasCrashSignature(), + bug_features.HasURL(), + bug_features.Whiteboard(), + bug_features.Product(), + # TODO: We would like to use the component at the time of filing too, + # but we can't because the rollback script doesn't support changes to + # components yet. + # bug_features.component(), + bug_features.NumWordsComments(), + bug_features.Keywords(), + bug_features.Priority(), + bug_features.Version(), + bug_features.TargetMilestone(), + bug_features.HasAttachment(), + bug_features.Platform(), + bug_features.OpSys(), + bug_features.FiledVia(), + ] + + cleanup_functions = [ + feature_cleanup.fileref(), + feature_cleanup.url(), + feature_cleanup.synonyms(), + ] + + self.extraction_pipeline = Pipeline( + [ + ( + "bug_extractor", + bug_features.BugExtractor( + feature_extractors, cleanup_functions, rollback=True + ), + ), + ] + ) + + self.clf = ImblearnPipeline( + [ + ( + "union", + ColumnTransformer( + [ + ("data", DictVectorizer(), "data"), + ("title", self.text_vectorizer(min_df=0.0001), "title"), + ( + "comments", + self.text_vectorizer(min_df=0.0001), + "comments", + ), + ] + ), + ), + ("sampler", BorderlineSMOTE(random_state=0)), + ( + "estimator", + xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count()), + ), + ] + ) + + def get_labels(self): + classes = {} + + for bug_data in bugzilla.get_bugs(include_invalid=True): + bug_id = bug_data["id"] + + # Skip bugs filed by Mozillians, since we are sure they are not spam. + if "@mozilla" in bug_data["creator"]: + continue + + # A bug that was moved out of 'Invalid Bugs' is definitely a legitimate bug. + for history in bug_data["history"]: + for change in history["changes"]: + if ( + change["field_name"] == "product" + and change["removed"] == "Invalid Bugs" + ): + classes[bug_id] = 0 + + # A fixed bug is definitely a legitimate bug. + if bug_data["resolution"] == "FIXED": + classes[bug_id] = 0 + + # A bug in the 'Invalid Bugs' product is definitely a spam bug. + elif bug_data["product"] == "Invalid Bugs": + classes[bug_id] = 1 + + logger.info( + "%d bugs are classified as non-spam", + sum(label == 0 for label in classes.values()), + ) + logger.info( + "%d bugs are classified as spam", + sum(label == 1 for label in classes.values()), + ) + + return classes, [0, 1] + + def items_gen(self, classes): + # Overwriting this method to add include_invalid=True to get_bugs to + # include spam bugs. + return ( + (bug, classes[bug["id"]]) + for bug in bugzilla.get_bugs(include_invalid=True) + if bug["id"] in classes + ) + + def get_feature_names(self): + return self.clf.named_steps["union"].get_feature_names_out() + + def overwrite_classes(self, bugs, classes, probabilities): + for i, bug in enumerate(bugs): + if "@mozilla" in bug["creator"]: + if probabilities: + classes[i] = [1.0, 0.0] + else: + classes[i] = 0 + + return classes From bff58c545e6fd1142ecd12502760709a20d68ecd Mon Sep 17 00:00:00 2001 From: John Pangas Date: Mon, 15 Jan 2024 18:08:24 +0300 Subject: [PATCH 02/59] Add New Features --- bugbug/bug_features.py | 36 ++++++++++++++++++++++++++++++++++++ bugbug/models/spamcomment.py | 3 ++- 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/bugbug/bug_features.py b/bugbug/bug_features.py index 82a1dfb5f4..35d78947f2 100644 --- a/bugbug/bug_features.py +++ b/bugbug/bug_features.py @@ -879,3 +879,39 @@ def __call__( for is_type in self.bug_type_extractors if is_type(bug, bug_map) ] + + +class CommentHasLink(SingleBugFeature): + pass + + +class Commenter(SingleBugFeature): + pass + + +class CommenterNumOfComments(SingleBugFeature): + pass + + +class CommentHasKeywords(SingleBugFeature): + # Safekeywords + spam_keywords = ["free", "win", "discount", "limited time", "casino", "rent"] + pass + + +class LengthofComment(SingleBugFeature): + # Extremely short of long comments + pass + + +class TimeCommentWasPosted(SingleBugFeature): + # Time between when comment was posted and when user created account + pass + + +class CommenterIsBugAuthor(SingleBugFeature): + pass + + +class TimeDifferenceWithPreviousComment(SingleBugFeature): + pass diff --git a/bugbug/models/spamcomment.py b/bugbug/models/spamcomment.py index c39f493ab1..07c99b780d 100644 --- a/bugbug/models/spamcomment.py +++ b/bugbug/models/spamcomment.py @@ -46,6 +46,7 @@ def __init__(self, lemmatization=True): bug_features.Platform(), bug_features.OpSys(), bug_features.FiledVia(), + # Use Commenter Experience Too ] cleanup_functions = [ @@ -95,7 +96,7 @@ def get_labels(self): for bug_data in bugzilla.get_bugs(include_invalid=True): bug_id = bug_data["id"] - # Skip bugs filed by Mozillians, since we are sure they are not spam. + # Skip comments filed by Mozillians and bots, since we are sure they are not spam. if "@mozilla" in bug_data["creator"]: continue From 61b0fe0bb239433c0ed244a1c8f03e39dbd76559 Mon Sep 17 00:00:00 2001 From: John Pangas Date: Thu, 18 Jan 2024 20:14:12 +0300 Subject: [PATCH 03/59] Include new features and change spamcom --- bugbug/bug_features.py | 3 ++- bugbug/models/spamcomment.py | 36 ++++++++++++------------------------ 2 files changed, 14 insertions(+), 25 deletions(-) diff --git a/bugbug/bug_features.py b/bugbug/bug_features.py index 35d78947f2..b2cef1a6d9 100644 --- a/bugbug/bug_features.py +++ b/bugbug/bug_features.py @@ -885,7 +885,8 @@ class CommentHasLink(SingleBugFeature): pass -class Commenter(SingleBugFeature): +class CommenterExperience(SingleBugFeature): + # The amount of time it took between posting comment and creating account pass diff --git a/bugbug/models/spamcomment.py b/bugbug/models/spamcomment.py index 07c99b780d..c32b1dbf87 100644 --- a/bugbug/models/spamcomment.py +++ b/bugbug/models/spamcomment.py @@ -46,7 +46,6 @@ def __init__(self, lemmatization=True): bug_features.Platform(), bug_features.OpSys(), bug_features.FiledVia(), - # Use Commenter Experience Too ] cleanup_functions = [ @@ -93,36 +92,25 @@ def __init__(self, lemmatization=True): def get_labels(self): classes = {} - for bug_data in bugzilla.get_bugs(include_invalid=True): - bug_id = bug_data["id"] + for bug in bugzilla.get_bugs(include_invalid=True): + for comment in bug["comments"]: + comment_id = comment["id"] - # Skip comments filed by Mozillians and bots, since we are sure they are not spam. - if "@mozilla" in bug_data["creator"]: - continue + # Skip comments filed by Mozillians and bots, since we are sure they are not spam. + if "@mozilla" in comment["creator"]: + continue - # A bug that was moved out of 'Invalid Bugs' is definitely a legitimate bug. - for history in bug_data["history"]: - for change in history["changes"]: - if ( - change["field_name"] == "product" - and change["removed"] == "Invalid Bugs" - ): - classes[bug_id] = 0 - - # A fixed bug is definitely a legitimate bug. - if bug_data["resolution"] == "FIXED": - classes[bug_id] = 0 - - # A bug in the 'Invalid Bugs' product is definitely a spam bug. - elif bug_data["product"] == "Invalid Bugs": - classes[bug_id] = 1 + if "spam" in comment["tags"]: + classes[comment_id] = 1 + else: + classes[comment_id] = 0 logger.info( - "%d bugs are classified as non-spam", + "%d comments are classified as non-spam", sum(label == 0 for label in classes.values()), ) logger.info( - "%d bugs are classified as spam", + "%d comments are classified as spam", sum(label == 1 for label in classes.values()), ) From e31fa756c86977bf7fdf938fc7097146dd4e3309 Mon Sep 17 00:00:00 2001 From: Suhaib Mujahid Date: Fri, 19 Jan 2024 08:56:09 -0500 Subject: [PATCH 04/59] Version 0.0.534 --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index 3e2afc6b54..72a58189d9 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.0.533 +0.0.534 From d365ad3e1139217af80ac1b18ad50a26c169406d Mon Sep 17 00:00:00 2001 From: John Pangas Date: Tue, 23 Jan 2024 16:01:09 +0300 Subject: [PATCH 05/59] Create comments extractor --- bugbug/comment_features.py | 133 +++++++++++++++++++++++++++++++++++ bugbug/models/spamcomment.py | 55 ++++++--------- 2 files changed, 153 insertions(+), 35 deletions(-) create mode 100644 bugbug/comment_features.py diff --git a/bugbug/comment_features.py b/bugbug/comment_features.py new file mode 100644 index 0000000000..f228d5e685 --- /dev/null +++ b/bugbug/comment_features.py @@ -0,0 +1,133 @@ +# -*- coding: utf-8 -*- +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this file, +# You can obtain one at http://mozilla.org/MPL/2.0/. + +import sys +from collections import defaultdict +from typing import Any + +import pandas as pd +from sklearn.base import BaseEstimator, TransformerMixin + + +class CommentFeature(object): + pass + + +class CommentExtractor(BaseEstimator, TransformerMixin): + def __init__( + self, + feature_extractors, + cleanup_functions, + ): + assert len(set(type(fe) for fe in feature_extractors)) == len( + feature_extractors + ), "Duplicate Feature Extractors" + self.feature_extractors = feature_extractors + + assert len(set(type(cf) for cf in cleanup_functions)) == len( + cleanup_functions + ), "Duplicate Cleanup Functions" + self.cleanup_functions = cleanup_functions + + def fit(self, x, y=None): + for feature in self.feature_extractors: + if hasattr(feature, "fit"): + feature.fit(x()) + + return self + + def transform(self, comments): + comments_iter = iter(comments()) + + commenter_experience_map = defaultdict(int) + + def apply_transform(comment): + data = {} + + for feature_extractor in self.feature_extractors: + res = feature_extractor( + comment, + commenter_experience=commenter_experience_map[comment["creator"]], + ) + + if hasattr(feature_extractor, "name"): + feature_extractor_name = feature_extractor.name + else: + feature_extractor_name = feature_extractor.__class__.__name__ + + if res is None: + continue + + if isinstance(res, (list, set)): + for item in res: + data[sys.intern(f"{item} in {feature_extractor_name}")] = True + continue + + data[feature_extractor_name] = res + + commenter_experience_map[comment["creator"]] += 1 + + comment_text = comment["text"] + for cleanup_function in self.cleanup_functions: + comment_text = cleanup_function(comment_text) + + return { + "data": data, + "comment_text": comment_text, + } + + return pd.DataFrame(apply_transform(comment) for comment in comments_iter) + + +class CommenterExperience(CommentFeature): + name = "#of Comments made by Commenter before" + + def __call__(self, comment, commenter_experience, **kwargs): + return commenter_experience + + +class CommentTextHasKeywords(CommentFeature): + name = "Comment Has Certain Keywords" + + def __init__(self, keywords=set()): + self.keywords = keywords + + def __call__(self, comment, **kwargs): + return any(keyword in comment["text"].lower() for keyword in self.keywords) + + +class CommentTags(CommentFeature): + name = "Comment Tags" + + def __call__(self, comment, **kwargs): + pass + + +class CommentHasLink(CommentFeature): + name = "Comment Has a Link" + + def __call__(self, comment, **kwargs) -> Any: + return "http" in comment["text"] + + +class LengthofComment(CommentFeature): + name = "Length of Comment" + + def __call__(self, comment, **kwargs): + return len(comment["text"]) + + +class TimeCommentWasPosted(CommentFeature): + name = "Time Comment Was Posted" + + def __call__(self, comment, **kwargs): + pass + + +class TimeDifferenceWithPreviousComment(CommentFeature): + name = "Time Difference With Previous Comment" + + def __call__(self, comment, prev_comment_time, **kwargs): + pass diff --git a/bugbug/models/spamcomment.py b/bugbug/models/spamcomment.py index c32b1dbf87..5712f4c9d1 100644 --- a/bugbug/models/spamcomment.py +++ b/bugbug/models/spamcomment.py @@ -6,13 +6,13 @@ import logging import xgboost -from imblearn.over_sampling import BorderlineSMOTE from imblearn.pipeline import Pipeline as ImblearnPipeline +from imblearn.under_sampling import RandomUnderSampler from sklearn.compose import ColumnTransformer from sklearn.feature_extraction import DictVectorizer from sklearn.pipeline import Pipeline -from bugbug import bug_features, bugzilla, feature_cleanup, utils +from bugbug import bugzilla, comment_features, feature_cleanup, utils from bugbug.model import BugModel logging.basicConfig(level=logging.INFO) @@ -26,26 +26,11 @@ def __init__(self, lemmatization=True): self.calculate_importance = False feature_extractors = [ - bug_features.HasSTR(), - bug_features.HasRegressionRange(), - bug_features.Severity(), - bug_features.HasCrashSignature(), - bug_features.HasURL(), - bug_features.Whiteboard(), - bug_features.Product(), - # TODO: We would like to use the component at the time of filing too, - # but we can't because the rollback script doesn't support changes to - # components yet. - # bug_features.component(), - bug_features.NumWordsComments(), - bug_features.Keywords(), - bug_features.Priority(), - bug_features.Version(), - bug_features.TargetMilestone(), - bug_features.HasAttachment(), - bug_features.Platform(), - bug_features.OpSys(), - bug_features.FiledVia(), + comment_features.CommenterExperience(), + comment_features.CommentHasLink(), + comment_features.CommentTextHasKeywords( + {"free", "win", "discount", "limited time", "casino", "rent"} + ), ] cleanup_functions = [ @@ -57,9 +42,9 @@ def __init__(self, lemmatization=True): self.extraction_pipeline = Pipeline( [ ( - "bug_extractor", - bug_features.BugExtractor( - feature_extractors, cleanup_functions, rollback=True + "comment_extractor", + comment_features.CommentExtractor( + feature_extractors, cleanup_functions ), ), ] @@ -72,16 +57,15 @@ def __init__(self, lemmatization=True): ColumnTransformer( [ ("data", DictVectorizer(), "data"), - ("title", self.text_vectorizer(min_df=0.0001), "title"), ( - "comments", + "comment_text", self.text_vectorizer(min_df=0.0001), - "comments", + "comment_text", ), ] ), ), - ("sampler", BorderlineSMOTE(random_state=0)), + ("sampler", RandomUnderSampler(random_state=0)), ( "estimator", xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count()), @@ -118,19 +102,20 @@ def get_labels(self): def items_gen(self, classes): # Overwriting this method to add include_invalid=True to get_bugs to - # include spam bugs. + # include spam bugs which may have spam comments. return ( - (bug, classes[bug["id"]]) + (comment, classes[comment["id"]]) for bug in bugzilla.get_bugs(include_invalid=True) - if bug["id"] in classes + for comment in bug["comments"] + if comment["id"] in classes ) def get_feature_names(self): return self.clf.named_steps["union"].get_feature_names_out() - def overwrite_classes(self, bugs, classes, probabilities): - for i, bug in enumerate(bugs): - if "@mozilla" in bug["creator"]: + def overwrite_classes(self, comments, classes, probabilities): + for i, comment in enumerate(comments): + if "@mozilla" in comment["creator"]: if probabilities: classes[i] = [1.0, 0.0] else: From 9ce864a9c39e97195f2cc721442d3da6b7b2a6df Mon Sep 17 00:00:00 2001 From: John Pangas Date: Tue, 23 Jan 2024 16:02:11 +0300 Subject: [PATCH 06/59] Remove comment features from Bug Features --- bugbug/bug_features.py | 37 ------------------------------------- 1 file changed, 37 deletions(-) diff --git a/bugbug/bug_features.py b/bugbug/bug_features.py index b2cef1a6d9..82a1dfb5f4 100644 --- a/bugbug/bug_features.py +++ b/bugbug/bug_features.py @@ -879,40 +879,3 @@ def __call__( for is_type in self.bug_type_extractors if is_type(bug, bug_map) ] - - -class CommentHasLink(SingleBugFeature): - pass - - -class CommenterExperience(SingleBugFeature): - # The amount of time it took between posting comment and creating account - pass - - -class CommenterNumOfComments(SingleBugFeature): - pass - - -class CommentHasKeywords(SingleBugFeature): - # Safekeywords - spam_keywords = ["free", "win", "discount", "limited time", "casino", "rent"] - pass - - -class LengthofComment(SingleBugFeature): - # Extremely short of long comments - pass - - -class TimeCommentWasPosted(SingleBugFeature): - # Time between when comment was posted and when user created account - pass - - -class CommenterIsBugAuthor(SingleBugFeature): - pass - - -class TimeDifferenceWithPreviousComment(SingleBugFeature): - pass From 77d534df70c471dc2c87d4bc6e6ed29a91106b75 Mon Sep 17 00:00:00 2001 From: John Pangas Date: Thu, 25 Jan 2024 01:27:36 +0300 Subject: [PATCH 07/59] Add New features --- bugbug/comment_features.py | 18 +++++++++--------- bugbug/model.py | 17 +++++++++++++++++ bugbug/models/spamcomment.py | 15 ++++++++++----- 3 files changed, 36 insertions(+), 14 deletions(-) diff --git a/bugbug/comment_features.py b/bugbug/comment_features.py index f228d5e685..95e3b403bf 100644 --- a/bugbug/comment_features.py +++ b/bugbug/comment_features.py @@ -98,13 +98,6 @@ def __call__(self, comment, **kwargs): return any(keyword in comment["text"].lower() for keyword in self.keywords) -class CommentTags(CommentFeature): - name = "Comment Tags" - - def __call__(self, comment, **kwargs): - pass - - class CommentHasLink(CommentFeature): name = "Comment Has a Link" @@ -126,8 +119,15 @@ def __call__(self, comment, **kwargs): pass -class TimeDifferenceWithPreviousComment(CommentFeature): - name = "Time Difference With Previous Comment" +class TimeDifferenceCommentAccountCreation(CommentFeature): + name = "Time Difference Between Account Creation and when Comment was Made " def __call__(self, comment, prev_comment_time, **kwargs): pass + + +class CommentTags(CommentFeature): + name = "Comment Tags" + + def __call__(self, comment, **kwargs): + pass diff --git a/bugbug/model.py b/bugbug/model.py index 92de0c34f2..2f8c469639 100644 --- a/bugbug/model.py +++ b/bugbug/model.py @@ -216,6 +216,8 @@ def get_human_readable_feature_names(self): feature_name = f"Comments contain '{feature_name}'" elif type_ == "text": feature_name = f"Combined text contains '{feature_name}'" + elif type_ == "comment_text": + feature_name = f"Comment text contains '{feature_name}'" elif type_ == "files": feature_name = f"File '{feature_name}'" elif type_ not in ("data", "couple_data"): @@ -803,3 +805,18 @@ def items_gen(self, classes): continue yield issue, classes[issue_number] + + +class CommentModel(Model): + def __init__(self, lemmatization=False): + Model.__init__(self, lemmatization) + self.training_dbs = [bugzilla.BUGS_DB] + + def items_gen(self, classes): + for bug in bugzilla.get_bugs(): + for comment in bug["comments"]: + comment_id = comment["id"] + if comment["id"] not in classes: + continue + + yield comment, classes[comment_id] diff --git a/bugbug/models/spamcomment.py b/bugbug/models/spamcomment.py index 5712f4c9d1..c6e32e31d1 100644 --- a/bugbug/models/spamcomment.py +++ b/bugbug/models/spamcomment.py @@ -13,15 +13,15 @@ from sklearn.pipeline import Pipeline from bugbug import bugzilla, comment_features, feature_cleanup, utils -from bugbug.model import BugModel +from bugbug.model import CommentModel logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) -class SpamCommentModel(BugModel): +class SpamCommentModel(CommentModel): def __init__(self, lemmatization=True): - BugModel.__init__(self, lemmatization) + CommentModel.__init__(self, lemmatization) self.calculate_importance = False @@ -65,7 +65,12 @@ def __init__(self, lemmatization=True): ] ), ), - ("sampler", RandomUnderSampler(random_state=0)), + ( + "sampler", + RandomUnderSampler( + random_state=0, sampling_strategy="not minority" + ), + ), ( "estimator", xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count()), @@ -102,7 +107,7 @@ def get_labels(self): def items_gen(self, classes): # Overwriting this method to add include_invalid=True to get_bugs to - # include spam bugs which may have spam comments. + # include spam bugs which have a number of spam comments. return ( (comment, classes[comment["id"]]) for bug in bugzilla.get_bugs(include_invalid=True) From 73f74a4200f46680e4a58186d041af95a235cdc3 Mon Sep 17 00:00:00 2001 From: John Pangas Date: Thu, 25 Jan 2024 16:18:32 +0300 Subject: [PATCH 08/59] Refine Link feature --- bugbug/comment_features.py | 31 +++++++++++++++++-------------- bugbug/models/spamcomment.py | 6 ++---- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/bugbug/comment_features.py b/bugbug/comment_features.py index 95e3b403bf..60fff935e4 100644 --- a/bugbug/comment_features.py +++ b/bugbug/comment_features.py @@ -3,6 +3,7 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this file, # You can obtain one at http://mozilla.org/MPL/2.0/. +import re import sys from collections import defaultdict from typing import Any @@ -82,27 +83,20 @@ def apply_transform(comment): class CommenterExperience(CommentFeature): - name = "#of Comments made by Commenter before" + name = "# of Comments made by Commenter in the past" def __call__(self, comment, commenter_experience, **kwargs): return commenter_experience -class CommentTextHasKeywords(CommentFeature): - name = "Comment Has Certain Keywords" - - def __init__(self, keywords=set()): - self.keywords = keywords - - def __call__(self, comment, **kwargs): - return any(keyword in comment["text"].lower() for keyword in self.keywords) - - class CommentHasLink(CommentFeature): name = "Comment Has a Link" + # We check for links that are not from Mozilla + url_pattern = re.compile(r"http[s]?://(?!mozilla\.org|mozilla\.com)\S+") + def __call__(self, comment, **kwargs) -> Any: - return "http" in comment["text"] + return bool(self.url_pattern.search(comment["text"])) class LengthofComment(CommentFeature): @@ -122,12 +116,21 @@ def __call__(self, comment, **kwargs): class TimeDifferenceCommentAccountCreation(CommentFeature): name = "Time Difference Between Account Creation and when Comment was Made " - def __call__(self, comment, prev_comment_time, **kwargs): + def __call__(self, comment, account_creation_time, **kwargs): pass class CommentTags(CommentFeature): name = "Comment Tags" + def __init__(self, to_ignore=set()): + self.to_ignore = to_ignore + def __call__(self, comment, **kwargs): - pass + tags = [] + for tag in comment["tags"]: + if tag in self.to_ignore: + continue + + tags.append(tag) + return tags diff --git a/bugbug/models/spamcomment.py b/bugbug/models/spamcomment.py index c6e32e31d1..cf0449b887 100644 --- a/bugbug/models/spamcomment.py +++ b/bugbug/models/spamcomment.py @@ -28,9 +28,7 @@ def __init__(self, lemmatization=True): feature_extractors = [ comment_features.CommenterExperience(), comment_features.CommentHasLink(), - comment_features.CommentTextHasKeywords( - {"free", "win", "discount", "limited time", "casino", "rent"} - ), + comment_features.LengthofComment(), ] cleanup_functions = [ @@ -59,7 +57,7 @@ def __init__(self, lemmatization=True): ("data", DictVectorizer(), "data"), ( "comment_text", - self.text_vectorizer(min_df=0.0001), + self.text_vectorizer(min_df=0.001), "comment_text", ), ] From 2d6548936e4bad29d2a3fdc10e396408d1dcc7dc Mon Sep 17 00:00:00 2001 From: John Pangas Date: Mon, 29 Jan 2024 12:41:16 +0300 Subject: [PATCH 09/59] Test with TomekLinks --- bugbug/models/spamcomment.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/bugbug/models/spamcomment.py b/bugbug/models/spamcomment.py index cf0449b887..88ee745c91 100644 --- a/bugbug/models/spamcomment.py +++ b/bugbug/models/spamcomment.py @@ -7,7 +7,7 @@ import xgboost from imblearn.pipeline import Pipeline as ImblearnPipeline -from imblearn.under_sampling import RandomUnderSampler +from imblearn.under_sampling import TomekLinks from sklearn.compose import ColumnTransformer from sklearn.feature_extraction import DictVectorizer from sklearn.pipeline import Pipeline @@ -65,9 +65,7 @@ def __init__(self, lemmatization=True): ), ( "sampler", - RandomUnderSampler( - random_state=0, sampling_strategy="not minority" - ), + TomekLinks(), ), ( "estimator", From 501a89fec79edfc4dcd0192c558ce8383774bdd6 Mon Sep 17 00:00:00 2001 From: John Pangas Date: Mon, 29 Jan 2024 19:08:05 +0300 Subject: [PATCH 10/59] Change df in text vectorizer --- bugbug/models/spamcomment.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bugbug/models/spamcomment.py b/bugbug/models/spamcomment.py index 88ee745c91..066c69c7de 100644 --- a/bugbug/models/spamcomment.py +++ b/bugbug/models/spamcomment.py @@ -7,7 +7,7 @@ import xgboost from imblearn.pipeline import Pipeline as ImblearnPipeline -from imblearn.under_sampling import TomekLinks +from imblearn.under_sampling import RandomUnderSampler from sklearn.compose import ColumnTransformer from sklearn.feature_extraction import DictVectorizer from sklearn.pipeline import Pipeline @@ -57,7 +57,7 @@ def __init__(self, lemmatization=True): ("data", DictVectorizer(), "data"), ( "comment_text", - self.text_vectorizer(min_df=0.001), + self.text_vectorizer(min_df=0.0001), "comment_text", ), ] @@ -65,7 +65,7 @@ def __init__(self, lemmatization=True): ), ( "sampler", - TomekLinks(), + RandomUnderSampler(random_state=0), ), ( "estimator", From 606f7439473facc0d77a082987ef405349d3d042 Mon Sep 17 00:00:00 2001 From: John Pangas Date: Fri, 2 Feb 2024 19:16:01 +0300 Subject: [PATCH 11/59] Use oversampling --- bugbug/models/spamcomment.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bugbug/models/spamcomment.py b/bugbug/models/spamcomment.py index 066c69c7de..a38f0e6ecd 100644 --- a/bugbug/models/spamcomment.py +++ b/bugbug/models/spamcomment.py @@ -6,8 +6,8 @@ import logging import xgboost +from imblearn.over_sampling import BorderlineSMOTE from imblearn.pipeline import Pipeline as ImblearnPipeline -from imblearn.under_sampling import RandomUnderSampler from sklearn.compose import ColumnTransformer from sklearn.feature_extraction import DictVectorizer from sklearn.pipeline import Pipeline @@ -65,7 +65,7 @@ def __init__(self, lemmatization=True): ), ( "sampler", - RandomUnderSampler(random_state=0), + BorderlineSMOTE(random_state=0), ), ( "estimator", From 41a73cbba1d2b4bee636cba258419f27469a4db5 Mon Sep 17 00:00:00 2001 From: John Pangas Date: Tue, 6 Feb 2024 17:26:34 +0300 Subject: [PATCH 12/59] Use max_step --- bugbug/models/spamcomment.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/bugbug/models/spamcomment.py b/bugbug/models/spamcomment.py index a38f0e6ecd..beb401cea8 100644 --- a/bugbug/models/spamcomment.py +++ b/bugbug/models/spamcomment.py @@ -27,7 +27,7 @@ def __init__(self, lemmatization=True): feature_extractors = [ comment_features.CommenterExperience(), - comment_features.CommentHasLink(), + comment_features.CommentHasUnknownLink(), comment_features.LengthofComment(), ] @@ -69,7 +69,9 @@ def __init__(self, lemmatization=True): ), ( "estimator", - xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count()), + xgboost.XGBClassifier( + n_jobs=utils.get_physical_cpu_count(), max_delta_step=1 + ), ), ] ) From 586576db4add007801e045017af1cc824fb88bd8 Mon Sep 17 00:00:00 2001 From: John Pangas Date: Wed, 7 Feb 2024 16:57:41 +0300 Subject: [PATCH 13/59] Include and Refine features --- bugbug/comment_features.py | 43 +++++++++++++++++++++++++----------- bugbug/models/spamcomment.py | 9 ++++++-- 2 files changed, 37 insertions(+), 15 deletions(-) diff --git a/bugbug/comment_features.py b/bugbug/comment_features.py index 60fff935e4..4c9500805a 100644 --- a/bugbug/comment_features.py +++ b/bugbug/comment_features.py @@ -6,6 +6,7 @@ import re import sys from collections import defaultdict +from datetime import datetime from typing import Any import pandas as pd @@ -89,35 +90,51 @@ def __call__(self, comment, commenter_experience, **kwargs): return commenter_experience -class CommentHasLink(CommentFeature): - name = "Comment Has a Link" +class CommentHasUnknownLink(CommentFeature): + name = "Comment Has an Unknown Link" - # We check for links that are not from Mozilla - url_pattern = re.compile(r"http[s]?://(?!mozilla\.org|mozilla\.com)\S+") + def __init__(self, domains_to_ignore=set()): + self.domains_to_ignore = domains_to_ignore + + ignored_domains_pattern = "|".join( + re.escape(domain) for domain in self.domains_to_ignore + ) + self.url_pattern = re.compile( + rf"http[s]?://(?!((?:{ignored_domains_pattern})\.\S+))\S+" + ) def __call__(self, comment, **kwargs) -> Any: return bool(self.url_pattern.search(comment["text"])) -class LengthofComment(CommentFeature): - name = "Length of Comment" +class CharacterCount(CommentFeature): + name = "# of Characters in the Comment" def __call__(self, comment, **kwargs): return len(comment["text"]) -class TimeCommentWasPosted(CommentFeature): - name = "Time Comment Was Posted" +class WordCount(CommentFeature): + name = "# of Words in the Comment" + + def __call__(self, comment, **kwargs): + return len(comment["text"].split()) + + +class DateCommentWasPosted(CommentFeature): + name = "Date Comment Was Posted" def __call__(self, comment, **kwargs): - pass + comment_time = datetime.strptime(comment["creation_time"], "%Y-%m-%dT%H:%M:%SZ") + return comment_time.strftime("%Y-%m-%d") -class TimeDifferenceCommentAccountCreation(CommentFeature): - name = "Time Difference Between Account Creation and when Comment was Made " +class TimeCommentWasPosted(CommentFeature): + name = "Time Comment Was Posted" - def __call__(self, comment, account_creation_time, **kwargs): - pass + def __call__(self, comment, **kwargs): + comment_time = datetime.strptime(comment["creation_time"], "%Y-%m-%dT%H:%M:%SZ") + return comment_time.strftime("%H:%M:%S") class CommentTags(CommentFeature): diff --git a/bugbug/models/spamcomment.py b/bugbug/models/spamcomment.py index beb401cea8..518cb3633d 100644 --- a/bugbug/models/spamcomment.py +++ b/bugbug/models/spamcomment.py @@ -27,8 +27,13 @@ def __init__(self, lemmatization=True): feature_extractors = [ comment_features.CommenterExperience(), - comment_features.CommentHasUnknownLink(), - comment_features.LengthofComment(), + comment_features.CommentHasUnknownLink( + {"github.com/mozilla", "mozilla.com", "mozilla.org"} + ), + comment_features.CharacterCount(), + comment_features.WordCount(), + comment_features.DateCommentWasPosted(), + comment_features.TimeCommentWasPosted(), ] cleanup_functions = [ From ba7a1a1c553b71fae4aaf53b754cb95a471d0c05 Mon Sep 17 00:00:00 2001 From: John Pangas Date: Sat, 10 Feb 2024 01:06:50 +0300 Subject: [PATCH 14/59] Split Date Features --- bugbug/comment_features.py | 59 ++++++++++++++++++++++++++++-------- bugbug/models/spamcomment.py | 9 +++--- 2 files changed, 51 insertions(+), 17 deletions(-) diff --git a/bugbug/comment_features.py b/bugbug/comment_features.py index 4c9500805a..94b51b46ae 100644 --- a/bugbug/comment_features.py +++ b/bugbug/comment_features.py @@ -8,6 +8,7 @@ from collections import defaultdict from datetime import datetime from typing import Any +from urllib.parse import urlparse import pandas as pd from sklearn.base import BaseEstimator, TransformerMixin @@ -96,15 +97,24 @@ class CommentHasUnknownLink(CommentFeature): def __init__(self, domains_to_ignore=set()): self.domains_to_ignore = domains_to_ignore - ignored_domains_pattern = "|".join( - re.escape(domain) for domain in self.domains_to_ignore - ) - self.url_pattern = re.compile( - rf"http[s]?://(?!((?:{ignored_domains_pattern})\.\S+))\S+" + def __call__(self, comment, **kwargs) -> Any: + potential_urls = re.findall( + r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+", comment["text"] ) - def __call__(self, comment, **kwargs) -> Any: - return bool(self.url_pattern.search(comment["text"])) + for url in potential_urls: + parsed_url = urlparse(url) + hostname = parsed_url.netloc + + if hostname: + parts = hostname.split(".") + if len(parts) > 1: + main_domain = ".".join(parts[-2:]) + + if main_domain.lower() not in self.domains_to_ignore: + return True + + return False class CharacterCount(CommentFeature): @@ -121,20 +131,43 @@ def __call__(self, comment, **kwargs): return len(comment["text"].split()) -class DateCommentWasPosted(CommentFeature): - name = "Date Comment Was Posted" +class ID: + name = "Comment ID" + + def __call__(self, comment, **kwargs): + return comment["id"] + + +class HourOfDay(CommentFeature): + name = "Hour of the Day (0-23)" + + def __call__(self, comment, **kwargs): + comment_time = datetime.strptime(comment["creation_time"], "%Y-%m-%dT%H:%M:%SZ") + return comment_time.hour + + +class Weekday(CommentFeature): + name = "Day of the Week (0-7)" + + def __call__(self, comment, **kwargs): + comment_time = datetime.strptime(comment["creation_time"], "%Y-%m-%dT%H:%M:%SZ") + return comment_time.weekday() + + +class DayOfYear(CommentFeature): + name = "Day of the Year (0-366)" def __call__(self, comment, **kwargs): comment_time = datetime.strptime(comment["creation_time"], "%Y-%m-%dT%H:%M:%SZ") - return comment_time.strftime("%Y-%m-%d") + return comment_time.timetuple().tm_yday -class TimeCommentWasPosted(CommentFeature): - name = "Time Comment Was Posted" +class WeekOfYear(CommentFeature): + name = "Week of Year" def __call__(self, comment, **kwargs): comment_time = datetime.strptime(comment["creation_time"], "%Y-%m-%dT%H:%M:%SZ") - return comment_time.strftime("%H:%M:%S") + return comment_time.isocalendar()[1] class CommentTags(CommentFeature): diff --git a/bugbug/models/spamcomment.py b/bugbug/models/spamcomment.py index 518cb3633d..8c916ffd57 100644 --- a/bugbug/models/spamcomment.py +++ b/bugbug/models/spamcomment.py @@ -28,12 +28,13 @@ def __init__(self, lemmatization=True): feature_extractors = [ comment_features.CommenterExperience(), comment_features.CommentHasUnknownLink( - {"github.com/mozilla", "mozilla.com", "mozilla.org"} + {"github.com", "mozilla.com", "mozilla.org"} ), - comment_features.CharacterCount(), comment_features.WordCount(), - comment_features.DateCommentWasPosted(), - comment_features.TimeCommentWasPosted(), + comment_features.HourOfDay(), + comment_features.DayOfYear(), + comment_features.Weekday(), + comment_features.WeekOfYear(), ] cleanup_functions = [ From 8f429d13577d968d33cd860b18706fa363fefd05 Mon Sep 17 00:00:00 2001 From: John Pangas Date: Sat, 10 Feb 2024 01:19:41 +0300 Subject: [PATCH 15/59] Rename features correctly --- bugbug/comment_features.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/bugbug/comment_features.py b/bugbug/comment_features.py index 94b51b46ae..598483b99b 100644 --- a/bugbug/comment_features.py +++ b/bugbug/comment_features.py @@ -131,13 +131,6 @@ def __call__(self, comment, **kwargs): return len(comment["text"].split()) -class ID: - name = "Comment ID" - - def __call__(self, comment, **kwargs): - return comment["id"] - - class HourOfDay(CommentFeature): name = "Hour of the Day (0-23)" From 1ef249347abb2d0e7fa4b816585ecb5ec0338423 Mon Sep 17 00:00:00 2001 From: John Pangas Date: Mon, 12 Feb 2024 19:32:22 +0300 Subject: [PATCH 16/59] Remove Commenter Experience and Invalid Bugs --- bugbug/models/spamcomment.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/bugbug/models/spamcomment.py b/bugbug/models/spamcomment.py index 8c916ffd57..49c50ad45c 100644 --- a/bugbug/models/spamcomment.py +++ b/bugbug/models/spamcomment.py @@ -26,7 +26,6 @@ def __init__(self, lemmatization=True): self.calculate_importance = False feature_extractors = [ - comment_features.CommenterExperience(), comment_features.CommentHasUnknownLink( {"github.com", "mozilla.com", "mozilla.org"} ), @@ -85,7 +84,7 @@ def __init__(self, lemmatization=True): def get_labels(self): classes = {} - for bug in bugzilla.get_bugs(include_invalid=True): + for bug in bugzilla.get_bugs(include_invalid=False): for comment in bug["comments"]: comment_id = comment["id"] From 5a18517b1e5592c74ceaa9b5c561be90a3242735 Mon Sep 17 00:00:00 2001 From: John Pangas Date: Tue, 13 Feb 2024 18:14:17 +0300 Subject: [PATCH 17/59] Remove first comment --- bugbug/models/spamcomment.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/bugbug/models/spamcomment.py b/bugbug/models/spamcomment.py index 49c50ad45c..72a0fedd1f 100644 --- a/bugbug/models/spamcomment.py +++ b/bugbug/models/spamcomment.py @@ -74,9 +74,7 @@ def __init__(self, lemmatization=True): ), ( "estimator", - xgboost.XGBClassifier( - n_jobs=utils.get_physical_cpu_count(), max_delta_step=1 - ), + xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count()), ), ] ) @@ -84,7 +82,7 @@ def __init__(self, lemmatization=True): def get_labels(self): classes = {} - for bug in bugzilla.get_bugs(include_invalid=False): + for bug in bugzilla.get_bugs(include_invalid=True): for comment in bug["comments"]: comment_id = comment["id"] @@ -92,6 +90,10 @@ def get_labels(self): if "@mozilla" in comment["creator"]: continue + # Skip the first comment, spambug model already works on this comment. + if comment["count"] == 0: + continue + if "spam" in comment["tags"]: classes[comment_id] = 1 else: From ea6c16873f4aab7fcb22857c50306703b3c862e3 Mon Sep 17 00:00:00 2001 From: John Pangas Date: Thu, 15 Feb 2024 16:39:39 +0300 Subject: [PATCH 18/59] Include Links Dictionary --- bugbug/comment_features.py | 13 +++++++++---- bugbug/models/spamcomment.py | 10 +++++----- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/bugbug/comment_features.py b/bugbug/comment_features.py index 598483b99b..581b31c4a5 100644 --- a/bugbug/comment_features.py +++ b/bugbug/comment_features.py @@ -91,8 +91,8 @@ def __call__(self, comment, commenter_experience, **kwargs): return commenter_experience -class CommentHasUnknownLink(CommentFeature): - name = "Comment Has an Unknown Link" +class NumberOfLinks(CommentFeature): + name = "Number of Links in the comment" def __init__(self, domains_to_ignore=set()): self.domains_to_ignore = domains_to_ignore @@ -102,6 +102,8 @@ def __call__(self, comment, **kwargs) -> Any: r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+", comment["text"] ) + links = defaultdict(int) + for url in potential_urls: parsed_url = urlparse(url) hostname = parsed_url.netloc @@ -112,9 +114,12 @@ def __call__(self, comment, **kwargs) -> Any: main_domain = ".".join(parts[-2:]) if main_domain.lower() not in self.domains_to_ignore: - return True + links['unknown'] += 1 + else: + links['mozilla'] += 1 - return False + links['total'] = len(potential_urls) + return links class CharacterCount(CommentFeature): diff --git a/bugbug/models/spamcomment.py b/bugbug/models/spamcomment.py index 72a0fedd1f..15eb2a60c3 100644 --- a/bugbug/models/spamcomment.py +++ b/bugbug/models/spamcomment.py @@ -26,7 +26,7 @@ def __init__(self, lemmatization=True): self.calculate_importance = False feature_extractors = [ - comment_features.CommentHasUnknownLink( + comment_features.NumberOfLinks( {"github.com", "mozilla.com", "mozilla.org"} ), comment_features.WordCount(), @@ -86,12 +86,12 @@ def get_labels(self): for comment in bug["comments"]: comment_id = comment["id"] - # Skip comments filed by Mozillians and bots, since we are sure they are not spam. - if "@mozilla" in comment["creator"]: + # Skip the first comment because most first comments may contain links. + if str(comment["count"]) == '0': continue - # Skip the first comment, spambug model already works on this comment. - if comment["count"] == 0: + # Skip comments filed by Mozillians and bots, since we are sure they are not spam. + if "@mozilla" in comment["creator"]: continue if "spam" in comment["tags"]: From 874b19f9526777070b975e0dbddab1a91326686e Mon Sep 17 00:00:00 2001 From: John P Date: Thu, 15 Feb 2024 17:37:15 +0300 Subject: [PATCH 19/59] Fix Error and Lint --- bugbug/comment_features.py | 8 ++++---- bugbug/models/spamcomment.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/bugbug/comment_features.py b/bugbug/comment_features.py index 581b31c4a5..163694f9cd 100644 --- a/bugbug/comment_features.py +++ b/bugbug/comment_features.py @@ -102,7 +102,7 @@ def __call__(self, comment, **kwargs) -> Any: r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+", comment["text"] ) - links = defaultdict(int) + links = {"mozilla": 0, "unknown": 0} for url in potential_urls: parsed_url = urlparse(url) @@ -114,11 +114,11 @@ def __call__(self, comment, **kwargs) -> Any: main_domain = ".".join(parts[-2:]) if main_domain.lower() not in self.domains_to_ignore: - links['unknown'] += 1 + links["unknown"] += 1 else: - links['mozilla'] += 1 + links["mozilla"] += 1 - links['total'] = len(potential_urls) + links["total"] = links["unknown"] + links["mozilla"] return links diff --git a/bugbug/models/spamcomment.py b/bugbug/models/spamcomment.py index 15eb2a60c3..e135c4d446 100644 --- a/bugbug/models/spamcomment.py +++ b/bugbug/models/spamcomment.py @@ -87,7 +87,7 @@ def get_labels(self): comment_id = comment["id"] # Skip the first comment because most first comments may contain links. - if str(comment["count"]) == '0': + if str(comment["count"]) == "0": continue # Skip comments filed by Mozillians and bots, since we are sure they are not spam. From b3da2e564dbc955ff87806216b51f764df7ed6e6 Mon Sep 17 00:00:00 2001 From: John P Date: Thu, 15 Feb 2024 17:41:55 +0300 Subject: [PATCH 20/59] Refactor the Links Dictionary --- bugbug/comment_features.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bugbug/comment_features.py b/bugbug/comment_features.py index 163694f9cd..ca06f85708 100644 --- a/bugbug/comment_features.py +++ b/bugbug/comment_features.py @@ -118,7 +118,7 @@ def __call__(self, comment, **kwargs) -> Any: else: links["mozilla"] += 1 - links["total"] = links["unknown"] + links["mozilla"] + links["total"] = sum(links.values()) return links From b49485df505d6daec991c2157b9ca1bb36e2479f Mon Sep 17 00:00:00 2001 From: John P Date: Thu, 15 Feb 2024 20:44:29 +0300 Subject: [PATCH 21/59] Use List instead --- bugbug/comment_features.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/bugbug/comment_features.py b/bugbug/comment_features.py index ca06f85708..9f4ea6376c 100644 --- a/bugbug/comment_features.py +++ b/bugbug/comment_features.py @@ -102,8 +102,7 @@ def __call__(self, comment, **kwargs) -> Any: r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+", comment["text"] ) - links = {"mozilla": 0, "unknown": 0} - + domains = [] for url in potential_urls: parsed_url = urlparse(url) hostname = parsed_url.netloc @@ -112,14 +111,15 @@ def __call__(self, comment, **kwargs) -> Any: parts = hostname.split(".") if len(parts) > 1: main_domain = ".".join(parts[-2:]) + domains.append(main_domain.lower()) - if main_domain.lower() not in self.domains_to_ignore: - links["unknown"] += 1 - else: - links["mozilla"] += 1 + non_mozilla_links = sum( + domain not in self.domains_to_ignore for domain in domains + ) + mozilla_links = sum(domain in self.domains_to_ignore for domain in domains) + total_links = len(domains) - links["total"] = sum(links.values()) - return links + return [non_mozilla_links, mozilla_links, total_links] class CharacterCount(CommentFeature): From a7044b02483184bdaf175148ea6f64544ed9d5dc Mon Sep 17 00:00:00 2001 From: John Pangas Date: Fri, 16 Feb 2024 18:27:41 +0300 Subject: [PATCH 22/59] Use Dictionary for # of links --- bugbug/comment_features.py | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/bugbug/comment_features.py b/bugbug/comment_features.py index 9f4ea6376c..d3963a6506 100644 --- a/bugbug/comment_features.py +++ b/bugbug/comment_features.py @@ -63,6 +63,11 @@ def apply_transform(comment): if res is None: continue + if isinstance(res, dict): + for key, value in res.items(): + data[sys.intern(key)] = value + continue + if isinstance(res, (list, set)): for item in res: data[sys.intern(f"{item} in {feature_extractor_name}")] = True @@ -109,17 +114,22 @@ def __call__(self, comment, **kwargs) -> Any: if hostname: parts = hostname.split(".") + + # FIXME: Doesn't handle websites like shop.example.com.ca properly. + # It could extract a domain to look like com.ca if len(parts) > 1: main_domain = ".".join(parts[-2:]) domains.append(main_domain.lower()) - non_mozilla_links = sum( - domain not in self.domains_to_ignore for domain in domains - ) - mozilla_links = sum(domain in self.domains_to_ignore for domain in domains) - total_links = len(domains) - - return [non_mozilla_links, mozilla_links, total_links] + return { + "# of Known links": sum( + domain in self.domains_to_ignore for domain in domains + ), + "# of Unknown links": sum( + domain not in self.domains_to_ignore for domain in domains + ), + "Total # of links": len(domains), + } class CharacterCount(CommentFeature): From 13772c7f7c100a93bfbdd862488d252d56978c58 Mon Sep 17 00:00:00 2001 From: John P Date: Mon, 19 Feb 2024 15:24:50 +0300 Subject: [PATCH 23/59] Include older bugs --- bugbug/models/spamcomment.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/bugbug/models/spamcomment.py b/bugbug/models/spamcomment.py index e135c4d446..4da0a4cc14 100644 --- a/bugbug/models/spamcomment.py +++ b/bugbug/models/spamcomment.py @@ -4,8 +4,10 @@ # You can obtain one at http://mozilla.org/MPL/2.0/. import logging +from datetime import datetime import xgboost +from dateutil.relativedelta import relativedelta from imblearn.over_sampling import BorderlineSMOTE from imblearn.pipeline import Pipeline as ImblearnPipeline from sklearn.compose import ColumnTransformer @@ -79,9 +81,34 @@ def __init__(self, lemmatization=True): ] ) + @staticmethod + def __download_older_bugs_with_spam_comments(months: int) -> None: + """Retrieve older bugs within the past specified number of months which have spam comments. + + This function provides an option to extend the dataset used for model training by including older spam comments. + """ + lookup_start_date = datetime.utcnow() - relativedelta(months=months) + params = { + "f1": "creation_ts", + "o1": "greaterthan", + "v1": lookup_start_date.strftime("%Y-%m-%d"), + "f2": "comment_tag", + "o2": "substring", + "v2": "spam", + "product": bugzilla.PRODUCTS, + } + + logger.info("Downloading older bugs...") + bugs_ids = bugzilla.get_ids(params) + older_bugs = bugzilla.download_bugs(bugs_ids) + + logger.info("%d older bugs have been downloaded.", len(older_bugs)) + def get_labels(self): classes = {} + self.__download_older_bugs_with_spam_comments(months=84) + for bug in bugzilla.get_bugs(include_invalid=True): for comment in bug["comments"]: comment_id = comment["id"] From 7cf0dcdee04be6631fed5234214e4e10d30670bb Mon Sep 17 00:00:00 2001 From: John P Date: Mon, 19 Feb 2024 18:44:14 +0300 Subject: [PATCH 24/59] Replace Weekday with Weekend --- bugbug/comment_features.py | 15 +++++++++++---- bugbug/models/spamcomment.py | 18 ++++++------------ 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/bugbug/comment_features.py b/bugbug/comment_features.py index d3963a6506..813c45388b 100644 --- a/bugbug/comment_features.py +++ b/bugbug/comment_features.py @@ -101,11 +101,10 @@ class NumberOfLinks(CommentFeature): def __init__(self, domains_to_ignore=set()): self.domains_to_ignore = domains_to_ignore + self.pattern = re.compile(r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+") def __call__(self, comment, **kwargs) -> Any: - potential_urls = re.findall( - r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+", comment["text"] - ) + potential_urls = self.pattern.findall(comment["text"]) domains = [] for url in potential_urls: @@ -159,7 +158,15 @@ class Weekday(CommentFeature): def __call__(self, comment, **kwargs): comment_time = datetime.strptime(comment["creation_time"], "%Y-%m-%dT%H:%M:%SZ") - return comment_time.weekday() + return comment_time.isoweekday() + + +class PostedOnWeekend(CommentFeature): + name = "Comment was Posted on Weekend" + + def __call__(self, comment, **kwargs): + comment_time = datetime.strptime(comment["creation_time"], "%Y-%m-%dT%H:%M:%SZ") + return comment_time.isoweekday() in (5, 6) class DayOfYear(CommentFeature): diff --git a/bugbug/models/spamcomment.py b/bugbug/models/spamcomment.py index 4da0a4cc14..04a43949e0 100644 --- a/bugbug/models/spamcomment.py +++ b/bugbug/models/spamcomment.py @@ -4,10 +4,8 @@ # You can obtain one at http://mozilla.org/MPL/2.0/. import logging -from datetime import datetime import xgboost -from dateutil.relativedelta import relativedelta from imblearn.over_sampling import BorderlineSMOTE from imblearn.pipeline import Pipeline as ImblearnPipeline from sklearn.compose import ColumnTransformer @@ -34,7 +32,7 @@ def __init__(self, lemmatization=True): comment_features.WordCount(), comment_features.HourOfDay(), comment_features.DayOfYear(), - comment_features.Weekday(), + comment_features.PostedOnWeekend(), comment_features.WeekOfYear(), ] @@ -82,19 +80,15 @@ def __init__(self, lemmatization=True): ) @staticmethod - def __download_older_bugs_with_spam_comments(months: int) -> None: + def __download_older_bugs_with_spam_comments() -> None: """Retrieve older bugs within the past specified number of months which have spam comments. This function provides an option to extend the dataset used for model training by including older spam comments. """ - lookup_start_date = datetime.utcnow() - relativedelta(months=months) params = { - "f1": "creation_ts", - "o1": "greaterthan", - "v1": lookup_start_date.strftime("%Y-%m-%d"), - "f2": "comment_tag", - "o2": "substring", - "v2": "spam", + "f1": "comment_tag", + "o1": "substring", + "v1": "spam", "product": bugzilla.PRODUCTS, } @@ -107,7 +101,7 @@ def __download_older_bugs_with_spam_comments(months: int) -> None: def get_labels(self): classes = {} - self.__download_older_bugs_with_spam_comments(months=84) + self.__download_older_bugs_with_spam_comments() for bug in bugzilla.get_bugs(include_invalid=True): for comment in bug["comments"]: From cc8e6f6b7b63435364153019623ec8ea4648b320 Mon Sep 17 00:00:00 2001 From: John P Date: Tue, 20 Feb 2024 16:03:07 +0300 Subject: [PATCH 25/59] Include max_delta_step --- bugbug/models/spamcomment.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bugbug/models/spamcomment.py b/bugbug/models/spamcomment.py index 04a43949e0..03b64c309b 100644 --- a/bugbug/models/spamcomment.py +++ b/bugbug/models/spamcomment.py @@ -74,7 +74,9 @@ def __init__(self, lemmatization=True): ), ( "estimator", - xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count()), + xgboost.XGBClassifier( + n_jobs=utils.get_physical_cpu_count(), max_delta_step=1 + ), ), ] ) From c4e4f2242abab9b305d3485b989712e147037a5a Mon Sep 17 00:00:00 2001 From: John P Date: Tue, 20 Feb 2024 17:23:32 +0300 Subject: [PATCH 26/59] Revert "Include max_delta_step" This reverts commit cc8e6f6b7b63435364153019623ec8ea4648b320. Revert --- bugbug/models/spamcomment.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/bugbug/models/spamcomment.py b/bugbug/models/spamcomment.py index 03b64c309b..04a43949e0 100644 --- a/bugbug/models/spamcomment.py +++ b/bugbug/models/spamcomment.py @@ -74,9 +74,7 @@ def __init__(self, lemmatization=True): ), ( "estimator", - xgboost.XGBClassifier( - n_jobs=utils.get_physical_cpu_count(), max_delta_step=1 - ), + xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count()), ), ] ) From 01cca1edc954b9d8ee1ce1e2b52f5ce5e9ba96cf Mon Sep 17 00:00:00 2001 From: John P Date: Tue, 20 Feb 2024 17:32:07 +0300 Subject: [PATCH 27/59] Test using scale_pos_weight --- bugbug/model.py | 15 +++++++++++++++ bugbug/models/spamcomment.py | 6 ++++-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/bugbug/model.py b/bugbug/model.py index 2f8c469639..bbc9a2be0d 100644 --- a/bugbug/model.py +++ b/bugbug/model.py @@ -174,6 +174,8 @@ def __init__(self, lemmatization=False): self.store_dataset = False + self.use_scale_pos_weight = False + self.entire_dataset_training = False # DBs required for training. @@ -390,6 +392,19 @@ def train(self, importance_cutoff=0.15, limit=None): # Split dataset in training and test. X_train, X_test, y_train, y_test = self.train_test_split(X, y) + # Use scale_pos_weight to help in extremely imbalanced datasets + if self.use_scale_pos_weight and is_binary: + y_array = np.array(y_train) + negative_samples = (y_array == self.class_names[0]).sum() + positive_samples = (y_array == self.class_names[1]).sum() + scale_pos_weight = np.sqrt(negative_samples / positive_samples) + + logger.info("Scale Pos Weight: %d", scale_pos_weight) + + self.clf.named_steps["estimator"].set_params( + scale_pos_weight=scale_pos_weight + ) + tracking_metrics = {} # Use k-fold cross validation to evaluate results. diff --git a/bugbug/models/spamcomment.py b/bugbug/models/spamcomment.py index 04a43949e0..b4202d14ba 100644 --- a/bugbug/models/spamcomment.py +++ b/bugbug/models/spamcomment.py @@ -6,7 +6,7 @@ import logging import xgboost -from imblearn.over_sampling import BorderlineSMOTE +from imblearn.over_sampling import SMOTE from imblearn.pipeline import Pipeline as ImblearnPipeline from sklearn.compose import ColumnTransformer from sklearn.feature_extraction import DictVectorizer @@ -25,6 +25,8 @@ def __init__(self, lemmatization=True): self.calculate_importance = False + self.use_scale_pos_weight = True + feature_extractors = [ comment_features.NumberOfLinks( {"github.com", "mozilla.com", "mozilla.org"} @@ -70,7 +72,7 @@ def __init__(self, lemmatization=True): ), ( "sampler", - BorderlineSMOTE(random_state=0), + SMOTE(random_state=0), ), ( "estimator", From cc42dee2ed47320f89a26639f29ce48bc1a31c3a Mon Sep 17 00:00:00 2001 From: John P Date: Tue, 20 Feb 2024 20:48:25 +0300 Subject: [PATCH 28/59] Use URL Extract --- bugbug/comment_features.py | 14 ++++++-------- bugbug/model.py | 9 +++++---- bugbug/models/spamcomment.py | 16 ++++++++++------ requirements.txt | 1 + 4 files changed, 22 insertions(+), 18 deletions(-) diff --git a/bugbug/comment_features.py b/bugbug/comment_features.py index 813c45388b..4d020f3b71 100644 --- a/bugbug/comment_features.py +++ b/bugbug/comment_features.py @@ -3,7 +3,6 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this file, # You can obtain one at http://mozilla.org/MPL/2.0/. -import re import sys from collections import defaultdict from datetime import datetime @@ -12,6 +11,7 @@ import pandas as pd from sklearn.base import BaseEstimator, TransformerMixin +from urlextract import URLExtract class CommentFeature(object): @@ -101,24 +101,22 @@ class NumberOfLinks(CommentFeature): def __init__(self, domains_to_ignore=set()): self.domains_to_ignore = domains_to_ignore - self.pattern = re.compile(r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+") + self.extractor = URLExtract() def __call__(self, comment, **kwargs) -> Any: - potential_urls = self.pattern.findall(comment["text"]) - + urls = self.extractor.find_urls(comment["text"]) domains = [] - for url in potential_urls: + for url in urls: parsed_url = urlparse(url) hostname = parsed_url.netloc if hostname: parts = hostname.split(".") - - # FIXME: Doesn't handle websites like shop.example.com.ca properly. - # It could extract a domain to look like com.ca if len(parts) > 1: main_domain = ".".join(parts[-2:]) domains.append(main_domain.lower()) + else: + domains.append(url) return { "# of Known links": sum( diff --git a/bugbug/model.py b/bugbug/model.py index bbc9a2be0d..6c29140b4f 100644 --- a/bugbug/model.py +++ b/bugbug/model.py @@ -6,6 +6,7 @@ import logging import pickle from collections import defaultdict +from math import sqrt from os import makedirs, path from typing import Any @@ -394,10 +395,10 @@ def train(self, importance_cutoff=0.15, limit=None): # Use scale_pos_weight to help in extremely imbalanced datasets if self.use_scale_pos_weight and is_binary: - y_array = np.array(y_train) - negative_samples = (y_array == self.class_names[0]).sum() - positive_samples = (y_array == self.class_names[1]).sum() - scale_pos_weight = np.sqrt(negative_samples / positive_samples) + negative_samples = sum(label == self.class_names[0] for label in y_train) + positive_samples = sum(label == self.class_names[1] for label in y_train) + + scale_pos_weight = sqrt(negative_samples / positive_samples) logger.info("Scale Pos Weight: %d", scale_pos_weight) diff --git a/bugbug/models/spamcomment.py b/bugbug/models/spamcomment.py index b4202d14ba..28d71bdcf5 100644 --- a/bugbug/models/spamcomment.py +++ b/bugbug/models/spamcomment.py @@ -105,16 +105,20 @@ def get_labels(self): self.__download_older_bugs_with_spam_comments() - for bug in bugzilla.get_bugs(include_invalid=True): + for bug in bugzilla.get_bugs(): for comment in bug["comments"]: comment_id = comment["id"] # Skip the first comment because most first comments may contain links. - if str(comment["count"]) == "0": - continue - # Skip comments filed by Mozillians and bots, since we are sure they are not spam. - if "@mozilla" in comment["creator"]: + # Skip comments whose text has been removed. + if any( + [ + comment["count"] == "0", + "@mozilla" in comment["creator"], + "(comment removed)" in comment["text"], + ] + ): continue if "spam" in comment["tags"]: @@ -138,7 +142,7 @@ def items_gen(self, classes): # include spam bugs which have a number of spam comments. return ( (comment, classes[comment["id"]]) - for bug in bugzilla.get_bugs(include_invalid=True) + for bug in bugzilla.get_bugs() for comment in bug["comments"] if comment["id"] in classes ) diff --git a/requirements.txt b/requirements.txt index a7fff04333..d1f704d01c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -29,5 +29,6 @@ tabulate==0.9.0 taskcluster==60.3.4 tenacity==8.2.3 tqdm==4.66.1 +urlextract==1.8.0 xgboost==2.0.3 zstandard==0.22.0 From 4b8cf49ca64d45c9f9c1538dd2f53b7ab697d4ec Mon Sep 17 00:00:00 2001 From: John Pangas Date: Wed, 21 Feb 2024 16:32:47 +0300 Subject: [PATCH 29/59] Revert to Using Regex --- bugbug/comment_features.py | 20 ++++++++++++-------- requirements.txt | 1 - 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/bugbug/comment_features.py b/bugbug/comment_features.py index 4d020f3b71..0cbce98321 100644 --- a/bugbug/comment_features.py +++ b/bugbug/comment_features.py @@ -3,6 +3,7 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this file, # You can obtain one at http://mozilla.org/MPL/2.0/. +import re import sys from collections import defaultdict from datetime import datetime @@ -11,7 +12,6 @@ import pandas as pd from sklearn.base import BaseEstimator, TransformerMixin -from urlextract import URLExtract class CommentFeature(object): @@ -101,23 +101,27 @@ class NumberOfLinks(CommentFeature): def __init__(self, domains_to_ignore=set()): self.domains_to_ignore = domains_to_ignore - self.extractor = URLExtract() + self.pattern = re.compile(r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+") def __call__(self, comment, **kwargs) -> Any: - urls = self.extractor.find_urls(comment["text"]) + potential_urls = re.findall( + r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+", comment["text"] + ) + potential_urls = self.pattern.findall(comment["text"]) + domains = [] - for url in urls: + for url in potential_urls: parsed_url = urlparse(url) hostname = parsed_url.netloc - if hostname: parts = hostname.split(".") + # FIXME: Doesn't handle websites like shop.example.com.ca properly. + # It could extract a domain to look like com.ca + # Try with libraries like URL Extract + if len(parts) > 1: main_domain = ".".join(parts[-2:]) domains.append(main_domain.lower()) - else: - domains.append(url) - return { "# of Known links": sum( domain in self.domains_to_ignore for domain in domains diff --git a/requirements.txt b/requirements.txt index d1f704d01c..a7fff04333 100644 --- a/requirements.txt +++ b/requirements.txt @@ -29,6 +29,5 @@ tabulate==0.9.0 taskcluster==60.3.4 tenacity==8.2.3 tqdm==4.66.1 -urlextract==1.8.0 xgboost==2.0.3 zstandard==0.22.0 From 5c5da8cccefdc5059ae0e8174644682b48a15d0b Mon Sep 17 00:00:00 2001 From: John Pangas Date: Thu, 22 Feb 2024 14:31:19 +0300 Subject: [PATCH 30/59] Introduce new extraction func and features --- bugbug/comment_features.py | 66 ++++++++++++++++++++++-------------- bugbug/model.py | 6 ++-- bugbug/models/spamcomment.py | 12 +++---- bugbug/utils.py | 42 +++++++++++++++++++++++ 4 files changed, 92 insertions(+), 34 deletions(-) diff --git a/bugbug/comment_features.py b/bugbug/comment_features.py index 0cbce98321..fdfd8f6c30 100644 --- a/bugbug/comment_features.py +++ b/bugbug/comment_features.py @@ -3,16 +3,16 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this file, # You can obtain one at http://mozilla.org/MPL/2.0/. -import re import sys from collections import defaultdict from datetime import datetime from typing import Any -from urllib.parse import urlparse import pandas as pd from sklearn.base import BaseEstimator, TransformerMixin +from bugbug.utils import extract_urls_and_domains + class CommentFeature(object): pass @@ -100,34 +100,15 @@ class NumberOfLinks(CommentFeature): name = "Number of Links in the comment" def __init__(self, domains_to_ignore=set()): - self.domains_to_ignore = domains_to_ignore - self.pattern = re.compile(r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+") + self.known_domains = domains_to_ignore def __call__(self, comment, **kwargs) -> Any: - potential_urls = re.findall( - r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+", comment["text"] - ) - potential_urls = self.pattern.findall(comment["text"]) - - domains = [] - for url in potential_urls: - parsed_url = urlparse(url) - hostname = parsed_url.netloc - if hostname: - parts = hostname.split(".") - # FIXME: Doesn't handle websites like shop.example.com.ca properly. - # It could extract a domain to look like com.ca - # Try with libraries like URL Extract - - if len(parts) > 1: - main_domain = ".".join(parts[-2:]) - domains.append(main_domain.lower()) + domains = extract_urls_and_domains(comment["text"])["domains"] + return { - "# of Known links": sum( - domain in self.domains_to_ignore for domain in domains - ), + "# of Known links": sum(domain in self.known_domains for domain in domains), "# of Unknown links": sum( - domain not in self.domains_to_ignore for domain in domains + domain not in self.known_domains for domain in domains ), "Total # of links": len(domains), } @@ -147,6 +128,39 @@ def __call__(self, comment, **kwargs): return len(comment["text"].split()) +class UnknownLinkAtBeginning(CommentFeature): + name = "Unknown Link found at Beginning of the Comment" + + def __init__(self, domains_to_ignore=set()): + self.known_domains = domains_to_ignore + + def __call__(self, comment, **kwargs): + urls = extract_urls_and_domains(comment["text"], self.known_domains)["urls"] + + first_word = comment["text"].split()[0] + + if first_word in urls: + return True + + return False + + +class UnknownLinkAtEnd(CommentFeature): + name = "Unknown Link found at End of the Comment" + + def __init__(self, domains_to_ignore=set()): + self.known_domains = domains_to_ignore + + def __call__(self, comment, **kwargs): + urls = extract_urls_and_domains(comment["text"], self.known_domains)["urls"] + last_word = comment["text"].split()[-1] + + if last_word in urls: + return True + + return False + + class HourOfDay(CommentFeature): name = "Hour of the Day (0-23)" diff --git a/bugbug/model.py b/bugbug/model.py index 6c29140b4f..c3ef9ec836 100644 --- a/bugbug/model.py +++ b/bugbug/model.py @@ -395,8 +395,10 @@ def train(self, importance_cutoff=0.15, limit=None): # Use scale_pos_weight to help in extremely imbalanced datasets if self.use_scale_pos_weight and is_binary: - negative_samples = sum(label == self.class_names[0] for label in y_train) - positive_samples = sum(label == self.class_names[1] for label in y_train) + negative_samples = sum(label == 0 for label in y_train) + positive_samples = sum(label == 1 for label in y_train) + logger.info("Negative Samples: %d", negative_samples) + logger.info("Positive Samples: %d", positive_samples) scale_pos_weight = sqrt(negative_samples / positive_samples) diff --git a/bugbug/models/spamcomment.py b/bugbug/models/spamcomment.py index 28d71bdcf5..334c3e67be 100644 --- a/bugbug/models/spamcomment.py +++ b/bugbug/models/spamcomment.py @@ -18,6 +18,8 @@ logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) +SAFE_DOMAINS = {"github.com", "mozilla.com", "mozilla.org"} + class SpamCommentModel(CommentModel): def __init__(self, lemmatization=True): @@ -28,14 +30,14 @@ def __init__(self, lemmatization=True): self.use_scale_pos_weight = True feature_extractors = [ - comment_features.NumberOfLinks( - {"github.com", "mozilla.com", "mozilla.org"} - ), + comment_features.NumberOfLinks(SAFE_DOMAINS), comment_features.WordCount(), comment_features.HourOfDay(), comment_features.DayOfYear(), - comment_features.PostedOnWeekend(), comment_features.WeekOfYear(), + comment_features.Weekday(), + # comment_features.UnknownLinkAtBeginning(SAFE_DOMAINS), + # comment_features.UnknownLinkAtEnd(SAFE_DOMAINS), ] cleanup_functions = [ @@ -138,8 +140,6 @@ def get_labels(self): return classes, [0, 1] def items_gen(self, classes): - # Overwriting this method to add include_invalid=True to get_bugs to - # include spam bugs which have a number of spam comments. return ( (comment, classes[comment["id"]]) for bug in bugzilla.get_bugs() diff --git a/bugbug/utils.py b/bugbug/utils.py index c0e141933e..ee4e2edebd 100644 --- a/bugbug/utils.py +++ b/bugbug/utils.py @@ -18,6 +18,7 @@ from datetime import datetime from functools import lru_cache from typing import Any, Iterator +from urllib.parse import urlparse import boto3 import dateutil.parser @@ -558,3 +559,44 @@ def escape_markdown(text: str) -> str: def keep_as_is(x): """A tokenizer that does nothing.""" return x + + +def extract_urls_and_domains(text: str, domains_to_ignore: set = set()) -> dict: + """Extracts URLs and domains from a given text, optionally filtering out ignored domains. + + Args: + - text: The input text string where URLs and domains need to be found. + - domains_to_ignore: A set of domain names to exclude from the results. e.g. mozilla.com + + Returns: + A dictionary containing: + - "urls": A list of extracted URLs. + - "domains": A list of extracted domain names (excluding ignored domains if provided). + (Note: current domain extraction is basic and has limitations) + """ + pattern = re.compile(r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+") + urls = pattern.findall(text) + + domains = [] + urls_to_remove = [] + + for url in urls: + parsed_url = urlparse(url) + hostname = parsed_url.netloc + if hostname: + parts = hostname.split(".") + # FIXME: Doesn't handle websites like shop.example.com.ca properly. + # It could extract a domain to look like com.ca + # Try with libraries like URL Extract + + if len(parts) > 1: + main_domain = ".".join(parts[-2:]).lower() + if main_domain in domains_to_ignore: + urls_to_remove.append(url) + else: + domains.append(main_domain) + + if not domains_to_ignore: + urls = [url for url in urls if url not in urls_to_remove] + + return {"urls": urls, "domains": domains} From dc16331c82a2fcf841c69183eabf675e38291dec Mon Sep 17 00:00:00 2001 From: John P Date: Thu, 22 Feb 2024 17:43:50 +0300 Subject: [PATCH 31/59] Include tests for extraction function --- bugbug/models/spamcomment.py | 4 +- tests/test_utils.py | 102 +++++++++++++++++++++++++++++++++++ 2 files changed, 104 insertions(+), 2 deletions(-) diff --git a/bugbug/models/spamcomment.py b/bugbug/models/spamcomment.py index 334c3e67be..5ab728523b 100644 --- a/bugbug/models/spamcomment.py +++ b/bugbug/models/spamcomment.py @@ -36,8 +36,8 @@ def __init__(self, lemmatization=True): comment_features.DayOfYear(), comment_features.WeekOfYear(), comment_features.Weekday(), - # comment_features.UnknownLinkAtBeginning(SAFE_DOMAINS), - # comment_features.UnknownLinkAtEnd(SAFE_DOMAINS), + comment_features.UnknownLinkAtBeginning(SAFE_DOMAINS), + comment_features.UnknownLinkAtEnd(SAFE_DOMAINS), ] cleanup_functions = [ diff --git a/tests/test_utils.py b/tests/test_utils.py index fb1e700eab..e65dfccefd 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -466,3 +466,105 @@ def test_StructuredColumnTransformer() -> None: .view(np.dtype("int64")), ColumnTransformer(transformers).fit_transform(df), ) + + +@pytest.mark.parametrize( + "test_input, expected_urls, expected_domains", + [ + ("This is a sample text without any links.", [], []), + ( + "Visit https://www.testdomain.com for more info.", + ["https://www.testdomain.com"], + ["testdomain.com"], + ), + ( + "Links: http://www.example.com, but ignore https://www.mozilla.com", + ["http://www.example.com"], + ["example.com"], + ), + ( + "Check out https://example.org ,sign up on www.anothersite.net and visit mozilla.org", + ["https://example.org", "www.anothersite.net"], + ["example.org", "anothersite.net"], + ), + ( + "Visit https://www.example.org.uk ,sign up on www.anothersite.net.ac and visit firefox.mozilla.org", + ["https://www.example.org.uk", "www.anothersite.net.ac"], + ["example.org.uk", "anothersite.net.ac"], + ), + ( + "Check out http://example.com/a/abc/cat.jpg ,sign up on www.anothersite.net/abc/cde and visit mozilla.org/signup", + ["http://example.com/a/abc/cat.jpg", "www.anothersite.net/abc/cde"], + ["example.com", "anothersite.net.ac"], + ), + ( + "Visit https://www.example.org.uk/a/abc/cat.jpg ,sign up on www.anothersite.net.ac/abc/cde and visit shop.mozilla.org/signup", + [ + "https://www.example.org.uk/a/abc/cat.jpg", + "www.anothersite.net.ac/abc/cde", + ], + ["example.org.uk", "anothersite.net.ac"], + ), + ], +) +@pytest.mark.xfail(reason="The function to extract does not function as expected") +def test_url_extraction_ignore_domains(test_input, expected_urls, expected_domains): + """Tests extraction of URLs and domains while ignoring some domains""" + domains_to_ignore = {"mozilla.com", "mozilla.org"} + result = utils.extract_urls_and_domains(test_input, domains_to_ignore) + + assert result["urls"] == expected_urls + assert result["domains"] == expected_domains + + +@pytest.mark.parametrize( + "test_input, expected_urls, expected_domains", + [ + ("This is a sample text without any links.", [], []), + ( + "Visit http://www.testdomain.com for more info.", + ["http://www.testdomain.com"], + ["testdomain.com"], + ), + ( + "Links: https://www.example.com, but do not ignore https://mozilla.com", + ["https://www.example.com", "https://mozilla.com"], + ["example.com", "mozilla.com"], + ), + ( + "Check out http://www.example.org ,sign up on www.anothersite.net and visit mozilla.org", + ["http://www.example.org", "www.anothersite.net", "mozilla.org"], + ["example.org", "anothersite.net", "mozilla.org"], + ), + ( + "Visit http://example.org.uk ,sign up on www.anothersite.net.ac and visit firefox.mozilla.org", + ["http://example.org.uk", "www.anothersite.net.ac", "firefox.mozilla.org"], + ["example.org", "anothersite.net.ac", "mozilla.org"], + ), + ( + "Check out https://www.example.com/a/abc/cat.jpg ,sign up on www.anothersite.net/abc/cde and visit mozilla.org/signup", + [ + "https://www.example.com/a/abc/cat.jpg", + "www.anothersite.net/abc/cde", + "mozilla.org/signup", + ], + ["example.com", "anothersite.net.ac", "mozilla.org.uk"], + ), + ( + "Visit http://example.org.uk/a/abc/cat.jpg ,sign up on www.anothersite.net.ac/abc/cde and visit shop.mozilla.org.uk/signup", + [ + "http://example.org.uk/a/abc/cat.jpg", + "www.anothersite.net.ac/abc/cde", + "shop.mozilla.org.uk/signup", + ], + ["example.org.uk", "anothersite.net.ac", "shop.mozilla.org.uk"], + ), + ], +) +@pytest.mark.xfail(reason="The function to extract does not function as expected") +def test_url_extraction_domains(test_input, expected_urls, expected_domains): + """Tests extraction of URLs and domains without ignoring any domains""" + result = utils.extract_urls_and_domains(test_input) + + assert result["urls"] == expected_urls + assert result["domains"] == expected_domains From e5b03495e8295d5f0a17863cbd2dfd9de8c10c07 Mon Sep 17 00:00:00 2001 From: John P Date: Thu, 22 Feb 2024 17:45:05 +0300 Subject: [PATCH 32/59] Change scale_pos_weight value --- bugbug/model.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/bugbug/model.py b/bugbug/model.py index c3ef9ec836..b6095e97a4 100644 --- a/bugbug/model.py +++ b/bugbug/model.py @@ -6,7 +6,6 @@ import logging import pickle from collections import defaultdict -from math import sqrt from os import makedirs, path from typing import Any @@ -400,7 +399,7 @@ def train(self, importance_cutoff=0.15, limit=None): logger.info("Negative Samples: %d", negative_samples) logger.info("Positive Samples: %d", positive_samples) - scale_pos_weight = sqrt(negative_samples / positive_samples) + scale_pos_weight = float(negative_samples / positive_samples) logger.info("Scale Pos Weight: %d", scale_pos_weight) From 644795aa97233676e597f73083527b9faf8ddfbd Mon Sep 17 00:00:00 2001 From: John P Date: Thu, 22 Feb 2024 20:53:43 +0300 Subject: [PATCH 33/59] Change regex for extraction --- bugbug/comment_features.py | 15 +++------- bugbug/utils.py | 34 +++++++-------------- tests/test_utils.py | 60 +++++++++++++++++++++----------------- 3 files changed, 48 insertions(+), 61 deletions(-) diff --git a/bugbug/comment_features.py b/bugbug/comment_features.py index fdfd8f6c30..678dbeb3a8 100644 --- a/bugbug/comment_features.py +++ b/bugbug/comment_features.py @@ -137,12 +137,8 @@ def __init__(self, domains_to_ignore=set()): def __call__(self, comment, **kwargs): urls = extract_urls_and_domains(comment["text"], self.known_domains)["urls"] - first_word = comment["text"].split()[0] - - if first_word in urls: - return True - - return False + words = comment["text"].split() + return words[0] in urls if words else False class UnknownLinkAtEnd(CommentFeature): @@ -153,12 +149,9 @@ def __init__(self, domains_to_ignore=set()): def __call__(self, comment, **kwargs): urls = extract_urls_and_domains(comment["text"], self.known_domains)["urls"] - last_word = comment["text"].split()[-1] - - if last_word in urls: - return True - return False + words = comment["text"].split() + return words[-1] in urls if words else False class HourOfDay(CommentFeature): diff --git a/bugbug/utils.py b/bugbug/utils.py index ee4e2edebd..633ea3904f 100644 --- a/bugbug/utils.py +++ b/bugbug/utils.py @@ -18,7 +18,6 @@ from datetime import datetime from functools import lru_cache from typing import Any, Iterator -from urllib.parse import urlparse import boto3 import dateutil.parser @@ -28,6 +27,7 @@ import requests import scipy import taskcluster +import tldextract import zstandard from pkg_resources import DistributionNotFound from requests.packages.urllib3.util.retry import Retry @@ -574,29 +574,17 @@ def extract_urls_and_domains(text: str, domains_to_ignore: set = set()) -> dict: - "domains": A list of extracted domain names (excluding ignored domains if provided). (Note: current domain extraction is basic and has limitations) """ - pattern = re.compile(r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+") - urls = pattern.findall(text) + pattern = re.compile(r"(?:https?://|www\.)(?:[^\s/?#]+)+(?:[\/?#][^\s]*)?") + potential_urls = pattern.findall(text) domains = [] - urls_to_remove = [] - - for url in urls: - parsed_url = urlparse(url) - hostname = parsed_url.netloc - if hostname: - parts = hostname.split(".") - # FIXME: Doesn't handle websites like shop.example.com.ca properly. - # It could extract a domain to look like com.ca - # Try with libraries like URL Extract - - if len(parts) > 1: - main_domain = ".".join(parts[-2:]).lower() - if main_domain in domains_to_ignore: - urls_to_remove.append(url) - else: - domains.append(main_domain) - - if not domains_to_ignore: - urls = [url for url in urls if url not in urls_to_remove] + urls = [] + + for url in potential_urls: + url_info = tldextract.extract(url) + domain = url_info.registered_domain + if domain and domain not in domains_to_ignore: + domains.append(domain) + urls.append(url) return {"urls": urls, "domains": domains} diff --git a/tests/test_utils.py b/tests/test_utils.py index e65dfccefd..6b93f62293 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -478,27 +478,27 @@ def test_StructuredColumnTransformer() -> None: ["testdomain.com"], ), ( - "Links: http://www.example.com, but ignore https://www.mozilla.com", + "Links: http://www.example.com but ignore https://www.mozilla.com", ["http://www.example.com"], ["example.com"], ), ( - "Check out https://example.org ,sign up on www.anothersite.net and visit mozilla.org", + "Check out https://example.org ,sign up on www.anothersite.net and proceed to https://firefox.mozilla.org", ["https://example.org", "www.anothersite.net"], ["example.org", "anothersite.net"], ), ( - "Visit https://www.example.org.uk ,sign up on www.anothersite.net.ac and visit firefox.mozilla.org", + "Visit https://www.example.org.uk ,sign up on www.anothersite.net.ac and proceed to www.test.mozilla.org", ["https://www.example.org.uk", "www.anothersite.net.ac"], ["example.org.uk", "anothersite.net.ac"], ), ( - "Check out http://example.com/a/abc/cat.jpg ,sign up on www.anothersite.net/abc/cde and visit mozilla.org/signup", + "Check out http://example.com/a/abc/cat.jpg ,sign up on www.anothersite.net/abc/cde and proceed to https://firefox.mozilla.com/download/macos", ["http://example.com/a/abc/cat.jpg", "www.anothersite.net/abc/cde"], - ["example.com", "anothersite.net.ac"], + ["example.com", "anothersite.net"], ), ( - "Visit https://www.example.org.uk/a/abc/cat.jpg ,sign up on www.anothersite.net.ac/abc/cde and visit shop.mozilla.org/signup", + "Visit https://www.example.org.uk/a/abc/cat.jpg ,sign up on www.anothersite.net.ac/abc/cde and visit https://www.mozilla.com/signup", [ "https://www.example.org.uk/a/abc/cat.jpg", "www.anothersite.net.ac/abc/cde", @@ -507,7 +507,6 @@ def test_StructuredColumnTransformer() -> None: ), ], ) -@pytest.mark.xfail(reason="The function to extract does not function as expected") def test_url_extraction_ignore_domains(test_input, expected_urls, expected_domains): """Tests extraction of URLs and domains while ignoring some domains""" domains_to_ignore = {"mozilla.com", "mozilla.org"} @@ -522,48 +521,55 @@ def test_url_extraction_ignore_domains(test_input, expected_urls, expected_domai [ ("This is a sample text without any links.", [], []), ( - "Visit http://www.testdomain.com for more info.", - ["http://www.testdomain.com"], + "Visit https://www.testdomain.com for more info.", + ["https://www.testdomain.com"], ["testdomain.com"], ), ( - "Links: https://www.example.com, but do not ignore https://mozilla.com", - ["https://www.example.com", "https://mozilla.com"], + "Links: http://www.example.com , but do not ignore https://www.mozilla.com", + ["http://www.example.com", "https://www.mozilla.com"], ["example.com", "mozilla.com"], ), ( - "Check out http://www.example.org ,sign up on www.anothersite.net and visit mozilla.org", - ["http://www.example.org", "www.anothersite.net", "mozilla.org"], + "Check out https://example.org ,sign up on www.anothersite.net and proceed to https://firefox.mozilla.org", + [ + "https://example.org", + "www.anothersite.net", + "https://firefox.mozilla.org", + ], ["example.org", "anothersite.net", "mozilla.org"], ), ( - "Visit http://example.org.uk ,sign up on www.anothersite.net.ac and visit firefox.mozilla.org", - ["http://example.org.uk", "www.anothersite.net.ac", "firefox.mozilla.org"], - ["example.org", "anothersite.net.ac", "mozilla.org"], + "Visit https://www.example.org.uk ,sign up on www.anothersite.net.ac and proceed to www.test.mozilla.org", + [ + "https://www.example.org.uk", + "www.anothersite.net.ac", + "www.test.mozilla.org", + ], + ["example.org.uk", "anothersite.net.ac", "mozilla.org"], ), ( - "Check out https://www.example.com/a/abc/cat.jpg ,sign up on www.anothersite.net/abc/cde and visit mozilla.org/signup", + "Check out http://example.com/a/abc/cat.jpg ,sign up on www.anothersite.net/abc/cde and proceed to https://firefox.mozilla.com/download/macos", [ - "https://www.example.com/a/abc/cat.jpg", + "http://example.com/a/abc/cat.jpg", "www.anothersite.net/abc/cde", - "mozilla.org/signup", + "https://firefox.mozilla.com/download/macos", ], - ["example.com", "anothersite.net.ac", "mozilla.org.uk"], + ["example.com", "anothersite.net", "mozilla.com"], ), ( - "Visit http://example.org.uk/a/abc/cat.jpg ,sign up on www.anothersite.net.ac/abc/cde and visit shop.mozilla.org.uk/signup", + "Visit http://www.example.org.uk/a/abc/cat.jpg ,sign up on www.anothersite.net.ac/abc/cde and visit https://www.mozilla.com/signup", [ - "http://example.org.uk/a/abc/cat.jpg", + "http://www.example.org.uk/a/abc/cat.jpg", "www.anothersite.net.ac/abc/cde", - "shop.mozilla.org.uk/signup", + "https://www.mozilla.com/signup", ], - ["example.org.uk", "anothersite.net.ac", "shop.mozilla.org.uk"], + ["example.org.uk", "anothersite.net.ac", "mozilla.com"], ), ], ) -@pytest.mark.xfail(reason="The function to extract does not function as expected") -def test_url_extraction_domains(test_input, expected_urls, expected_domains): - """Tests extraction of URLs and domains without ignoring any domains""" +def test_url_extraction(test_input, expected_urls, expected_domains): + """Tests extraction of URLs and domains without ignoring domains""" result = utils.extract_urls_and_domains(test_input) assert result["urls"] == expected_urls From 45097dafe6565186137174b819ca0a4e372e883c Mon Sep 17 00:00:00 2001 From: John P Date: Thu, 22 Feb 2024 20:54:09 +0300 Subject: [PATCH 34/59] Include tld_extract library --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index a7fff04333..0ef2acc664 100644 --- a/requirements.txt +++ b/requirements.txt @@ -28,6 +28,7 @@ shap[plots]==0.44.1 tabulate==0.9.0 taskcluster==60.3.4 tenacity==8.2.3 +tldextract==5.1.1 tqdm==4.66.1 xgboost==2.0.3 zstandard==0.22.0 From 0a06ea3efe3c7076fe71eb5261d4f9c24acff904 Mon Sep 17 00:00:00 2001 From: John P Date: Thu, 22 Feb 2024 21:46:38 +0300 Subject: [PATCH 35/59] Test without scale_pos_weight --- bugbug/models/spamcomment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bugbug/models/spamcomment.py b/bugbug/models/spamcomment.py index 5ab728523b..c87cf71cef 100644 --- a/bugbug/models/spamcomment.py +++ b/bugbug/models/spamcomment.py @@ -27,7 +27,7 @@ def __init__(self, lemmatization=True): self.calculate_importance = False - self.use_scale_pos_weight = True + self.use_scale_pos_weight = False feature_extractors = [ comment_features.NumberOfLinks(SAFE_DOMAINS), From e1937640def5e1177589b3f03676c9063827de88 Mon Sep 17 00:00:00 2001 From: John P Date: Fri, 23 Feb 2024 00:58:39 +0300 Subject: [PATCH 36/59] Test with n_estimators changed --- bugbug/models/spamcomment.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bugbug/models/spamcomment.py b/bugbug/models/spamcomment.py index c87cf71cef..1a15ccbe29 100644 --- a/bugbug/models/spamcomment.py +++ b/bugbug/models/spamcomment.py @@ -78,7 +78,9 @@ def __init__(self, lemmatization=True): ), ( "estimator", - xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count()), + xgboost.XGBClassifier( + n_jobs=utils.get_physical_cpu_count(), n_estimators=200 + ), ), ] ) From dda9b9559355f9d9089c6d752a3b4bff3bc9a09c Mon Sep 17 00:00:00 2001 From: John P Date: Fri, 23 Feb 2024 09:40:23 +0300 Subject: [PATCH 37/59] Test with GridSearch CV Values --- bugbug/models/spamcomment.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/bugbug/models/spamcomment.py b/bugbug/models/spamcomment.py index 1a15ccbe29..79a89af47a 100644 --- a/bugbug/models/spamcomment.py +++ b/bugbug/models/spamcomment.py @@ -79,7 +79,18 @@ def __init__(self, lemmatization=True): ( "estimator", xgboost.XGBClassifier( - n_jobs=utils.get_physical_cpu_count(), n_estimators=200 + n_jobs=utils.get_physical_cpu_count(), + learning_rate=0.01, + n_estimators=1000, + max_depth=4, + min_child_weight=6, + gamma=0, + subsample=0.8, + colsample_bytree=0.8, + reg_alpha=0.005, + objective="binary:logistic", + nthread=4, + scale_pos_weight=1, ), ), ] From 5ba0c225278b85b0407e0fc1103f9964a6cd11af Mon Sep 17 00:00:00 2001 From: John P Date: Fri, 23 Feb 2024 10:33:53 +0300 Subject: [PATCH 38/59] Remove scale_pos_weight from model.py --- bugbug/model.py | 17 ----------------- bugbug/models/spamcomment.py | 2 -- 2 files changed, 19 deletions(-) diff --git a/bugbug/model.py b/bugbug/model.py index b6095e97a4..2f8c469639 100644 --- a/bugbug/model.py +++ b/bugbug/model.py @@ -174,8 +174,6 @@ def __init__(self, lemmatization=False): self.store_dataset = False - self.use_scale_pos_weight = False - self.entire_dataset_training = False # DBs required for training. @@ -392,21 +390,6 @@ def train(self, importance_cutoff=0.15, limit=None): # Split dataset in training and test. X_train, X_test, y_train, y_test = self.train_test_split(X, y) - # Use scale_pos_weight to help in extremely imbalanced datasets - if self.use_scale_pos_weight and is_binary: - negative_samples = sum(label == 0 for label in y_train) - positive_samples = sum(label == 1 for label in y_train) - logger.info("Negative Samples: %d", negative_samples) - logger.info("Positive Samples: %d", positive_samples) - - scale_pos_weight = float(negative_samples / positive_samples) - - logger.info("Scale Pos Weight: %d", scale_pos_weight) - - self.clf.named_steps["estimator"].set_params( - scale_pos_weight=scale_pos_weight - ) - tracking_metrics = {} # Use k-fold cross validation to evaluate results. diff --git a/bugbug/models/spamcomment.py b/bugbug/models/spamcomment.py index 79a89af47a..ab60e0d2ff 100644 --- a/bugbug/models/spamcomment.py +++ b/bugbug/models/spamcomment.py @@ -27,8 +27,6 @@ def __init__(self, lemmatization=True): self.calculate_importance = False - self.use_scale_pos_weight = False - feature_extractors = [ comment_features.NumberOfLinks(SAFE_DOMAINS), comment_features.WordCount(), From ca16b983ca8020f834da3efa498dd0e94434afea Mon Sep 17 00:00:00 2001 From: John P Date: Fri, 23 Feb 2024 11:42:43 +0300 Subject: [PATCH 39/59] Set n_estimators to 1000 --- bugbug/models/spamcomment.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/bugbug/models/spamcomment.py b/bugbug/models/spamcomment.py index ab60e0d2ff..007e7abb40 100644 --- a/bugbug/models/spamcomment.py +++ b/bugbug/models/spamcomment.py @@ -80,15 +80,6 @@ def __init__(self, lemmatization=True): n_jobs=utils.get_physical_cpu_count(), learning_rate=0.01, n_estimators=1000, - max_depth=4, - min_child_weight=6, - gamma=0, - subsample=0.8, - colsample_bytree=0.8, - reg_alpha=0.005, - objective="binary:logistic", - nthread=4, - scale_pos_weight=1, ), ), ] From 18d18f0261becd27cb55cfd168cae6dd50386e15 Mon Sep 17 00:00:00 2001 From: John P Date: Fri, 23 Feb 2024 13:21:06 +0300 Subject: [PATCH 40/59] Revert "Remove scale_pos_weight from model.py" This reverts commit 5ba0c225278b85b0407e0fc1103f9964a6cd11af. --- bugbug/model.py | 17 +++++++++++++++++ bugbug/models/spamcomment.py | 2 ++ 2 files changed, 19 insertions(+) diff --git a/bugbug/model.py b/bugbug/model.py index 2f8c469639..b6095e97a4 100644 --- a/bugbug/model.py +++ b/bugbug/model.py @@ -174,6 +174,8 @@ def __init__(self, lemmatization=False): self.store_dataset = False + self.use_scale_pos_weight = False + self.entire_dataset_training = False # DBs required for training. @@ -390,6 +392,21 @@ def train(self, importance_cutoff=0.15, limit=None): # Split dataset in training and test. X_train, X_test, y_train, y_test = self.train_test_split(X, y) + # Use scale_pos_weight to help in extremely imbalanced datasets + if self.use_scale_pos_weight and is_binary: + negative_samples = sum(label == 0 for label in y_train) + positive_samples = sum(label == 1 for label in y_train) + logger.info("Negative Samples: %d", negative_samples) + logger.info("Positive Samples: %d", positive_samples) + + scale_pos_weight = float(negative_samples / positive_samples) + + logger.info("Scale Pos Weight: %d", scale_pos_weight) + + self.clf.named_steps["estimator"].set_params( + scale_pos_weight=scale_pos_weight + ) + tracking_metrics = {} # Use k-fold cross validation to evaluate results. diff --git a/bugbug/models/spamcomment.py b/bugbug/models/spamcomment.py index 007e7abb40..d5c94568ff 100644 --- a/bugbug/models/spamcomment.py +++ b/bugbug/models/spamcomment.py @@ -27,6 +27,8 @@ def __init__(self, lemmatization=True): self.calculate_importance = False + self.use_scale_pos_weight = False + feature_extractors = [ comment_features.NumberOfLinks(SAFE_DOMAINS), comment_features.WordCount(), From 1d35968ad1e074d76f8f6bb0ec3ff54df33edf74 Mon Sep 17 00:00:00 2001 From: John P Date: Fri, 23 Feb 2024 14:20:02 +0300 Subject: [PATCH 41/59] Remove comments which have 'redacted- --- bugbug/model.py | 3 ++- bugbug/models/spamcomment.py | 13 +++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/bugbug/model.py b/bugbug/model.py index b6095e97a4..c3ef9ec836 100644 --- a/bugbug/model.py +++ b/bugbug/model.py @@ -6,6 +6,7 @@ import logging import pickle from collections import defaultdict +from math import sqrt from os import makedirs, path from typing import Any @@ -399,7 +400,7 @@ def train(self, importance_cutoff=0.15, limit=None): logger.info("Negative Samples: %d", negative_samples) logger.info("Positive Samples: %d", positive_samples) - scale_pos_weight = float(negative_samples / positive_samples) + scale_pos_weight = sqrt(negative_samples / positive_samples) logger.info("Scale Pos Weight: %d", scale_pos_weight) diff --git a/bugbug/models/spamcomment.py b/bugbug/models/spamcomment.py index d5c94568ff..f89af7dd54 100644 --- a/bugbug/models/spamcomment.py +++ b/bugbug/models/spamcomment.py @@ -6,7 +6,7 @@ import logging import xgboost -from imblearn.over_sampling import SMOTE +from imblearn.over_sampling import BorderlineSMOTE from imblearn.pipeline import Pipeline as ImblearnPipeline from sklearn.compose import ColumnTransformer from sklearn.feature_extraction import DictVectorizer @@ -27,12 +27,12 @@ def __init__(self, lemmatization=True): self.calculate_importance = False - self.use_scale_pos_weight = False + self.use_scale_pos_weight = True feature_extractors = [ comment_features.NumberOfLinks(SAFE_DOMAINS), comment_features.WordCount(), - comment_features.HourOfDay(), + # comment_features.HourOfDay(), comment_features.DayOfYear(), comment_features.WeekOfYear(), comment_features.Weekday(), @@ -74,14 +74,14 @@ def __init__(self, lemmatization=True): ), ( "sampler", - SMOTE(random_state=0), + BorderlineSMOTE(random_state=0), ), ( "estimator", xgboost.XGBClassifier( n_jobs=utils.get_physical_cpu_count(), learning_rate=0.01, - n_estimators=1000, + n_estimators=500, ), ), ] @@ -117,11 +117,12 @@ def get_labels(self): # Skip the first comment because most first comments may contain links. # Skip comments filed by Mozillians and bots, since we are sure they are not spam. - # Skip comments whose text has been removed. + # Skip comments whose text has been removed or redacted. if any( [ comment["count"] == "0", "@mozilla" in comment["creator"], + "redacted -" in comment["text"], "(comment removed)" in comment["text"], ] ): From 0a21b61dbccdd44107af4066293040ae62b612c5 Mon Sep 17 00:00:00 2001 From: John P Date: Sun, 25 Feb 2024 20:19:49 +0300 Subject: [PATCH 42/59] Test with new parameters --- bugbug/model.py | 2 +- bugbug/models/spamcomment.py | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/bugbug/model.py b/bugbug/model.py index c3ef9ec836..5dd497a3dc 100644 --- a/bugbug/model.py +++ b/bugbug/model.py @@ -400,7 +400,7 @@ def train(self, importance_cutoff=0.15, limit=None): logger.info("Negative Samples: %d", negative_samples) logger.info("Positive Samples: %d", positive_samples) - scale_pos_weight = sqrt(negative_samples / positive_samples) + scale_pos_weight = sqrt(negative_samples / positive_samples) / 2 logger.info("Scale Pos Weight: %d", scale_pos_weight) diff --git a/bugbug/models/spamcomment.py b/bugbug/models/spamcomment.py index f89af7dd54..efedeb443f 100644 --- a/bugbug/models/spamcomment.py +++ b/bugbug/models/spamcomment.py @@ -6,7 +6,7 @@ import logging import xgboost -from imblearn.over_sampling import BorderlineSMOTE +from imblearn.over_sampling import SMOTE from imblearn.pipeline import Pipeline as ImblearnPipeline from sklearn.compose import ColumnTransformer from sklearn.feature_extraction import DictVectorizer @@ -74,13 +74,12 @@ def __init__(self, lemmatization=True): ), ( "sampler", - BorderlineSMOTE(random_state=0), + SMOTE(random_state=0), ), ( "estimator", xgboost.XGBClassifier( n_jobs=utils.get_physical_cpu_count(), - learning_rate=0.01, n_estimators=500, ), ), From 00a9f9f16176b847177c88c0368a7ba8a00c556c Mon Sep 17 00:00:00 2001 From: John P Date: Sun, 25 Feb 2024 22:06:27 +0300 Subject: [PATCH 43/59] Change df --- bugbug/models/spamcomment.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/bugbug/models/spamcomment.py b/bugbug/models/spamcomment.py index efedeb443f..55ae6e3723 100644 --- a/bugbug/models/spamcomment.py +++ b/bugbug/models/spamcomment.py @@ -66,7 +66,7 @@ def __init__(self, lemmatization=True): ("data", DictVectorizer(), "data"), ( "comment_text", - self.text_vectorizer(min_df=0.0001), + self.text_vectorizer(min_df=0.001), "comment_text", ), ] @@ -117,11 +117,13 @@ def get_labels(self): # Skip the first comment because most first comments may contain links. # Skip comments filed by Mozillians and bots, since we are sure they are not spam. # Skip comments whose text has been removed or redacted. + # TODO: Skip comments for Mozillians using non-Mozilla email domains. if any( [ comment["count"] == "0", "@mozilla" in comment["creator"], - "redacted -" in comment["text"], + "@softvision" in comment["creator"], + "[redacted -" in comment["text"], "(comment removed)" in comment["text"], ] ): From f55d137750351625c4666a505fd0d2f7c3b1988f Mon Sep 17 00:00:00 2001 From: John P Date: Mon, 26 Feb 2024 16:12:20 +0300 Subject: [PATCH 44/59] Test: Include tags as feature --- bugbug/models/spamcomment.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bugbug/models/spamcomment.py b/bugbug/models/spamcomment.py index 55ae6e3723..59c63346e1 100644 --- a/bugbug/models/spamcomment.py +++ b/bugbug/models/spamcomment.py @@ -32,7 +32,8 @@ def __init__(self, lemmatization=True): feature_extractors = [ comment_features.NumberOfLinks(SAFE_DOMAINS), comment_features.WordCount(), - # comment_features.HourOfDay(), + comment_features.CommentTags({"spam"}), + comment_features.HourOfDay(), comment_features.DayOfYear(), comment_features.WeekOfYear(), comment_features.Weekday(), From dbcb311c4b526da19197d9f32c2c18c70233c978 Mon Sep 17 00:00:00 2001 From: John P Date: Mon, 26 Feb 2024 17:56:37 +0300 Subject: [PATCH 45/59] Exclude comment tags --- bugbug/models/spamcomment.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/bugbug/models/spamcomment.py b/bugbug/models/spamcomment.py index 59c63346e1..18fc2f5d46 100644 --- a/bugbug/models/spamcomment.py +++ b/bugbug/models/spamcomment.py @@ -32,7 +32,6 @@ def __init__(self, lemmatization=True): feature_extractors = [ comment_features.NumberOfLinks(SAFE_DOMAINS), comment_features.WordCount(), - comment_features.CommentTags({"spam"}), comment_features.HourOfDay(), comment_features.DayOfYear(), comment_features.WeekOfYear(), @@ -67,7 +66,7 @@ def __init__(self, lemmatization=True): ("data", DictVectorizer(), "data"), ( "comment_text", - self.text_vectorizer(min_df=0.001), + self.text_vectorizer(min_df=0.0015), "comment_text", ), ] From 1b437da932bbfe34dbbf23fb60d942436be2af34 Mon Sep 17 00:00:00 2001 From: John P Date: Tue, 27 Feb 2024 15:00:18 +0300 Subject: [PATCH 46/59] Exclude emails from commit authors --- bugbug/models/spamcomment.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/bugbug/models/spamcomment.py b/bugbug/models/spamcomment.py index 18fc2f5d46..e2462de18e 100644 --- a/bugbug/models/spamcomment.py +++ b/bugbug/models/spamcomment.py @@ -12,7 +12,7 @@ from sklearn.feature_extraction import DictVectorizer from sklearn.pipeline import Pipeline -from bugbug import bugzilla, comment_features, feature_cleanup, utils +from bugbug import bugzilla, comment_features, db, feature_cleanup, repository, utils from bugbug.model import CommentModel logging.basicConfig(level=logging.INFO) @@ -110,6 +110,13 @@ def get_labels(self): self.__download_older_bugs_with_spam_comments() + # Get emails of commit authors. Comments from such people will be skipped. + assert db.download(repository.COMMITS_DB) + commit_emails = { + commit["author_email"] + for commit in repository.get_commits(include_backouts=True) + } + for bug in bugzilla.get_bugs(): for comment in bug["comments"]: comment_id = comment["id"] @@ -117,10 +124,10 @@ def get_labels(self): # Skip the first comment because most first comments may contain links. # Skip comments filed by Mozillians and bots, since we are sure they are not spam. # Skip comments whose text has been removed or redacted. - # TODO: Skip comments for Mozillians using non-Mozilla email domains. if any( [ comment["count"] == "0", + comment["creator"] in commit_emails, "@mozilla" in comment["creator"], "@softvision" in comment["creator"], "[redacted -" in comment["text"], From 16e14c5ec47b832c6d2a43b5eaf4cb278363b3a3 Mon Sep 17 00:00:00 2001 From: John P Date: Tue, 27 Feb 2024 15:52:24 +0300 Subject: [PATCH 47/59] Test without scale pos weight --- bugbug/model.py | 2 +- bugbug/models/spamcomment.py | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/bugbug/model.py b/bugbug/model.py index 5dd497a3dc..c3ef9ec836 100644 --- a/bugbug/model.py +++ b/bugbug/model.py @@ -400,7 +400,7 @@ def train(self, importance_cutoff=0.15, limit=None): logger.info("Negative Samples: %d", negative_samples) logger.info("Positive Samples: %d", positive_samples) - scale_pos_weight = sqrt(negative_samples / positive_samples) / 2 + scale_pos_weight = sqrt(negative_samples / positive_samples) logger.info("Scale Pos Weight: %d", scale_pos_weight) diff --git a/bugbug/models/spamcomment.py b/bugbug/models/spamcomment.py index e2462de18e..b1d78d4568 100644 --- a/bugbug/models/spamcomment.py +++ b/bugbug/models/spamcomment.py @@ -27,7 +27,7 @@ def __init__(self, lemmatization=True): self.calculate_importance = False - self.use_scale_pos_weight = True + self.use_scale_pos_weight = False feature_extractors = [ comment_features.NumberOfLinks(SAFE_DOMAINS), @@ -66,7 +66,7 @@ def __init__(self, lemmatization=True): ("data", DictVectorizer(), "data"), ( "comment_text", - self.text_vectorizer(min_df=0.0015), + self.text_vectorizer(min_df=0.0001), "comment_text", ), ] @@ -80,7 +80,6 @@ def __init__(self, lemmatization=True): "estimator", xgboost.XGBClassifier( n_jobs=utils.get_physical_cpu_count(), - n_estimators=500, ), ), ] From 94ab2835a353de53d1da68ba859001f54f95cc4d Mon Sep 17 00:00:00 2001 From: John P Date: Tue, 27 Feb 2024 17:03:58 +0300 Subject: [PATCH 48/59] Test with scale_pos_weight adjusted --- bugbug/model.py | 3 +-- bugbug/models/spamcomment.py | 10 ++++------ 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/bugbug/model.py b/bugbug/model.py index c3ef9ec836..8e2642ae12 100644 --- a/bugbug/model.py +++ b/bugbug/model.py @@ -6,7 +6,6 @@ import logging import pickle from collections import defaultdict -from math import sqrt from os import makedirs, path from typing import Any @@ -400,7 +399,7 @@ def train(self, importance_cutoff=0.15, limit=None): logger.info("Negative Samples: %d", negative_samples) logger.info("Positive Samples: %d", positive_samples) - scale_pos_weight = sqrt(negative_samples / positive_samples) + scale_pos_weight = (negative_samples / positive_samples) / 4 logger.info("Scale Pos Weight: %d", scale_pos_weight) diff --git a/bugbug/models/spamcomment.py b/bugbug/models/spamcomment.py index b1d78d4568..ce5f1d0a08 100644 --- a/bugbug/models/spamcomment.py +++ b/bugbug/models/spamcomment.py @@ -6,7 +6,7 @@ import logging import xgboost -from imblearn.over_sampling import SMOTE +from imblearn.over_sampling import BorderlineSMOTE from imblearn.pipeline import Pipeline as ImblearnPipeline from sklearn.compose import ColumnTransformer from sklearn.feature_extraction import DictVectorizer @@ -27,7 +27,7 @@ def __init__(self, lemmatization=True): self.calculate_importance = False - self.use_scale_pos_weight = False + self.use_scale_pos_weight = True feature_extractors = [ comment_features.NumberOfLinks(SAFE_DOMAINS), @@ -74,13 +74,11 @@ def __init__(self, lemmatization=True): ), ( "sampler", - SMOTE(random_state=0), + BorderlineSMOTE(random_state=0), ), ( "estimator", - xgboost.XGBClassifier( - n_jobs=utils.get_physical_cpu_count(), - ), + xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count()), ), ] ) From 5a58108969d792b9b46d70639b4271f166637639 Mon Sep 17 00:00:00 2001 From: John P Date: Wed, 28 Feb 2024 11:45:36 +0300 Subject: [PATCH 49/59] Adjust scale pos weight --- bugbug/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bugbug/model.py b/bugbug/model.py index 8e2642ae12..4e76eed1d2 100644 --- a/bugbug/model.py +++ b/bugbug/model.py @@ -399,7 +399,7 @@ def train(self, importance_cutoff=0.15, limit=None): logger.info("Negative Samples: %d", negative_samples) logger.info("Positive Samples: %d", positive_samples) - scale_pos_weight = (negative_samples / positive_samples) / 4 + scale_pos_weight = (negative_samples / positive_samples) / 10 logger.info("Scale Pos Weight: %d", scale_pos_weight) From 3eab9887b5a04e2a90058164a76cd8f28080d574 Mon Sep 17 00:00:00 2001 From: John P Date: Fri, 1 Mar 2024 21:26:25 +0300 Subject: [PATCH 50/59] Test wihout WeekOfYear --- bugbug/models/spamcomment.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bugbug/models/spamcomment.py b/bugbug/models/spamcomment.py index ce5f1d0a08..83d8da276e 100644 --- a/bugbug/models/spamcomment.py +++ b/bugbug/models/spamcomment.py @@ -34,7 +34,6 @@ def __init__(self, lemmatization=True): comment_features.WordCount(), comment_features.HourOfDay(), comment_features.DayOfYear(), - comment_features.WeekOfYear(), comment_features.Weekday(), comment_features.UnknownLinkAtBeginning(SAFE_DOMAINS), comment_features.UnknownLinkAtEnd(SAFE_DOMAINS), From bd16d56d869c73e84fc7be3d290a8b1d84b68b37 Mon Sep 17 00:00:00 2001 From: John P Date: Wed, 6 Mar 2024 18:22:09 +0300 Subject: [PATCH 51/59] Include comment classifier --- bugbug/bugzilla.py | 10 +++++ bugbug/models/spamcomment.py | 39 ++++++++++++------ scripts/comment_classifier.py | 77 +++++++++++++++++++++++++++++++++++ 3 files changed, 114 insertions(+), 12 deletions(-) create mode 100644 scripts/comment_classifier.py diff --git a/bugbug/bugzilla.py b/bugbug/bugzilla.py index ab9bfd1f90..34efcbe063 100644 --- a/bugbug/bugzilla.py +++ b/bugbug/bugzilla.py @@ -407,6 +407,16 @@ def get_groups_users(group_names: list[str]) -> list[str]: ] +def get_comment(comment_id) -> dict: + r = utils.get_session("bugzilla").get( + f"https://bugzilla.mozilla.org/rest/bug/comment/{comment_id}", + headers={"X-Bugzilla-API-Key": Bugzilla.TOKEN, "User-Agent": "bugbug"}, + ) + r.raise_for_status() + + return r.json()["comments"][str(comment_id)] + + def get_revision_ids(bug: BugDict) -> list[int]: revision_ids = [] diff --git a/bugbug/models/spamcomment.py b/bugbug/models/spamcomment.py index 83d8da276e..1af17badf0 100644 --- a/bugbug/models/spamcomment.py +++ b/bugbug/models/spamcomment.py @@ -101,31 +101,46 @@ def __download_older_bugs_with_spam_comments() -> None: logger.info("%d older bugs have been downloaded.", len(older_bugs)) - def get_labels(self): - classes = {} - - self.__download_older_bugs_with_spam_comments() + @staticmethod + def __safe_comment(comment) -> bool: + """Determines if a comment is certainly safe (not spam) and should be excluded from the training set. - # Get emails of commit authors. Comments from such people will be skipped. + This function applies filtering rules to identify comments that are likely + authored by legitimate Mozillians or bots. + """ + # Get emails of commit authors. assert db.download(repository.COMMITS_DB) commit_emails = { commit["author_email"] for commit in repository.get_commits(include_backouts=True) } + # Ignore comments filed by Mozillians and bots, since we are sure they are not spam. + if any( + [ + comment["creator"] in commit_emails, + "@mozilla" in comment["creator"], + "@softvision" in comment["creator"], + ] + ): + return True + + return False + + def get_labels(self): + classes = {} + + self.__download_older_bugs_with_spam_comments() + for bug in bugzilla.get_bugs(): for comment in bug["comments"]: comment_id = comment["id"] - # Skip the first comment because most first comments may contain links. - # Skip comments filed by Mozillians and bots, since we are sure they are not spam. - # Skip comments whose text has been removed or redacted. if any( [ comment["count"] == "0", - comment["creator"] in commit_emails, - "@mozilla" in comment["creator"], - "@softvision" in comment["creator"], + self.__safe_comment(comment), + comment["creator"] == bug["creator"], "[redacted -" in comment["text"], "(comment removed)" in comment["text"], ] @@ -161,7 +176,7 @@ def get_feature_names(self): def overwrite_classes(self, comments, classes, probabilities): for i, comment in enumerate(comments): - if "@mozilla" in comment["creator"]: + if self.__safe_comment(comment): if probabilities: classes[i] = [1.0, 0.0] else: diff --git a/scripts/comment_classifier.py b/scripts/comment_classifier.py new file mode 100644 index 0000000000..1fb163345f --- /dev/null +++ b/scripts/comment_classifier.py @@ -0,0 +1,77 @@ +# -*- coding: utf-8 -*- + +import argparse +import os +from logging import INFO, basicConfig, getLogger + +import numpy as np +import requests + +from bugbug import bugzilla +from bugbug.models import get_model_class +from bugbug.utils import download_model + +basicConfig(level=INFO) +logger = getLogger(__name__) + + +def classify_comments(model_name: str, comment_id: int) -> None: + model_file_name = f"{model_name}model" + + if not os.path.exists(model_file_name): + logger.info("%s does not exist. Downloading the model....", model_file_name) + try: + download_model(model_name) + except requests.HTTPError: + logger.error( + "A pre-trained model is not available, you will need to train it yourself using the trainer script" + ) + raise SystemExit(1) + + model_class = get_model_class(model_name) + model = model_class.load(model_file_name) + + if comment_id: + # Get a comment by its id + comment = bugzilla.get_comment(comment_id) + assert comment, f"A comment with a comment id of {comment_id} was not found" + + print( + f'https://bugzilla.mozilla.org/show_bug.cgi?id={comment["bug_id"]}#{comment["count"]}' + ) + + if model.calculate_importance: + probas, importance = model.classify( + comment, probabilities=True, importances=True + ) + + model.print_feature_importances( + importance["importances"], class_probabilities=probas + ) + else: + probas = model.classify(comment, probabilities=True, importances=False) + + probability = probas[0] + pred_index = np.argmax(probability) + if len(probability) > 2: + pred_class = model.le.inverse_transform([pred_index])[0] + else: + pred_class = "Positive" if pred_index == 1 else "Negative" + print(f"{pred_class} {probability}") + input() + + +def main() -> None: + description = "Perform evaluation on comments using the specified model" + parser = argparse.ArgumentParser(description=description) + + parser.add_argument("model", help="Which model to use for evaluation") + parser.add_argument("--comment-id", help="Classify the given comment id", type=int) + + args = parser.parse_args() + + classify_comments(args.model, args.comment_id) + + +if __name__ == "__main__": + main() From 0a11f3c975a5399fe9f67c4c68f1a30815bea349 Mon Sep 17 00:00:00 2001 From: John P Date: Wed, 6 Mar 2024 20:55:22 +0300 Subject: [PATCH 52/59] Include script in setup --- bugbug/models/spamcomment.py | 19 ++++++++----------- scripts/comment_classifier.py | 4 ++-- setup.py | 1 + 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/bugbug/models/spamcomment.py b/bugbug/models/spamcomment.py index 1af17badf0..03730c78f7 100644 --- a/bugbug/models/spamcomment.py +++ b/bugbug/models/spamcomment.py @@ -27,7 +27,7 @@ def __init__(self, lemmatization=True): self.calculate_importance = False - self.use_scale_pos_weight = True + self.use_scale_pos_weight = False feature_extractors = [ comment_features.NumberOfLinks(SAFE_DOMAINS), @@ -102,11 +102,11 @@ def __download_older_bugs_with_spam_comments() -> None: logger.info("%d older bugs have been downloaded.", len(older_bugs)) @staticmethod - def __safe_comment(comment) -> bool: - """Determines if a comment is certainly safe (not spam) and should be excluded from the training set. + def __is_safe_comment(comment) -> bool: + """Determines if a comment is certainly safe (not spam) based on certain conditions. This function applies filtering rules to identify comments that are likely - authored by legitimate Mozillians or bots. + authored by legitimate contributors or bots. Such comments are definitely not spam. """ # Get emails of commit authors. assert db.download(repository.COMMITS_DB) @@ -116,16 +116,13 @@ def __safe_comment(comment) -> bool: } # Ignore comments filed by Mozillians and bots, since we are sure they are not spam. - if any( + return any( [ comment["creator"] in commit_emails, "@mozilla" in comment["creator"], "@softvision" in comment["creator"], ] - ): - return True - - return False + ) def get_labels(self): classes = {} @@ -139,7 +136,7 @@ def get_labels(self): if any( [ comment["count"] == "0", - self.__safe_comment(comment), + self.__is_safe_comment(comment), comment["creator"] == bug["creator"], "[redacted -" in comment["text"], "(comment removed)" in comment["text"], @@ -176,7 +173,7 @@ def get_feature_names(self): def overwrite_classes(self, comments, classes, probabilities): for i, comment in enumerate(comments): - if self.__safe_comment(comment): + if self.__is_safe_comment(comment): if probabilities: classes[i] = [1.0, 0.0] else: diff --git a/scripts/comment_classifier.py b/scripts/comment_classifier.py index 1fb163345f..682219a661 100644 --- a/scripts/comment_classifier.py +++ b/scripts/comment_classifier.py @@ -15,7 +15,7 @@ logger = getLogger(__name__) -def classify_comments(model_name: str, comment_id: int) -> None: +def classify_comment(model_name: str, comment_id: int) -> None: model_file_name = f"{model_name}model" if not os.path.exists(model_file_name): @@ -70,7 +70,7 @@ def main() -> None: args = parser.parse_args() - classify_comments(args.model, args.comment_id) + classify_comment(args.model, args.comment_id) if __name__ == "__main__": diff --git a/setup.py b/setup.py index 22919e5481..f0bc40e43c 100644 --- a/setup.py +++ b/setup.py @@ -52,6 +52,7 @@ def read_requirements(file_): "bugbug-maintenance-effectiveness-indicator = scripts.maintenance_effectiveness_indicator:main", "bugbug-microannotate-generate = scripts.microannotate_generator:main", "bugbug-classify-commit = scripts.commit_classifier:main", + "bugbug-classify-comment scripts.comment_classifier:main", "bugbug-classify-bug = scripts.bug_classifier:main", "bugbug-regressor-finder = scripts.regressor_finder:main", "bugbug-retrieve-training-metrics = scripts.retrieve_training_metrics:main", From a3956b4918599737a7420554f9e037bbd11cc4ea Mon Sep 17 00:00:00 2001 From: John P Date: Wed, 6 Mar 2024 20:57:42 +0300 Subject: [PATCH 53/59] Fix script error --- bugbug/models/spamcomment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bugbug/models/spamcomment.py b/bugbug/models/spamcomment.py index 03730c78f7..bea7ff3a79 100644 --- a/bugbug/models/spamcomment.py +++ b/bugbug/models/spamcomment.py @@ -27,7 +27,7 @@ def __init__(self, lemmatization=True): self.calculate_importance = False - self.use_scale_pos_weight = False + self.use_scale_pos_weight = True feature_extractors = [ comment_features.NumberOfLinks(SAFE_DOMAINS), From 5c93d238043cac71dfd91a80395f3ed65abed047 Mon Sep 17 00:00:00 2001 From: John P Date: Wed, 6 Mar 2024 21:07:12 +0300 Subject: [PATCH 54/59] Fix setup error --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index f0bc40e43c..d80bc620ea 100644 --- a/setup.py +++ b/setup.py @@ -52,7 +52,7 @@ def read_requirements(file_): "bugbug-maintenance-effectiveness-indicator = scripts.maintenance_effectiveness_indicator:main", "bugbug-microannotate-generate = scripts.microannotate_generator:main", "bugbug-classify-commit = scripts.commit_classifier:main", - "bugbug-classify-comment scripts.comment_classifier:main", + "bugbug-classify-comment = scripts.comment_classifier:main", "bugbug-classify-bug = scripts.bug_classifier:main", "bugbug-regressor-finder = scripts.regressor_finder:main", "bugbug-retrieve-training-metrics = scripts.retrieve_training_metrics:main", From 15c8d5ac3d2bc18bfacf273f08e8101f6748ff13 Mon Sep 17 00:00:00 2001 From: John P Date: Thu, 7 Mar 2024 11:35:27 +0300 Subject: [PATCH 55/59] Classify all comments --- bugbug/bugzilla.py | 2 +- bugbug/models/spamcomment.py | 24 ++++++++++-------------- scripts/comment_classifier.py | 23 ++++++++++++++++------- 3 files changed, 27 insertions(+), 22 deletions(-) diff --git a/bugbug/bugzilla.py b/bugbug/bugzilla.py index 34efcbe063..31cb960abd 100644 --- a/bugbug/bugzilla.py +++ b/bugbug/bugzilla.py @@ -414,7 +414,7 @@ def get_comment(comment_id) -> dict: ) r.raise_for_status() - return r.json()["comments"][str(comment_id)] + return r.json()["comments"] def get_revision_ids(bug: BugDict) -> list[int]: diff --git a/bugbug/models/spamcomment.py b/bugbug/models/spamcomment.py index bea7ff3a79..b3415afed9 100644 --- a/bugbug/models/spamcomment.py +++ b/bugbug/models/spamcomment.py @@ -12,7 +12,7 @@ from sklearn.feature_extraction import DictVectorizer from sklearn.pipeline import Pipeline -from bugbug import bugzilla, comment_features, db, feature_cleanup, repository, utils +from bugbug import bugzilla, comment_features, feature_cleanup, repository, utils from bugbug.model import CommentModel logging.basicConfig(level=logging.INFO) @@ -29,6 +29,11 @@ def __init__(self, lemmatization=True): self.use_scale_pos_weight = True + self.commit_emails = { + commit["author_email"] + for commit in repository.get_commits(include_backouts=True) + } + feature_extractors = [ comment_features.NumberOfLinks(SAFE_DOMAINS), comment_features.WordCount(), @@ -101,24 +106,15 @@ def __download_older_bugs_with_spam_comments() -> None: logger.info("%d older bugs have been downloaded.", len(older_bugs)) - @staticmethod - def __is_safe_comment(comment) -> bool: + def is_safe_comment(self, comment) -> bool: """Determines if a comment is certainly safe (not spam) based on certain conditions. This function applies filtering rules to identify comments that are likely authored by legitimate contributors or bots. Such comments are definitely not spam. """ - # Get emails of commit authors. - assert db.download(repository.COMMITS_DB) - commit_emails = { - commit["author_email"] - for commit in repository.get_commits(include_backouts=True) - } - - # Ignore comments filed by Mozillians and bots, since we are sure they are not spam. return any( [ - comment["creator"] in commit_emails, + comment["creator"] in self.commit_emails, "@mozilla" in comment["creator"], "@softvision" in comment["creator"], ] @@ -136,7 +132,7 @@ def get_labels(self): if any( [ comment["count"] == "0", - self.__is_safe_comment(comment), + self.is_safe_comment(comment), comment["creator"] == bug["creator"], "[redacted -" in comment["text"], "(comment removed)" in comment["text"], @@ -173,7 +169,7 @@ def get_feature_names(self): def overwrite_classes(self, comments, classes, probabilities): for i, comment in enumerate(comments): - if self.__is_safe_comment(comment): + if self.is_safe_comment(comment): if probabilities: classes[i] = [1.0, 0.0] else: diff --git a/scripts/comment_classifier.py b/scripts/comment_classifier.py index 682219a661..007a2ac0c1 100644 --- a/scripts/comment_classifier.py +++ b/scripts/comment_classifier.py @@ -7,7 +7,7 @@ import numpy as np import requests -from bugbug import bugzilla +from bugbug import bugzilla, db from bugbug.models import get_model_class from bugbug.utils import download_model @@ -15,7 +15,7 @@ logger = getLogger(__name__) -def classify_comment(model_name: str, comment_id: int) -> None: +def classify_comments(model_name: str, comment_id: int) -> None: model_file_name = f"{model_name}model" if not os.path.exists(model_file_name): @@ -33,11 +33,20 @@ def classify_comment(model_name: str, comment_id: int) -> None: if comment_id: # Get a comment by its id - comment = bugzilla.get_comment(comment_id) - assert comment, f"A comment with a comment id of {comment_id} was not found" - + comments = list(bugzilla.get_comment(comment_id).values()) + assert comments, f"A comment with a comment id of {comment_id} was not found" + else: + assert db.download(bugzilla.BUGS_DB) + bugs = bugzilla.get_bugs() + comments = [ + {**comment, "bug_id": bug["id"]} + for bug in bugs + for comment in bug["comments"] + ] + + for comment in comments: print( - f'https://bugzilla.mozilla.org/show_bug.cgi?id={comment["bug_id"]}#{comment["count"]}' + f'https://bugzilla.mozilla.org/show_bug.cgi?id={comment["bug_id"]}#c{comment["count"]}' ) if model.calculate_importance: @@ -70,7 +79,7 @@ def main() -> None: args = parser.parse_args() - classify_comment(args.model, args.comment_id) + classify_comments(args.model, args.comment_id) if __name__ == "__main__": From 5f953ac92304c3f7947ba077b39beaf4e1f1c188 Mon Sep 17 00:00:00 2001 From: John P Date: Wed, 13 Mar 2024 15:45:29 +0300 Subject: [PATCH 56/59] Include spamcom in model names --- bugbug/bugzilla.py | 1 + http_service/bugbug_http/models.py | 62 ++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+) diff --git a/bugbug/bugzilla.py b/bugbug/bugzilla.py index 31cb960abd..5fe8bb9a18 100644 --- a/bugbug/bugzilla.py +++ b/bugbug/bugzilla.py @@ -412,6 +412,7 @@ def get_comment(comment_id) -> dict: f"https://bugzilla.mozilla.org/rest/bug/comment/{comment_id}", headers={"X-Bugzilla-API-Key": Bugzilla.TOKEN, "User-Agent": "bugbug"}, ) + # TODO: Do we raise the error or we just ignore the comment? r.raise_for_status() return r.json()["comments"] diff --git a/http_service/bugbug_http/models.py b/http_service/bugbug_http/models.py index 8b90fe1cee..d6fe9d8662 100644 --- a/http_service/bugbug_http/models.py +++ b/http_service/bugbug_http/models.py @@ -37,6 +37,7 @@ "testgroupselect", "accessibility", "performancebug", + "spamcomment", ] DEFAULT_EXPIRATION_TTL = 7 * 24 * 3600 # A week @@ -182,6 +183,67 @@ def classify_issue( return "OK" +def classify_comment( + model_name: str, comment_ids: Sequence[int], bugzilla_token: str +) -> str: + from bugbug_http.app import JobInfo + + # This should be called in a process worker so it should be safe to set + # the token here + comment_ids_set = set(map(int, comment_ids)) + bugzilla.set_token(bugzilla_token) + + comments = { + comment_id: bugzilla.get_comment(comment_id).values() + for comment_id in comment_ids + } + + missing_comments = comment_ids_set.difference(comments.keys()) + + for comment_id in missing_comments: + job = JobInfo(classify_comment, model_name, comment_id) + + # TODO: Find a better error format + setkey(job.result_key, orjson.dumps({"available": False})) + + if not comments: + return "NOK" + + model = MODEL_CACHE.get(model_name) + + if not model: + LOGGER.info("Missing model %r, aborting" % model_name) + return "NOK" + + model_extra_data = model.get_extra_data() + + # TODO: Classify could choke on a single bug which could make the whole + # job to fails. What should we do here? + probs = model.classify(list(comments.values()), True) + indexes = probs.argmax(axis=-1) + suggestions = model.le.inverse_transform(indexes) + + probs_list = probs.tolist() + indexes_list = indexes.tolist() + suggestions_list = suggestions.tolist() + + for i, comment_id in enumerate(comments.keys()): + data = { + "prob": probs_list[i], + "index": indexes_list[i], + "class": suggestions_list[i], + "extra_data": model_extra_data, + } + + job = JobInfo(classify_comment, model_name, comment_id) + setkey(job.result_key, orjson.dumps(data), compress=True) + + # TODO: Save the comment last change + # We shall need to update one of the comment keys to show an updated comment + + return "OK" + + def classify_broken_site_report(model_name: str, reports_data: list[dict]) -> str: from bugbug_http.app import JobInfo From 4237f8fbc166d56c4b17ccd32d52b5036ab2eb62 Mon Sep 17 00:00:00 2001 From: John P Date: Thu, 14 Mar 2024 11:40:23 +0300 Subject: [PATCH 57/59] Remove comment independent files --- bugbug/bugzilla.py | 11 ---- http_service/bugbug_http/models.py | 62 --------------------- scripts/comment_classifier.py | 86 ------------------------------ setup.py | 1 - 4 files changed, 160 deletions(-) delete mode 100644 scripts/comment_classifier.py diff --git a/bugbug/bugzilla.py b/bugbug/bugzilla.py index 35427acf84..b47085a623 100644 --- a/bugbug/bugzilla.py +++ b/bugbug/bugzilla.py @@ -409,17 +409,6 @@ def get_groups_users(group_names: list[str]) -> list[str]: ] -def get_comment(comment_id) -> dict: - r = utils.get_session("bugzilla").get( - f"https://bugzilla.mozilla.org/rest/bug/comment/{comment_id}", - headers={"X-Bugzilla-API-Key": Bugzilla.TOKEN, "User-Agent": "bugbug"}, - ) - # TODO: Do we raise the error or we just ignore the comment? - r.raise_for_status() - - return r.json()["comments"] - - def get_revision_ids(bug: BugDict) -> list[int]: revision_ids = [] diff --git a/http_service/bugbug_http/models.py b/http_service/bugbug_http/models.py index f4e3589610..5f7c8b6ea8 100644 --- a/http_service/bugbug_http/models.py +++ b/http_service/bugbug_http/models.py @@ -37,7 +37,6 @@ "testgroupselect", "accessibility", "performancebug", - "spamcomment", "worksforme", ] @@ -184,67 +183,6 @@ def classify_issue( return "OK" -def classify_comment( - model_name: str, comment_ids: Sequence[int], bugzilla_token: str -) -> str: - from bugbug_http.app import JobInfo - - # This should be called in a process worker so it should be safe to set - # the token here - comment_ids_set = set(map(int, comment_ids)) - bugzilla.set_token(bugzilla_token) - - comments = { - comment_id: bugzilla.get_comment(comment_id).values() - for comment_id in comment_ids - } - - missing_comments = comment_ids_set.difference(comments.keys()) - - for comment_id in missing_comments: - job = JobInfo(classify_comment, model_name, comment_id) - - # TODO: Find a better error format - setkey(job.result_key, orjson.dumps({"available": False})) - - if not comments: - return "NOK" - - model = MODEL_CACHE.get(model_name) - - if not model: - LOGGER.info("Missing model %r, aborting" % model_name) - return "NOK" - - model_extra_data = model.get_extra_data() - - # TODO: Classify could choke on a single bug which could make the whole - # job to fails. What should we do here? - probs = model.classify(list(comments.values()), True) - indexes = probs.argmax(axis=-1) - suggestions = model.le.inverse_transform(indexes) - - probs_list = probs.tolist() - indexes_list = indexes.tolist() - suggestions_list = suggestions.tolist() - - for i, comment_id in enumerate(comments.keys()): - data = { - "prob": probs_list[i], - "index": indexes_list[i], - "class": suggestions_list[i], - "extra_data": model_extra_data, - } - - job = JobInfo(classify_comment, model_name, comment_id) - setkey(job.result_key, orjson.dumps(data), compress=True) - - # TODO: Save the comment last change - # We shall need to update one of the comment keys to show an updated comment - - return "OK" - - def classify_broken_site_report(model_name: str, reports_data: list[dict]) -> str: from bugbug_http.app import JobInfo diff --git a/scripts/comment_classifier.py b/scripts/comment_classifier.py deleted file mode 100644 index 007a2ac0c1..0000000000 --- a/scripts/comment_classifier.py +++ /dev/null @@ -1,86 +0,0 @@ -# -*- coding: utf-8 -*- - -import argparse -import os -from logging import INFO, basicConfig, getLogger - -import numpy as np -import requests - -from bugbug import bugzilla, db -from bugbug.models import get_model_class -from bugbug.utils import download_model - -basicConfig(level=INFO) -logger = getLogger(__name__) - - -def classify_comments(model_name: str, comment_id: int) -> None: - model_file_name = f"{model_name}model" - - if not os.path.exists(model_file_name): - logger.info("%s does not exist. Downloading the model....", model_file_name) - try: - download_model(model_name) - except requests.HTTPError: - logger.error( - "A pre-trained model is not available, you will need to train it yourself using the trainer script" - ) - raise SystemExit(1) - - model_class = get_model_class(model_name) - model = model_class.load(model_file_name) - - if comment_id: - # Get a comment by its id - comments = list(bugzilla.get_comment(comment_id).values()) - assert comments, f"A comment with a comment id of {comment_id} was not found" - else: - assert db.download(bugzilla.BUGS_DB) - bugs = bugzilla.get_bugs() - comments = [ - {**comment, "bug_id": bug["id"]} - for bug in bugs - for comment in bug["comments"] - ] - - for comment in comments: - print( - f'https://bugzilla.mozilla.org/show_bug.cgi?id={comment["bug_id"]}#c{comment["count"]}' - ) - - if model.calculate_importance: - probas, importance = model.classify( - comment, probabilities=True, importances=True - ) - - model.print_feature_importances( - importance["importances"], class_probabilities=probas - ) - else: - probas = model.classify(comment, probabilities=True, importances=False) - - probability = probas[0] - pred_index = np.argmax(probability) - if len(probability) > 2: - pred_class = model.le.inverse_transform([pred_index])[0] - else: - pred_class = "Positive" if pred_index == 1 else "Negative" - print(f"{pred_class} {probability}") - input() - - -def main() -> None: - description = "Perform evaluation on comments using the specified model" - parser = argparse.ArgumentParser(description=description) - - parser.add_argument("model", help="Which model to use for evaluation") - parser.add_argument("--comment-id", help="Classify the given comment id", type=int) - - args = parser.parse_args() - - classify_comments(args.model, args.comment_id) - - -if __name__ == "__main__": - main() diff --git a/setup.py b/setup.py index d80bc620ea..22919e5481 100644 --- a/setup.py +++ b/setup.py @@ -52,7 +52,6 @@ def read_requirements(file_): "bugbug-maintenance-effectiveness-indicator = scripts.maintenance_effectiveness_indicator:main", "bugbug-microannotate-generate = scripts.microannotate_generator:main", "bugbug-classify-commit = scripts.commit_classifier:main", - "bugbug-classify-comment = scripts.comment_classifier:main", "bugbug-classify-bug = scripts.bug_classifier:main", "bugbug-regressor-finder = scripts.regressor_finder:main", "bugbug-retrieve-training-metrics = scripts.retrieve_training_metrics:main", From 5490d0122c3a1dea2ec92966e414d4074a5ffa0c Mon Sep 17 00:00:00 2001 From: John P Date: Tue, 26 Mar 2024 17:11:08 +0300 Subject: [PATCH 58/59] Use(bug,comment) tuple --- bugbug/comment_features.py | 71 +++++++++++++++++++++++------------- bugbug/model.py | 2 +- bugbug/models/spamcomment.py | 3 +- 3 files changed, 48 insertions(+), 28 deletions(-) diff --git a/bugbug/comment_features.py b/bugbug/comment_features.py index 678dbeb3a8..481cf9c960 100644 --- a/bugbug/comment_features.py +++ b/bugbug/comment_features.py @@ -4,7 +4,6 @@ # You can obtain one at http://mozilla.org/MPL/2.0/. import sys -from collections import defaultdict from datetime import datetime from typing import Any @@ -41,18 +40,16 @@ def fit(self, x, y=None): return self - def transform(self, comments): - comments_iter = iter(comments()) + def transform(self, items): + items_iter = iter(items()) - commenter_experience_map = defaultdict(int) - - def apply_transform(comment): + def apply_transform(item): + bug, comment = item data = {} for feature_extractor in self.feature_extractors: res = feature_extractor( - comment, - commenter_experience=commenter_experience_map[comment["creator"]], + item, ) if hasattr(feature_extractor, "name"): @@ -75,8 +72,6 @@ def apply_transform(comment): data[feature_extractor_name] = res - commenter_experience_map[comment["creator"]] += 1 - comment_text = comment["text"] for cleanup_function in self.cleanup_functions: comment_text = cleanup_function(comment_text) @@ -86,14 +81,16 @@ def apply_transform(comment): "comment_text": comment_text, } - return pd.DataFrame(apply_transform(comment) for comment in comments_iter) + return pd.DataFrame(apply_transform(item) for item in items_iter) -class CommenterExperience(CommentFeature): - name = "# of Comments made by Commenter in the past" +class BugTitleLength(CommentFeature): + name = "Length of Bug's Title" - def __call__(self, comment, commenter_experience, **kwargs): - return commenter_experience + def __call__(self, item, **kwargs): + bug, _ = item + + return len(bug["title"]) class NumberOfLinks(CommentFeature): @@ -102,7 +99,9 @@ class NumberOfLinks(CommentFeature): def __init__(self, domains_to_ignore=set()): self.known_domains = domains_to_ignore - def __call__(self, comment, **kwargs) -> Any: + def __call__(self, item, **kwargs) -> Any: + _, comment = item + domains = extract_urls_and_domains(comment["text"])["domains"] return { @@ -117,14 +116,18 @@ def __call__(self, comment, **kwargs) -> Any: class CharacterCount(CommentFeature): name = "# of Characters in the Comment" - def __call__(self, comment, **kwargs): + def __call__(self, item, **kwargs): + _, comment = item + return len(comment["text"]) class WordCount(CommentFeature): name = "# of Words in the Comment" - def __call__(self, comment, **kwargs): + def __call__(self, item, **kwargs): + _, comment = item + return len(comment["text"].split()) @@ -134,7 +137,9 @@ class UnknownLinkAtBeginning(CommentFeature): def __init__(self, domains_to_ignore=set()): self.known_domains = domains_to_ignore - def __call__(self, comment, **kwargs): + def __call__(self, item, **kwargs): + _, comment = item + urls = extract_urls_and_domains(comment["text"], self.known_domains)["urls"] words = comment["text"].split() @@ -147,7 +152,9 @@ class UnknownLinkAtEnd(CommentFeature): def __init__(self, domains_to_ignore=set()): self.known_domains = domains_to_ignore - def __call__(self, comment, **kwargs): + def __call__(self, item, **kwargs): + _, comment = item + urls = extract_urls_and_domains(comment["text"], self.known_domains)["urls"] words = comment["text"].split() @@ -157,7 +164,9 @@ def __call__(self, comment, **kwargs): class HourOfDay(CommentFeature): name = "Hour of the Day (0-23)" - def __call__(self, comment, **kwargs): + def __call__(self, item, **kwargs): + _, comment = item + comment_time = datetime.strptime(comment["creation_time"], "%Y-%m-%dT%H:%M:%SZ") return comment_time.hour @@ -165,7 +174,9 @@ def __call__(self, comment, **kwargs): class Weekday(CommentFeature): name = "Day of the Week (0-7)" - def __call__(self, comment, **kwargs): + def __call__(self, item, **kwargs): + _, comment = item + comment_time = datetime.strptime(comment["creation_time"], "%Y-%m-%dT%H:%M:%SZ") return comment_time.isoweekday() @@ -173,7 +184,9 @@ def __call__(self, comment, **kwargs): class PostedOnWeekend(CommentFeature): name = "Comment was Posted on Weekend" - def __call__(self, comment, **kwargs): + def __call__(self, item, **kwargs): + _, comment = item + comment_time = datetime.strptime(comment["creation_time"], "%Y-%m-%dT%H:%M:%SZ") return comment_time.isoweekday() in (5, 6) @@ -181,7 +194,9 @@ def __call__(self, comment, **kwargs): class DayOfYear(CommentFeature): name = "Day of the Year (0-366)" - def __call__(self, comment, **kwargs): + def __call__(self, item, **kwargs): + _, comment = item + comment_time = datetime.strptime(comment["creation_time"], "%Y-%m-%dT%H:%M:%SZ") return comment_time.timetuple().tm_yday @@ -189,7 +204,9 @@ def __call__(self, comment, **kwargs): class WeekOfYear(CommentFeature): name = "Week of Year" - def __call__(self, comment, **kwargs): + def __call__(self, item, **kwargs): + _, comment = item + comment_time = datetime.strptime(comment["creation_time"], "%Y-%m-%dT%H:%M:%SZ") return comment_time.isocalendar()[1] @@ -200,8 +217,10 @@ class CommentTags(CommentFeature): def __init__(self, to_ignore=set()): self.to_ignore = to_ignore - def __call__(self, comment, **kwargs): + def __call__(self, item, **kwargs): + _, comment = item tags = [] + for tag in comment["tags"]: if tag in self.to_ignore: continue diff --git a/bugbug/model.py b/bugbug/model.py index 4e76eed1d2..a0699979f6 100644 --- a/bugbug/model.py +++ b/bugbug/model.py @@ -836,4 +836,4 @@ def items_gen(self, classes): if comment["id"] not in classes: continue - yield comment, classes[comment_id] + yield (bug, comment), classes[comment_id] diff --git a/bugbug/models/spamcomment.py b/bugbug/models/spamcomment.py index b3415afed9..68f7bbef6d 100644 --- a/bugbug/models/spamcomment.py +++ b/bugbug/models/spamcomment.py @@ -42,6 +42,7 @@ def __init__(self, lemmatization=True): comment_features.Weekday(), comment_features.UnknownLinkAtBeginning(SAFE_DOMAINS), comment_features.UnknownLinkAtEnd(SAFE_DOMAINS), + comment_features.BugTitleLength(), ] cleanup_functions = [ @@ -158,7 +159,7 @@ def get_labels(self): def items_gen(self, classes): return ( - (comment, classes[comment["id"]]) + ((bug, comment), classes[comment["id"]]) for bug in bugzilla.get_bugs() for comment in bug["comments"] if comment["id"] in classes From d95852dff2300dc4efcda68bfa042ac0b68080ea Mon Sep 17 00:00:00 2001 From: John P Date: Tue, 2 Apr 2024 17:27:38 +0300 Subject: [PATCH 59/59] Include BugvsCreator Feature --- bugbug/comment_features.py | 10 +++++----- bugbug/models/spamcomment.py | 3 +-- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/bugbug/comment_features.py b/bugbug/comment_features.py index 481cf9c960..814b9109c5 100644 --- a/bugbug/comment_features.py +++ b/bugbug/comment_features.py @@ -84,13 +84,13 @@ def apply_transform(item): return pd.DataFrame(apply_transform(item) for item in items_iter) -class BugTitleLength(CommentFeature): - name = "Length of Bug's Title" +class CommentCreatorIsBugCreator(CommentFeature): + name = "Comment Creator is the Bug Creator" - def __call__(self, item, **kwargs): - bug, _ = item + def __call__(self, item, **kwargs) -> Any: + bug, comment = item - return len(bug["title"]) + return bug["creator"] == comment["creator"] class NumberOfLinks(CommentFeature): diff --git a/bugbug/models/spamcomment.py b/bugbug/models/spamcomment.py index 68f7bbef6d..6d24a07ed9 100644 --- a/bugbug/models/spamcomment.py +++ b/bugbug/models/spamcomment.py @@ -42,7 +42,7 @@ def __init__(self, lemmatization=True): comment_features.Weekday(), comment_features.UnknownLinkAtBeginning(SAFE_DOMAINS), comment_features.UnknownLinkAtEnd(SAFE_DOMAINS), - comment_features.BugTitleLength(), + comment_features.CommentCreatorIsBugCreator(), ] cleanup_functions = [ @@ -134,7 +134,6 @@ def get_labels(self): [ comment["count"] == "0", self.is_safe_comment(comment), - comment["creator"] == bug["creator"], "[redacted -" in comment["text"], "(comment removed)" in comment["text"], ]