From 9adba3f671f1228dad6252f93bf794ef9311ee1a Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Thu, 20 Jun 2024 12:17:36 -0400 Subject: [PATCH 01/41] Added file path feature extraction --- bugbug/bug_features.py | 16 ++++++++++++++++ bugbug/models/component.py | 1 + 2 files changed, 17 insertions(+) diff --git a/bugbug/bug_features.py b/bugbug/bug_features.py index 48bb2d649c..1c10c94aed 100644 --- a/bugbug/bug_features.py +++ b/bugbug/bug_features.py @@ -905,3 +905,19 @@ class BugType(SingleBugFeature): def __call__(self, bug, **kwargs): return bug["type"] + +class ExtractFilePaths(SingleBugFeature): + """Extract file paths (partial and full) from bug data.""" + + name = "Extract File Paths" + + def __call__(self, bug: bugzilla.BugDict, **kwargs) -> list[str]: + text = ( + bug.get("summary", "") + + " " + + " ".join(comment["text"] for comment in bug.get("comments", [])) + ) + + paths = re.findall(r"\b[\w/\\]+/\w+\.\w+\b", text) + + return sorted(set(paths)) diff --git a/bugbug/models/component.py b/bugbug/models/component.py index 39d9116fe3..304c7d8df1 100644 --- a/bugbug/models/component.py +++ b/bugbug/models/component.py @@ -84,6 +84,7 @@ def __init__(self, lemmatization=False): bug_features.Whiteboard(), bug_features.Patches(), bug_features.Landings(), + bug_features.ExtractFilePaths(), ] cleanup_functions = [ From 4a4318113925401f74716c2fce71eb9efeaa177e Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Thu, 20 Jun 2024 15:07:08 -0400 Subject: [PATCH 02/41] Improved regex for splitting filepaths --- bugbug/bug_features.py | 37 ++++++++++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/bugbug/bug_features.py b/bugbug/bug_features.py index 1c10c94aed..d3d1b3bd56 100644 --- a/bugbug/bug_features.py +++ b/bugbug/bug_features.py @@ -905,6 +905,22 @@ class BugType(SingleBugFeature): def __call__(self, bug, **kwargs): return bug["type"] +# class ExtractFilePaths(SingleBugFeature): +# """Extract file paths (partial and full) from bug data.""" + +# name = "Extract File Paths" + +# def __call__(self, bug: bugzilla.BugDict, **kwargs) -> list[str]: +# text = ( +# bug.get("summary", "") +# + " " +# + " ".join(comment["text"] for comment in bug.get("comments", [])) +# ) + +# paths = re.findall(r"\b[\w/\\]+/\w+\.\w+\b", text) + +# return sorted(set(paths)) + class ExtractFilePaths(SingleBugFeature): """Extract file paths (partial and full) from bug data.""" @@ -912,12 +928,19 @@ class ExtractFilePaths(SingleBugFeature): name = "Extract File Paths" def __call__(self, bug: bugzilla.BugDict, **kwargs) -> list[str]: - text = ( - bug.get("summary", "") - + " " - + " ".join(comment["text"] for comment in bug.get("comments", [])) - ) + text = bug.get("summary", "") + " " + bug["comments"][0]["text"] + + regex = r"\b[a-zA-Z0-9_/.\-]+(?:\.(?!com|org|net|edu|gov)[a-zA-Z]{2,4})\b" + + file_paths = re.findall(regex, text) + file_paths = [path for path in file_paths if "/" in path or "." in path] + + all_sub_paths = [] + for path in file_paths: + parts = path.split("/") + sub_paths = ["/".join(parts[: i + 1]) for i in range(len(parts))] - paths = re.findall(r"\b[\w/\\]+/\w+\.\w+\b", text) + all_sub_paths.extend(sub_paths) + all_sub_paths.extend(parts) - return sorted(set(paths)) + return sorted(set(all_sub_paths)) From bdbf73d10c11259b55add013f15c0bc58c0ac269 Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Tue, 25 Jun 2024 14:07:48 -0400 Subject: [PATCH 03/41] Moved `/` and `.` check into regex --- bugbug/bug_features.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/bugbug/bug_features.py b/bugbug/bug_features.py index d3d1b3bd56..c3bdd842f4 100644 --- a/bugbug/bug_features.py +++ b/bugbug/bug_features.py @@ -930,10 +930,9 @@ class ExtractFilePaths(SingleBugFeature): def __call__(self, bug: bugzilla.BugDict, **kwargs) -> list[str]: text = bug.get("summary", "") + " " + bug["comments"][0]["text"] - regex = r"\b[a-zA-Z0-9_/.\-]+(?:\.(?!com|org|net|edu|gov)[a-zA-Z]{2,4})\b" + regex = r"\b[a-zA-Z0-9_/.\-]+(?:/[a-zA-Z0-9_/.\-]+)*\.(?!com|org|net|edu|gov)[a-zA-Z]{2,4}\b" file_paths = re.findall(regex, text) - file_paths = [path for path in file_paths if "/" in path or "." in path] all_sub_paths = [] for path in file_paths: @@ -943,4 +942,4 @@ def __call__(self, bug: bugzilla.BugDict, **kwargs) -> list[str]: all_sub_paths.extend(sub_paths) all_sub_paths.extend(parts) - return sorted(set(all_sub_paths)) + return all_sub_paths From 9ce9d7698625e04ec852f67167a8a45bcee1458d Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Tue, 25 Jun 2024 14:13:08 -0400 Subject: [PATCH 04/41] Moved regex initialization to the constructor --- bugbug/bug_features.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/bugbug/bug_features.py b/bugbug/bug_features.py index c3bdd842f4..a35c54f400 100644 --- a/bugbug/bug_features.py +++ b/bugbug/bug_features.py @@ -926,13 +926,12 @@ class ExtractFilePaths(SingleBugFeature): """Extract file paths (partial and full) from bug data.""" name = "Extract File Paths" + regex = r"\b[a-zA-Z0-9_/.\-]+(?:/[a-zA-Z0-9_/.\-]+)*\.(?!com|org|net|edu|gov)[a-zA-Z]{2,4}\b" def __call__(self, bug: bugzilla.BugDict, **kwargs) -> list[str]: text = bug.get("summary", "") + " " + bug["comments"][0]["text"] - regex = r"\b[a-zA-Z0-9_/.\-]+(?:/[a-zA-Z0-9_/.\-]+)*\.(?!com|org|net|edu|gov)[a-zA-Z]{2,4}\b" - - file_paths = re.findall(regex, text) + file_paths = re.findall(self.regex, text) all_sub_paths = [] for path in file_paths: From 802c9bd859293ad136fdfc3e4facb72a3cf0e486 Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Tue, 2 Jul 2024 10:28:28 -0400 Subject: [PATCH 05/41] Compiled regex using `re.compile` and move to constructor --- bugbug/bug_features.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/bugbug/bug_features.py b/bugbug/bug_features.py index a35c54f400..d894455ed7 100644 --- a/bugbug/bug_features.py +++ b/bugbug/bug_features.py @@ -926,12 +926,16 @@ class ExtractFilePaths(SingleBugFeature): """Extract file paths (partial and full) from bug data.""" name = "Extract File Paths" - regex = r"\b[a-zA-Z0-9_/.\-]+(?:/[a-zA-Z0-9_/.\-]+)*\.(?!com|org|net|edu|gov)[a-zA-Z]{2,4}\b" + + def __init__(self): + self.regex = re.compile( + r"\b[a-zA-Z0-9_/.\-]+(?:/[a-zA-Z0-9_/.\-]+)*\.(?!com|org|net|edu|gov)[a-zA-Z]{2,4}\b" + ) def __call__(self, bug: bugzilla.BugDict, **kwargs) -> list[str]: text = bug.get("summary", "") + " " + bug["comments"][0]["text"] - file_paths = re.findall(self.regex, text) + file_paths = self.regex.findall(text) all_sub_paths = [] for path in file_paths: From 165e68a39aad13a330588a7ebd2f2cb04920ea11 Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Thu, 18 Jul 2024 11:02:12 -0400 Subject: [PATCH 06/41] Renamed `ExtractFilePaths` to `FilePaths` --- bugbug/bug_features.py | 2 +- bugbug/models/component.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bugbug/bug_features.py b/bugbug/bug_features.py index d894455ed7..39d4245e39 100644 --- a/bugbug/bug_features.py +++ b/bugbug/bug_features.py @@ -922,7 +922,7 @@ def __call__(self, bug, **kwargs): # return sorted(set(paths)) -class ExtractFilePaths(SingleBugFeature): +class FilePaths(SingleBugFeature): """Extract file paths (partial and full) from bug data.""" name = "Extract File Paths" diff --git a/bugbug/models/component.py b/bugbug/models/component.py index 304c7d8df1..773d46de0d 100644 --- a/bugbug/models/component.py +++ b/bugbug/models/component.py @@ -84,7 +84,7 @@ def __init__(self, lemmatization=False): bug_features.Whiteboard(), bug_features.Patches(), bug_features.Landings(), - bug_features.ExtractFilePaths(), + bug_features.FilePaths(), ] cleanup_functions = [ From 49601ebb591d1af1fe49dc5ff98daa6defe622d8 Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Thu, 18 Jul 2024 11:09:03 -0400 Subject: [PATCH 07/41] Removed temporary list creation in `FilePaths` feature --- bugbug/bug_features.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/bugbug/bug_features.py b/bugbug/bug_features.py index 39d4245e39..f5aee26163 100644 --- a/bugbug/bug_features.py +++ b/bugbug/bug_features.py @@ -937,12 +937,11 @@ def __call__(self, bug: bugzilla.BugDict, **kwargs) -> list[str]: file_paths = self.regex.findall(text) - all_sub_paths = [] + all_sub_paths: list[str] = [] + for path in file_paths: parts = path.split("/") - sub_paths = ["/".join(parts[: i + 1]) for i in range(len(parts))] - - all_sub_paths.extend(sub_paths) + all_sub_paths.extend("/".join(parts[: i + 1]) for i in range(len(parts))) all_sub_paths.extend(parts) return all_sub_paths From 91663074f4edafa94257f9305a74ca1926fad803 Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Thu, 18 Jul 2024 16:31:48 -0400 Subject: [PATCH 08/41] Fixed `FilePaths` feature to accurately extract file paths and avoid URLs --- bugbug/bug_features.py | 201 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 197 insertions(+), 4 deletions(-) diff --git a/bugbug/bug_features.py b/bugbug/bug_features.py index f5aee26163..36328b5e30 100644 --- a/bugbug/bug_features.py +++ b/bugbug/bug_features.py @@ -3,6 +3,7 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this file, # You can obtain one at http://mozilla.org/MPL/2.0/. +import mimetypes import re import sys from collections import defaultdict @@ -928,20 +929,212 @@ class FilePaths(SingleBugFeature): name = "Extract File Paths" def __init__(self): - self.regex = re.compile( - r"\b[a-zA-Z0-9_/.\-]+(?:/[a-zA-Z0-9_/.\-]+)*\.(?!com|org|net|edu|gov)[a-zA-Z]{2,4}\b" + self.non_file_path_keywords = [ + "http://", + "https://", + "www.", + ".com", + ".org", + ".net", + ".edu", + ".gov", + "@", + ] + + self.valid_extensions = set( + ext.lstrip(".") for ext in mimetypes.types_map.keys() ) + common_extensions = { + "cpp", + "c", + "h", + "hpp", + "py", + "java", + "class", + "js", + "jsx", + "ts", + "tsx", + "html", + "css", + "scss", + "rb", + "php", + "swift", + "kt", + "kts", + "go", + "rs", + "lua", + "sh", + "bat", + "ps1", + "pl", + "pm", + "r", + "m", + "mm", + "h", + "cs", + "vb", + "fs", + "fsharp", + "sln", + "vbproj", + "csproj", + "xaml", + "xml", + "json", + "yaml", + "yml", + "ini", + "cfg", + "conf", + "toml", + "md", + "txt", + "log", + "sql", + "db", + "sqlite", + "db3", + "csv", + "tsv", + "xlsx", + "xls", + "doc", + "docx", + "ppt", + "pptx", + "pdf", + "tex", + "bib", + "dvi", + "epub", + "mobi", + "azw3", + "rtf", + "odt", + "ods", + "odp", + "key", + "numbers", + "pages", + "gpx", + "kml", + "kmz", + "svg", + "ai", + "psd", + "xcf", + "png", + "jpg", + "jpeg", + "gif", + "bmp", + "tiff", + "ico", + "webp", + "mp3", + "wav", + "flac", + "ogg", + "m4a", + "wma", + "aac", + "mp4", + "mkv", + "avi", + "mov", + "flv", + "wmv", + "webm", + "3gp", + "m4v", + "mpg", + "mpeg", + "swf", + "fla", + "iso", + "img", + "dmg", + "zip", + "rar", + "tar", + "gz", + "bz2", + "xz", + "7z", + "s7z", + "rpm", + "deb", + "pkg", + "bin", + "exe", + "dll", + "so", + "o", + "a", + "lib", + "out", + "class", + "jar", + "war", + "ear", + "apk", + "ipa", + "plist", + "cfg", + "ini", + "conf", + "properties", + "yaml", + "yml", + "env", + "rc", + "htaccess", + "gitignore", + "dockerfile", + "makefile", + "gradle", + "pom", + "bat", + "cmd", + "vbs", + } + self.valid_extensions.update(common_extensions) + self.valid_extensions = sorted(self.valid_extensions, key=len, reverse=True) + + def remove_urls(self, text: str) -> str: + """Remove URLs and strings containing specific domain extensions from the given text.""" + for keyword in self.non_file_path_keywords: + if keyword in text: + text = re.sub(r"\S*" + re.escape(keyword) + r"\S*", "", text) + return text + + def extract_valid_file_path(self, word: str) -> str: + """Extract the valid file path from the word if it contains a valid extension.""" + for ext in self.valid_extensions: + if f".{ext}" in word: + ext_index = word.find(f".{ext}") + prefix = word[:ext_index] + prefix = re.sub(r"[^a-zA-Z0-9_\-./]", "", prefix) + return prefix + f".{ext}" + return "" def __call__(self, bug: bugzilla.BugDict, **kwargs) -> list[str]: text = bug.get("summary", "") + " " + bug["comments"][0]["text"] + text = self.remove_urls(text) - file_paths = self.regex.findall(text) + words = text.split() + file_paths = [self.extract_valid_file_path(word) for word in words] + file_paths = [path for path in file_paths if path] all_sub_paths: list[str] = [] for path in file_paths: parts = path.split("/") all_sub_paths.extend("/".join(parts[: i + 1]) for i in range(len(parts))) - all_sub_paths.extend(parts) return all_sub_paths From 8250977ebf5bc303ff7139ee2eb23f0d71db440a Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Fri, 19 Jul 2024 14:02:12 -0400 Subject: [PATCH 09/41] Revised version to extract only file paths with valid file extensions --- bugbug/bug_features.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/bugbug/bug_features.py b/bugbug/bug_features.py index 36328b5e30..5578a04c5c 100644 --- a/bugbug/bug_features.py +++ b/bugbug/bug_features.py @@ -1116,11 +1116,14 @@ def remove_urls(self, text: str) -> str: def extract_valid_file_path(self, word: str) -> str: """Extract the valid file path from the word if it contains a valid extension.""" for ext in self.valid_extensions: - if f".{ext}" in word: - ext_index = word.find(f".{ext}") + ext_pattern = re.compile(rf"\.{ext}(?![a-zA-Z])") + match = ext_pattern.search(word) + if match: + ext_index = match.start() prefix = word[:ext_index] - prefix = re.sub(r"[^a-zA-Z0-9_\-./]", "", prefix) - return prefix + f".{ext}" + alphanumeric_sequence = re.findall(r"[a-zA-Z0-9/_]+", prefix) + if alphanumeric_sequence: + return alphanumeric_sequence[-1] + f".{ext}" return "" def __call__(self, bug: bugzilla.BugDict, **kwargs) -> list[str]: @@ -1131,10 +1134,12 @@ def __call__(self, bug: bugzilla.BugDict, **kwargs) -> list[str]: file_paths = [self.extract_valid_file_path(word) for word in words] file_paths = [path for path in file_paths if path] - all_sub_paths: list[str] = [] + all_paths: list[str] = [] for path in file_paths: parts = path.split("/") - all_sub_paths.extend("/".join(parts[: i + 1]) for i in range(len(parts))) - - return all_sub_paths + all_paths.extend(part for part in parts if part) + all_paths.extend( + subpath for i in range(len(parts)) if (subpath := "/".join(parts[i:])) + ) + return all_paths From d1eb019ae3ed3a906f72bd6fd4819e1b855ca647 Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Mon, 22 Jul 2024 12:50:12 -0400 Subject: [PATCH 10/41] Initialized and compiled regex in compiler --- bugbug/bug_features.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/bugbug/bug_features.py b/bugbug/bug_features.py index 5578a04c5c..6cdc5d55bc 100644 --- a/bugbug/bug_features.py +++ b/bugbug/bug_features.py @@ -1105,25 +1105,28 @@ def __init__(self): } self.valid_extensions.update(common_extensions) self.valid_extensions = sorted(self.valid_extensions, key=len, reverse=True) + extension_pattern_string = "|".join( + re.escape(ext) for ext in self.valid_extensions + ) + self.extension_pattern = re.compile( + rf"\.({extension_pattern_string})(?![a-zA-Z])" + ) def remove_urls(self, text: str) -> str: - """Remove URLs and strings containing specific domain extensions from the given text.""" for keyword in self.non_file_path_keywords: if keyword in text: text = re.sub(r"\S*" + re.escape(keyword) + r"\S*", "", text) return text def extract_valid_file_path(self, word: str) -> str: - """Extract the valid file path from the word if it contains a valid extension.""" - for ext in self.valid_extensions: - ext_pattern = re.compile(rf"\.{ext}(?![a-zA-Z])") - match = ext_pattern.search(word) - if match: - ext_index = match.start() - prefix = word[:ext_index] - alphanumeric_sequence = re.findall(r"[a-zA-Z0-9/_]+", prefix) - if alphanumeric_sequence: - return alphanumeric_sequence[-1] + f".{ext}" + match = self.extension_pattern.search(word) + if match: + ext = match.group(1) + ext_index = match.start() + prefix = word[:ext_index] + alphanumeric_sequence = re.findall(r"[a-zA-Z0-9/_]+", prefix) + if alphanumeric_sequence: + return f"{alphanumeric_sequence[-1]}.{ext}" return "" def __call__(self, bug: bugzilla.BugDict, **kwargs) -> list[str]: From f0f111866d9f0e8261956cab659df810751f38d7 Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Wed, 24 Jul 2024 11:37:50 -0400 Subject: [PATCH 11/41] Made code more Pythonic --- bugbug/bug_features.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/bugbug/bug_features.py b/bugbug/bug_features.py index 6cdc5d55bc..42a1cd6f66 100644 --- a/bugbug/bug_features.py +++ b/bugbug/bug_features.py @@ -1130,19 +1130,26 @@ def extract_valid_file_path(self, word: str) -> str: return "" def __call__(self, bug: bugzilla.BugDict, **kwargs) -> list[str]: - text = bug.get("summary", "") + " " + bug["comments"][0]["text"] - text = self.remove_urls(text) + text = self.remove_urls( + bug.get("summary", "") + " " + bug["comments"][0]["text"] + ) - words = text.split() - file_paths = [self.extract_valid_file_path(word) for word in words] - file_paths = [path for path in file_paths if path] + file_paths = [ + path + for word in text.split() + if (path := self.extract_valid_file_path(word)) + ] all_paths: list[str] = [] for path in file_paths: parts = path.split("/") all_paths.extend(part for part in parts if part) - all_paths.extend( - subpath for i in range(len(parts)) if (subpath := "/".join(parts[i:])) - ) + if len(parts) > 1: + all_paths.extend( + subpath + for i in range(len(parts)) + if (subpath := "/".join(parts[i:])) + ) + return all_paths From fad7daec3351cc248e25504626f56c429dca76fa Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Wed, 24 Jul 2024 11:51:03 -0400 Subject: [PATCH 12/41] Added 2 tests for `FilePaths` feature --- tests/fixtures/bug_features/file_paths.json | 16 ++ tests/test_bug_features.py | 179 ++++++++++++++++++++ 2 files changed, 195 insertions(+) create mode 100644 tests/fixtures/bug_features/file_paths.json diff --git a/tests/fixtures/bug_features/file_paths.json b/tests/fixtures/bug_features/file_paths.json new file mode 100644 index 0000000000..09499ff7f8 --- /dev/null +++ b/tests/fixtures/bug_features/file_paths.json @@ -0,0 +1,16 @@ +{ + "summary": " cleanup", + "comments": [ + { + "text": "Fix for\n{{ \nanthonyd (2 warnings)\n1.\tlayout/html/base/src/nsFrame.cpp:3879 (See build log excerpt)\n\t`nsIFrame*thisBlock' might be used uninitialized in this function\n2.\tlayout/html/base/src/nsFrame.cpp:3908 (See build log excerpt)\n\t`nsIFrame*thisBlock' might be used uninitialized in this function\n}} (NB: lines should be lines - 2, due to checkin \"in progress\")\nwill be included." + } + ] +} +{ + "summary": "Spidermonkey regression causes treehydra trunk to fail 6 tests", + "comments": [ + { + "text": "Today I'm trying to get callgraph stuff hooked into dxr, and I'm unable to get a working treehydra. I've updated tried updating just dehydra, then I updated gcc w/plugins using the new stuff in the patch queue, and it doesn't matter. Running make check_treehydra fails like this:\n\nTest Failure: \n Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_bad3.js locks_bad3.cc\n Failure msg: Expected 'locks_bad3.cc:10: error: precondition not met' in error output; not found. stderr:../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0: #0: Error(\"No case_val in this lazy object\")\n../libs/treehydra.js:12: #1: unhandledLazyProperty(\"case_val\")\n../libs/unstable/esp.js:481: #2: ()\n./esp_lock.js:41: #3: process_tree([object GCCNode])\n\nTest Failure: \n Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_good.js locks_good.cc\n Failure msg: Expected no error output, got error output :../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0: #0: Error(\"No case_val in this lazy object\")\n../libs/treehydra.js:12: #1: unhandledLazyProperty(\"case_val\")\n../libs/unstable/esp.js:481: #2: ()\n./esp_lock.js:41: #3: process_tree([object GCCNode])\n\nTest Failure: \n Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_good2.js locks_good2.cc\n Failure msg: Expected no error output, got error output :../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0: #0: Error(\"No case_val in this lazy object\")\n../libs/treehydra.js:12: #1: unhandledLazyProperty(\"case_val\")\n../libs/unstable/esp.js:481: #2: ()\n./esp_lock.js:41: #3: process_tree([object GCCNode])\n\nTest Failure: \n Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_bad4.js locks_bad4.cc\n Failure msg: Expected 'locks_bad4.cc:13: error: precondition not met' in error output; not found. stderr:../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0: #0: Error(\"No case_val in this lazy object\")\n../libs/treehydra.js:12: #1: unhandledLazyProperty(\"case_val\")\n../libs/unstable/esp.js:481: #2: ()\n./esp_lock.js:41: #3: process_tree([object GCCNode])\n\nTest Failure: \n Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_bad2.js locks_bad2.cc\n Failure msg: Expected 'locks_bad2.cc:12: error: precondition not met' in error output; not found. stderr:../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0: #0: Error(\"No case_val in this lazy object\")\n../libs/treehydra.js:12: #1: unhandledLazyProperty(\"case_val\")\n../libs/unstable/esp.js:481: #2: ()\n./esp_lock.js:41: #3: process_tree([object GCCNode])\n\nTest Failure: \n Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_bad1.js locks_bad1.cc\n Failure msg: Expected 'locks_bad1.cc:11: error: precondition not met' in error output; not found. stderr:../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0: #0: Error(\"No case_val in this lazy object\")\n../libs/treehydra.js:12: #1: unhandledLazyProperty(\"case_val\")\n../libs/unstable/esp.js:481: #2: ()\n./esp_lock.js:41: #3: process_tree([object GCCNode])\n\n\nUnit Test Suite Summary:\n 32 passed\n 6 failed\n 0 error(s)\nmake[1]: *** [check_treehydra] Error 1\nmake[1]: Leaving directory `/var/www/html/dxr/tools/gcc-dehydra/dehydra/test'\nmake: *** [check] Error 2" + } + ] +} \ No newline at end of file diff --git a/tests/test_bug_features.py b/tests/test_bug_features.py index f659411157..9e9399a499 100644 --- a/tests/test_bug_features.py +++ b/tests/test_bug_features.py @@ -17,6 +17,7 @@ CommentLength, Component, DeltaNightlyRequestMerge, + FilePaths, HasCrashSignature, HasCVEInAlias, HasGithubURL, @@ -187,3 +188,181 @@ def test_BugTypes(read) -> None: BugTypes, [["performance"], ["memory"], ["power"], ["security"], ["crash"]], ) + + +def test_FilePaths(read): + read( + "file_paths.json", + FilePaths, + [ + [ + "nsFrame.cpp", + "layout", + "html", + "base", + "src", + "nsFrame.cpp", + "layout/html/base/src/nsFrame.cpp", + "html/base/src/nsFrame.cpp", + "base/src/nsFrame.cpp", + "src/nsFrame.cpp", + "nsFrame.cpp", + "layout", + "html", + "base", + "src", + "nsFrame.cpp", + "layout/html/base/src/nsFrame.cpp", + "html/base/src/nsFrame.cpp", + "base/src/nsFrame.cpp", + "src/nsFrame.cpp", + "nsFrame.cpp", + ], + [ + "gcc_treehydra.so", + "/gcc_treehydra.so", + "gcc_treehydra.so", + "test_locks_bad3.js", + "libs", + "treehydra.js", + "/libs/treehydra.js", + "libs/treehydra.js", + "treehydra.js", + "libs", + "treehydra.js", + "/libs/treehydra.js", + "libs/treehydra.js", + "treehydra.js", + "libs", + "unstable", + "esp.js", + "/libs/unstable/esp.js", + "libs/unstable/esp.js", + "unstable/esp.js", + "esp.js", + "esp_lock.js", + "/esp_lock.js", + "esp_lock.js", + "gcc_treehydra.so", + "/gcc_treehydra.so", + "gcc_treehydra.so", + "test_locks_good.js", + "libs", + "treehydra.js", + "/libs/treehydra.js", + "libs/treehydra.js", + "treehydra.js", + "libs", + "treehydra.js", + "/libs/treehydra.js", + "libs/treehydra.js", + "treehydra.js", + "libs", + "unstable", + "esp.js", + "/libs/unstable/esp.js", + "libs/unstable/esp.js", + "unstable/esp.js", + "esp.js", + "esp_lock.js", + "/esp_lock.js", + "esp_lock.js", + "gcc_treehydra.so", + "/gcc_treehydra.so", + "gcc_treehydra.so", + "test_locks_good2.js", + "libs", + "treehydra.js", + "/libs/treehydra.js", + "libs/treehydra.js", + "treehydra.js", + "libs", + "treehydra.js", + "/libs/treehydra.js", + "libs/treehydra.js", + "treehydra.js", + "libs", + "unstable", + "esp.js", + "/libs/unstable/esp.js", + "libs/unstable/esp.js", + "unstable/esp.js", + "esp.js", + "esp_lock.js", + "/esp_lock.js", + "esp_lock.js", + "gcc_treehydra.so", + "/gcc_treehydra.so", + "gcc_treehydra.so", + "test_locks_bad4.js", + "libs", + "treehydra.js", + "/libs/treehydra.js", + "libs/treehydra.js", + "treehydra.js", + "libs", + "treehydra.js", + "/libs/treehydra.js", + "libs/treehydra.js", + "treehydra.js", + "libs", + "unstable", + "esp.js", + "/libs/unstable/esp.js", + "libs/unstable/esp.js", + "unstable/esp.js", + "esp.js", + "esp_lock.js", + "/esp_lock.js", + "esp_lock.js", + "gcc_treehydra.so", + "/gcc_treehydra.so", + "gcc_treehydra.so", + "test_locks_bad2.js", + "libs", + "treehydra.js", + "/libs/treehydra.js", + "libs/treehydra.js", + "treehydra.js", + "libs", + "treehydra.js", + "/libs/treehydra.js", + "libs/treehydra.js", + "treehydra.js", + "libs", + "unstable", + "esp.js", + "/libs/unstable/esp.js", + "libs/unstable/esp.js", + "unstable/esp.js", + "esp.js", + "esp_lock.js", + "/esp_lock.js", + "esp_lock.js", + "gcc_treehydra.so", + "/gcc_treehydra.so", + "gcc_treehydra.so", + "test_locks_bad1.js", + "libs", + "treehydra.js", + "/libs/treehydra.js", + "libs/treehydra.js", + "treehydra.js", + "libs", + "treehydra.js", + "/libs/treehydra.js", + "libs/treehydra.js", + "treehydra.js", + "libs", + "unstable", + "esp.js", + "/libs/unstable/esp.js", + "libs/unstable/esp.js", + "unstable/esp.js", + "esp.js", + "esp_lock.js", + "/esp_lock.js", + "esp_lock.js", + ], + ], + ) From fdd6123a732d59b45a78fe8b1b5529a8f7eae419 Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Mon, 29 Jul 2024 09:30:09 -0400 Subject: [PATCH 13/41] Restructured `file_paths.json` --- tests/fixtures/bug_features/file_paths.json | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/tests/fixtures/bug_features/file_paths.json b/tests/fixtures/bug_features/file_paths.json index 09499ff7f8..65c9a7f73e 100644 --- a/tests/fixtures/bug_features/file_paths.json +++ b/tests/fixtures/bug_features/file_paths.json @@ -1,16 +1,2 @@ -{ - "summary": " cleanup", - "comments": [ - { - "text": "Fix for\n{{ \nanthonyd (2 warnings)\n1.\tlayout/html/base/src/nsFrame.cpp:3879 (See build log excerpt)\n\t`nsIFrame*thisBlock' might be used uninitialized in this function\n2.\tlayout/html/base/src/nsFrame.cpp:3908 (See build log excerpt)\n\t`nsIFrame*thisBlock' might be used uninitialized in this function\n}} (NB: lines should be lines - 2, due to checkin \"in progress\")\nwill be included." - } - ] -} -{ - "summary": "Spidermonkey regression causes treehydra trunk to fail 6 tests", - "comments": [ - { - "text": "Today I'm trying to get callgraph stuff hooked into dxr, and I'm unable to get a working treehydra. I've updated tried updating just dehydra, then I updated gcc w/plugins using the new stuff in the patch queue, and it doesn't matter. Running make check_treehydra fails like this:\n\nTest Failure: \n Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_bad3.js locks_bad3.cc\n Failure msg: Expected 'locks_bad3.cc:10: error: precondition not met' in error output; not found. stderr:../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0: #0: Error(\"No case_val in this lazy object\")\n../libs/treehydra.js:12: #1: unhandledLazyProperty(\"case_val\")\n../libs/unstable/esp.js:481: #2: ()\n./esp_lock.js:41: #3: process_tree([object GCCNode])\n\nTest Failure: \n Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_good.js locks_good.cc\n Failure msg: Expected no error output, got error output :../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0: #0: Error(\"No case_val in this lazy object\")\n../libs/treehydra.js:12: #1: unhandledLazyProperty(\"case_val\")\n../libs/unstable/esp.js:481: #2: ()\n./esp_lock.js:41: #3: process_tree([object GCCNode])\n\nTest Failure: \n Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_good2.js locks_good2.cc\n Failure msg: Expected no error output, got error output :../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0: #0: Error(\"No case_val in this lazy object\")\n../libs/treehydra.js:12: #1: unhandledLazyProperty(\"case_val\")\n../libs/unstable/esp.js:481: #2: ()\n./esp_lock.js:41: #3: process_tree([object GCCNode])\n\nTest Failure: \n Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_bad4.js locks_bad4.cc\n Failure msg: Expected 'locks_bad4.cc:13: error: precondition not met' in error output; not found. stderr:../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0: #0: Error(\"No case_val in this lazy object\")\n../libs/treehydra.js:12: #1: unhandledLazyProperty(\"case_val\")\n../libs/unstable/esp.js:481: #2: ()\n./esp_lock.js:41: #3: process_tree([object GCCNode])\n\nTest Failure: \n Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_bad2.js locks_bad2.cc\n Failure msg: Expected 'locks_bad2.cc:12: error: precondition not met' in error output; not found. stderr:../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0: #0: Error(\"No case_val in this lazy object\")\n../libs/treehydra.js:12: #1: unhandledLazyProperty(\"case_val\")\n../libs/unstable/esp.js:481: #2: ()\n./esp_lock.js:41: #3: process_tree([object GCCNode])\n\nTest Failure: \n Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_bad1.js locks_bad1.cc\n Failure msg: Expected 'locks_bad1.cc:11: error: precondition not met' in error output; not found. stderr:../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0: #0: Error(\"No case_val in this lazy object\")\n../libs/treehydra.js:12: #1: unhandledLazyProperty(\"case_val\")\n../libs/unstable/esp.js:481: #2: ()\n./esp_lock.js:41: #3: process_tree([object GCCNode])\n\n\nUnit Test Suite Summary:\n 32 passed\n 6 failed\n 0 error(s)\nmake[1]: *** [check_treehydra] Error 1\nmake[1]: Leaving directory `/var/www/html/dxr/tools/gcc-dehydra/dehydra/test'\nmake: *** [check] Error 2" - } - ] -} \ No newline at end of file +{"summary": " cleanup", "comments": [{"text": "Fix for\n{{ \nanthonyd (2 warnings)\n1.\tlayout/html/base/src/nsFrame.cpp:3879 (See build log excerpt)\n\t`nsIFrame*thisBlock' might be used uninitialized in this function\n2.\tlayout/html/base/src/nsFrame.cpp:3908 (See build log excerpt)\n\t`nsIFrame*thisBlock' might be used uninitialized in this function\n}} (NB: lines should be lines - 2, due to checkin \"in progress\")\nwill be included."}]} +{"summary": "Spidermonkey regression causes treehydra trunk to fail 6 tests", "comments": [{"text": "Today I'm trying to get callgraph stuff hooked into dxr, and I'm unable to get a working treehydra. I've updated tried updating just dehydra, then I updated gcc w/plugins using the new stuff in the patch queue, and it doesn't matter. Running make check_treehydra fails like this:\n\nTest Failure: \n Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_bad3.js locks_bad3.cc\n Failure msg: Expected 'locks_bad3.cc:10: error: precondition not met' in error output; not found. stderr:../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0: #0: Error(\"No case_val in this lazy object\")\n../libs/treehydra.js:12: #1: unhandledLazyProperty(\"case_val\")\n../libs/unstable/esp.js:481: #2: ()\n./esp_lock.js:41: #3: process_tree([object GCCNode])\n\nTest Failure: \n Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_good.js locks_good.cc\n Failure msg: Expected no error output, got error output :../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0: #0: Error(\"No case_val in this lazy object\")\n../libs/treehydra.js:12: #1: unhandledLazyProperty(\"case_val\")\n../libs/unstable/esp.js:481: #2: ()\n./esp_lock.js:41: #3: process_tree([object GCCNode])\n\nTest Failure: \n Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_good2.js locks_good2.cc\n Failure msg: Expected no error output, got error output :../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0: #0: Error(\"No case_val in this lazy object\")\n../libs/treehydra.js:12: #1: unhandledLazyProperty(\"case_val\")\n../libs/unstable/esp.js:481: #2: ()\n./esp_lock.js:41: #3: process_tree([object GCCNode])\n\nTest Failure: \n Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_bad4.js locks_bad4.cc\n Failure msg: Expected 'locks_bad4.cc:13: error: precondition not met' in error output; not found. stderr:../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0: #0: Error(\"No case_val in this lazy object\")\n../libs/treehydra.js:12: #1: unhandledLazyProperty(\"case_val\")\n../libs/unstable/esp.js:481: #2: ()\n./esp_lock.js:41: #3: process_tree([object GCCNode])\n\nTest Failure: \n Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_bad2.js locks_bad2.cc\n Failure msg: Expected 'locks_bad2.cc:12: error: precondition not met' in error output; not found. stderr:../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0: #0: Error(\"No case_val in this lazy object\")\n../libs/treehydra.js:12: #1: unhandledLazyProperty(\"case_val\")\n../libs/unstable/esp.js:481: #2: ()\n./esp_lock.js:41: #3: process_tree([object GCCNode])\n\nTest Failure: \n Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_bad1.js locks_bad1.cc\n Failure msg: Expected 'locks_bad1.cc:11: error: precondition not met' in error output; not found. stderr:../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0: #0: Error(\"No case_val in this lazy object\")\n../libs/treehydra.js:12: #1: unhandledLazyProperty(\"case_val\")\n../libs/unstable/esp.js:481: #2: ()\n./esp_lock.js:41: #3: process_tree([object GCCNode])\n\n\nUnit Test Suite Summary:\n 32 passed\n 6 failed\n 0 error(s)\nmake[1]: *** [check_treehydra] Error 1\nmake[1]: Leaving directory `/var/www/html/dxr/tools/gcc-dehydra/dehydra/test'\nmake: *** [check] Error 2"}]} \ No newline at end of file From 82a038a4a23c7c81dcb3519f4d138f6f8055be44 Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Wed, 4 Sep 2024 11:43:52 -0400 Subject: [PATCH 14/41] Replaced hard-coding programming language extensions with `pygment.lexers` --- bugbug/bug_features.py | 167 ++--------------------------------------- 1 file changed, 7 insertions(+), 160 deletions(-) diff --git a/bugbug/bug_features.py b/bugbug/bug_features.py index 42a1cd6f66..c975c48002 100644 --- a/bugbug/bug_features.py +++ b/bugbug/bug_features.py @@ -15,6 +15,7 @@ from dateutil import parser from libmozdata import versions from libmozdata.bugzilla import Bugzilla +from pygments.lexers import get_all_lexers from sklearn.base import BaseEstimator, TransformerMixin from bugbug import bug_snapshot, bugzilla, repository, utils @@ -944,167 +945,13 @@ def __init__(self): self.valid_extensions = set( ext.lstrip(".") for ext in mimetypes.types_map.keys() ) - common_extensions = { - "cpp", - "c", - "h", - "hpp", - "py", - "java", - "class", - "js", - "jsx", - "ts", - "tsx", - "html", - "css", - "scss", - "rb", - "php", - "swift", - "kt", - "kts", - "go", - "rs", - "lua", - "sh", - "bat", - "ps1", - "pl", - "pm", - "r", - "m", - "mm", - "h", - "cs", - "vb", - "fs", - "fsharp", - "sln", - "vbproj", - "csproj", - "xaml", - "xml", - "json", - "yaml", - "yml", - "ini", - "cfg", - "conf", - "toml", - "md", - "txt", - "log", - "sql", - "db", - "sqlite", - "db3", - "csv", - "tsv", - "xlsx", - "xls", - "doc", - "docx", - "ppt", - "pptx", - "pdf", - "tex", - "bib", - "dvi", - "epub", - "mobi", - "azw3", - "rtf", - "odt", - "ods", - "odp", - "key", - "numbers", - "pages", - "gpx", - "kml", - "kmz", - "svg", - "ai", - "psd", - "xcf", - "png", - "jpg", - "jpeg", - "gif", - "bmp", - "tiff", - "ico", - "webp", - "mp3", - "wav", - "flac", - "ogg", - "m4a", - "wma", - "aac", - "mp4", - "mkv", - "avi", - "mov", - "flv", - "wmv", - "webm", - "3gp", - "m4v", - "mpg", - "mpeg", - "swf", - "fla", - "iso", - "img", - "dmg", - "zip", - "rar", - "tar", - "gz", - "bz2", - "xz", - "7z", - "s7z", - "rpm", - "deb", - "pkg", - "bin", - "exe", - "dll", - "so", - "o", - "a", - "lib", - "out", - "class", - "jar", - "war", - "ear", - "apk", - "ipa", - "plist", - "cfg", - "ini", - "conf", - "properties", - "yaml", - "yml", - "env", - "rc", - "htaccess", - "gitignore", - "dockerfile", - "makefile", - "gradle", - "pom", - "bat", - "cmd", - "vbs", - } - self.valid_extensions.update(common_extensions) + + lexers = get_all_lexers() + lexer_extensions = set(ext[2:] for lexer in lexers for ext in lexer[2]) + + self.valid_extensions.update(lexer_extensions) self.valid_extensions = sorted(self.valid_extensions, key=len, reverse=True) + extension_pattern_string = "|".join( re.escape(ext) for ext in self.valid_extensions ) From bfd633417a0ce2a46a352f4128007f5f41e83215 Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Wed, 4 Sep 2024 13:52:02 -0400 Subject: [PATCH 15/41] Fixed tests to reflect more file extensions --- tests/test_bug_features.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/test_bug_features.py b/tests/test_bug_features.py index 9e9399a499..3ea547a47b 100644 --- a/tests/test_bug_features.py +++ b/tests/test_bug_features.py @@ -223,6 +223,8 @@ def test_FilePaths(read): "/gcc_treehydra.so", "gcc_treehydra.so", "test_locks_bad3.js", + "locks_bad3.cc", + "locks_bad3.cc", "libs", "treehydra.js", "/libs/treehydra.js", @@ -247,6 +249,7 @@ def test_FilePaths(read): "/gcc_treehydra.so", "gcc_treehydra.so", "test_locks_good.js", + "locks_good.cc", "libs", "treehydra.js", "/libs/treehydra.js", @@ -271,6 +274,7 @@ def test_FilePaths(read): "/gcc_treehydra.so", "gcc_treehydra.so", "test_locks_good2.js", + "locks_good2.cc", "libs", "treehydra.js", "/libs/treehydra.js", @@ -295,6 +299,8 @@ def test_FilePaths(read): "/gcc_treehydra.so", "gcc_treehydra.so", "test_locks_bad4.js", + "locks_bad4.cc", + "locks_bad4.cc", "libs", "treehydra.js", "/libs/treehydra.js", @@ -319,6 +325,8 @@ def test_FilePaths(read): "/gcc_treehydra.so", "gcc_treehydra.so", "test_locks_bad2.js", + "locks_bad2.cc", + "locks_bad2.cc", "libs", "treehydra.js", "/libs/treehydra.js", @@ -343,6 +351,8 @@ def test_FilePaths(read): "/gcc_treehydra.so", "gcc_treehydra.so", "test_locks_bad1.js", + "locks_bad1.cc", + "locks_bad1.cc", "libs", "treehydra.js", "/libs/treehydra.js", From 5f4ec728f8e594df61f2304bdfb0ae6fd703388e Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Wed, 4 Sep 2024 15:42:18 -0400 Subject: [PATCH 16/41] Added `publicsuffix2` to generate list of tlds --- bugbug/bug_features.py | 16 +++++++++++----- requirements.txt | 3 +++ 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/bugbug/bug_features.py b/bugbug/bug_features.py index c975c48002..058811d7c9 100644 --- a/bugbug/bug_features.py +++ b/bugbug/bug_features.py @@ -15,6 +15,7 @@ from dateutil import parser from libmozdata import versions from libmozdata.bugzilla import Bugzilla +from publicsuffix2 import PublicSuffixList from pygments.lexers import get_all_lexers from sklearn.base import BaseEstimator, TransformerMixin @@ -934,11 +935,6 @@ def __init__(self): "http://", "https://", "www.", - ".com", - ".org", - ".net", - ".edu", - ".gov", "@", ] @@ -955,10 +951,20 @@ def __init__(self): extension_pattern_string = "|".join( re.escape(ext) for ext in self.valid_extensions ) + self.extension_pattern = re.compile( rf"\.({extension_pattern_string})(?![a-zA-Z])" ) + psl = PublicSuffixList() + tlds = set() + for entry in psl.tlds: + if "." not in entry: + tlds.add("." + entry) + + filtered_tlds = [tld for tld in tlds if tld[1:] not in self.valid_extensions] + self.non_file_path_keywords.extend(filtered_tlds) + def remove_urls(self, text: str) -> str: for keyword in self.non_file_path_keywords: if keyword in text: diff --git a/requirements.txt b/requirements.txt index 283f41f69f..aba9ceef58 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,7 +21,10 @@ numpy==1.26.4 orjson==3.10.9 ortools==9.11.4210 pandas==2.2.3 +<<<<<<< HEAD psutil==6.1.0 +publicsuffix2==2.20191221 +>>>>>>> 0f27b3cb (Added `publicsuffix2` to generate list of tlds) pydriller==1.12 pyOpenSSL>=0.14 # Could not find a version that satisfies the requirement pyOpenSSL>=0.14; extra == "security" (from requests[security]>=2.7.0->libmozdata==0.1.43) python-dateutil==2.9.0.post0 From 1f3921bc7d13e6f0bd81784417ad6fd6a4ec9b1b Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Fri, 18 Oct 2024 10:52:57 -0400 Subject: [PATCH 17/41] Replaced all addition strings with f-strings --- bugbug/bug_features.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/bugbug/bug_features.py b/bugbug/bug_features.py index 058811d7c9..b98eb50d00 100644 --- a/bugbug/bug_features.py +++ b/bugbug/bug_features.py @@ -957,10 +957,7 @@ def __init__(self): ) psl = PublicSuffixList() - tlds = set() - for entry in psl.tlds: - if "." not in entry: - tlds.add("." + entry) + tlds = set(tlds=set(f".{entry}" for entry in psl.tlds if "." not in entry)) filtered_tlds = [tld for tld in tlds if tld[1:] not in self.valid_extensions] self.non_file_path_keywords.extend(filtered_tlds) @@ -968,7 +965,7 @@ def __init__(self): def remove_urls(self, text: str) -> str: for keyword in self.non_file_path_keywords: if keyword in text: - text = re.sub(r"\S*" + re.escape(keyword) + r"\S*", "", text) + text = re.sub(rf"\S*{re.escape(keyword)}\S*", "", text) return text def extract_valid_file_path(self, word: str) -> str: @@ -984,7 +981,7 @@ def extract_valid_file_path(self, word: str) -> str: def __call__(self, bug: bugzilla.BugDict, **kwargs) -> list[str]: text = self.remove_urls( - bug.get("summary", "") + " " + bug["comments"][0]["text"] + f"{bug.get('summary', '')} {bug['comments'][0]['text']}" ) file_paths = [ From 0cf24822b3fd08ff88e970b3cf7bd6aa9fc410ab Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Fri, 18 Oct 2024 11:47:11 -0400 Subject: [PATCH 18/41] Removed fixture from file path test --- tests/test_bug_features.py | 391 +++++++++++++++++++------------------ 1 file changed, 202 insertions(+), 189 deletions(-) diff --git a/tests/test_bug_features.py b/tests/test_bug_features.py index 3ea547a47b..ea3a5486b0 100644 --- a/tests/test_bug_features.py +++ b/tests/test_bug_features.py @@ -39,15 +39,26 @@ @pytest.fixture def read(get_fixture_path): - def _read(path, feature_extractor_class, expected_results): + def _read( + path, + feature_extractor_class, + expected_results, + use_inline_data=False, + inline_data="", + ): feature_extractor = feature_extractor_class() - path = get_fixture_path(os.path.join("bug_features", path)) + if use_inline_data: + results = ( + feature_extractor(json.loads(line)) for line in inline_data.splitlines() + ) + else: + path = get_fixture_path(os.path.join("bug_features", path)) + with open(path, "r") as f: + results = (feature_extractor(json.loads(line)) for line in f) - with open(path, "r") as f: - results = (feature_extractor(json.loads(line)) for line in f) - for result, expected_result in zip(results, expected_results): - assert result == expected_result + for result, expected_result in zip(results, expected_results): + assert result == expected_result return _read @@ -191,188 +202,190 @@ def test_BugTypes(read) -> None: def test_FilePaths(read): - read( - "file_paths.json", - FilePaths, + inline_data = """{"summary": " cleanup", "comments": [{"text": "Fix for\n{{ \nanthonyd (2 warnings)\n1.\tlayout/html/base/src/nsFrame.cpp:3879 (See build log excerpt)\n\t`nsIFrame*thisBlock' might be used uninitialized in this function\n2.\tlayout/html/base/src/nsFrame.cpp:3908 (See build log excerpt)\n\t`nsIFrame*thisBlock' might be used uninitialized in this function\n}} (NB: lines should be lines - 2, due to checkin \"in progress\")\nwill be included."}]} + {"summary": "Spidermonkey regression causes treehydra trunk to fail 6 tests", "comments": [{"text": "Today I'm trying to get callgraph stuff hooked into dxr, and I'm unable to get a working treehydra. I've updated tried updating just dehydra, then I updated gcc w/plugins using the new stuff in the patch queue, and it doesn't matter. Running make check_treehydra fails like this:\n\nTest Failure: \n Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_bad3.js locks_bad3.cc\n Failure msg: Expected 'locks_bad3.cc:10: error: precondition not met' in error output; not found. stderr:../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0: #0: Error(\"No case_val in this lazy object\")\n../libs/treehydra.js:12: #1: unhandledLazyProperty(\"case_val\")\n../libs/unstable/esp.js:481: #2: ()\n./esp_lock.js:41: #3: process_tree([object GCCNode])\n\nTest Failure: \n Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_good.js locks_good.cc\n Failure msg: Expected no error output, got error output :../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0: #0: Error(\"No case_val in this lazy object\")\n../libs/treehydra.js:12: #1: unhandledLazyProperty(\"case_val\")\n../libs/unstable/esp.js:481: #2: ()\n./esp_lock.js:41: #3: process_tree([object GCCNode])\n\nTest Failure: \n Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_good2.js locks_good2.cc\n Failure msg: Expected no error output, got error output :../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0: #0: Error(\"No case_val in this lazy object\")\n../libs/treehydra.js:12: #1: unhandledLazyProperty(\"case_val\")\n../libs/unstable/esp.js:481: #2: ()\n./esp_lock.js:41: #3: process_tree([object GCCNode])\n\nTest Failure: \n Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_bad4.js locks_bad4.cc\n Failure msg: Expected 'locks_bad4.cc:13: error: precondition not met' in error output; not found. stderr:../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0: #0: Error(\"No case_val in this lazy object\")\n../libs/treehydra.js:12: #1: unhandledLazyProperty(\"case_val\")\n../libs/unstable/esp.js:481: #2: ()\n./esp_lock.js:41: #3: process_tree([object GCCNode])\n\nTest Failure: \n Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_bad2.js locks_bad2.cc\n Failure msg: Expected 'locks_bad2.cc:12: error: precondition not met' in error output; not found. stderr:../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0: #0: Error(\"No case_val in this lazy object\")\n../libs/treehydra.js:12: #1: unhandledLazyProperty(\"case_val\")\n../libs/unstable/esp.js:481: #2: ()\n./esp_lock.js:41: #3: process_tree([object GCCNode])\n\nTest Failure: \n Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_bad1.js locks_bad1.cc\n Failure msg: Expected 'locks_bad1.cc:11: error: precondition not met' in error output; not found. stderr:../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0: #0: Error(\"No case_val in this lazy object\")\n../libs/treehydra.js:12: #1: unhandledLazyProperty(\"case_val\")\n../libs/unstable/esp.js:481: #2: ()\n./esp_lock.js:41: #3: process_tree([object GCCNode])\n\n\nUnit Test Suite Summary:\n 32 passed\n 6 failed\n 0 error(s)\nmake[1]: *** [check_treehydra] Error 1\nmake[1]: Leaving directory `/var/www/html/dxr/tools/gcc-dehydra/dehydra/test'\nmake: *** [check] Error 2"}]} + """ + + expected_results = [ [ - [ - "nsFrame.cpp", - "layout", - "html", - "base", - "src", - "nsFrame.cpp", - "layout/html/base/src/nsFrame.cpp", - "html/base/src/nsFrame.cpp", - "base/src/nsFrame.cpp", - "src/nsFrame.cpp", - "nsFrame.cpp", - "layout", - "html", - "base", - "src", - "nsFrame.cpp", - "layout/html/base/src/nsFrame.cpp", - "html/base/src/nsFrame.cpp", - "base/src/nsFrame.cpp", - "src/nsFrame.cpp", - "nsFrame.cpp", - ], - [ - "gcc_treehydra.so", - "/gcc_treehydra.so", - "gcc_treehydra.so", - "test_locks_bad3.js", - "locks_bad3.cc", - "locks_bad3.cc", - "libs", - "treehydra.js", - "/libs/treehydra.js", - "libs/treehydra.js", - "treehydra.js", - "libs", - "treehydra.js", - "/libs/treehydra.js", - "libs/treehydra.js", - "treehydra.js", - "libs", - "unstable", - "esp.js", - "/libs/unstable/esp.js", - "libs/unstable/esp.js", - "unstable/esp.js", - "esp.js", - "esp_lock.js", - "/esp_lock.js", - "esp_lock.js", - "gcc_treehydra.so", - "/gcc_treehydra.so", - "gcc_treehydra.so", - "test_locks_good.js", - "locks_good.cc", - "libs", - "treehydra.js", - "/libs/treehydra.js", - "libs/treehydra.js", - "treehydra.js", - "libs", - "treehydra.js", - "/libs/treehydra.js", - "libs/treehydra.js", - "treehydra.js", - "libs", - "unstable", - "esp.js", - "/libs/unstable/esp.js", - "libs/unstable/esp.js", - "unstable/esp.js", - "esp.js", - "esp_lock.js", - "/esp_lock.js", - "esp_lock.js", - "gcc_treehydra.so", - "/gcc_treehydra.so", - "gcc_treehydra.so", - "test_locks_good2.js", - "locks_good2.cc", - "libs", - "treehydra.js", - "/libs/treehydra.js", - "libs/treehydra.js", - "treehydra.js", - "libs", - "treehydra.js", - "/libs/treehydra.js", - "libs/treehydra.js", - "treehydra.js", - "libs", - "unstable", - "esp.js", - "/libs/unstable/esp.js", - "libs/unstable/esp.js", - "unstable/esp.js", - "esp.js", - "esp_lock.js", - "/esp_lock.js", - "esp_lock.js", - "gcc_treehydra.so", - "/gcc_treehydra.so", - "gcc_treehydra.so", - "test_locks_bad4.js", - "locks_bad4.cc", - "locks_bad4.cc", - "libs", - "treehydra.js", - "/libs/treehydra.js", - "libs/treehydra.js", - "treehydra.js", - "libs", - "treehydra.js", - "/libs/treehydra.js", - "libs/treehydra.js", - "treehydra.js", - "libs", - "unstable", - "esp.js", - "/libs/unstable/esp.js", - "libs/unstable/esp.js", - "unstable/esp.js", - "esp.js", - "esp_lock.js", - "/esp_lock.js", - "esp_lock.js", - "gcc_treehydra.so", - "/gcc_treehydra.so", - "gcc_treehydra.so", - "test_locks_bad2.js", - "locks_bad2.cc", - "locks_bad2.cc", - "libs", - "treehydra.js", - "/libs/treehydra.js", - "libs/treehydra.js", - "treehydra.js", - "libs", - "treehydra.js", - "/libs/treehydra.js", - "libs/treehydra.js", - "treehydra.js", - "libs", - "unstable", - "esp.js", - "/libs/unstable/esp.js", - "libs/unstable/esp.js", - "unstable/esp.js", - "esp.js", - "esp_lock.js", - "/esp_lock.js", - "esp_lock.js", - "gcc_treehydra.so", - "/gcc_treehydra.so", - "gcc_treehydra.so", - "test_locks_bad1.js", - "locks_bad1.cc", - "locks_bad1.cc", - "libs", - "treehydra.js", - "/libs/treehydra.js", - "libs/treehydra.js", - "treehydra.js", - "libs", - "treehydra.js", - "/libs/treehydra.js", - "libs/treehydra.js", - "treehydra.js", - "libs", - "unstable", - "esp.js", - "/libs/unstable/esp.js", - "libs/unstable/esp.js", - "unstable/esp.js", - "esp.js", - "esp_lock.js", - "/esp_lock.js", - "esp_lock.js", - ], + "nsFrame.cpp", + "layout", + "html", + "base", + "src", + "nsFrame.cpp", + "layout/html/base/src/nsFrame.cpp", + "html/base/src/nsFrame.cpp", + "base/src/nsFrame.cpp", + "src/nsFrame.cpp", + "nsFrame.cpp", + "layout", + "html", + "base", + "src", + "nsFrame.cpp", + "layout/html/base/src/nsFrame.cpp", + "html/base/src/nsFrame.cpp", + "base/src/nsFrame.cpp", + "src/nsFrame.cpp", + "nsFrame.cpp", ], - ) + [ + "gcc_treehydra.so", + "/gcc_treehydra.so", + "gcc_treehydra.so", + "test_locks_bad3.js", + "locks_bad3.cc", + "locks_bad3.cc", + "libs", + "treehydra.js", + "/libs/treehydra.js", + "libs/treehydra.js", + "treehydra.js", + "libs", + "treehydra.js", + "/libs/treehydra.js", + "libs/treehydra.js", + "treehydra.js", + "libs", + "unstable", + "esp.js", + "/libs/unstable/esp.js", + "libs/unstable/esp.js", + "unstable/esp.js", + "esp.js", + "esp_lock.js", + "/esp_lock.js", + "esp_lock.js", + "gcc_treehydra.so", + "/gcc_treehydra.so", + "gcc_treehydra.so", + "test_locks_good.js", + "locks_good.cc", + "libs", + "treehydra.js", + "/libs/treehydra.js", + "libs/treehydra.js", + "treehydra.js", + "libs", + "treehydra.js", + "/libs/treehydra.js", + "libs/treehydra.js", + "treehydra.js", + "libs", + "unstable", + "esp.js", + "/libs/unstable/esp.js", + "libs/unstable/esp.js", + "unstable/esp.js", + "esp.js", + "esp_lock.js", + "/esp_lock.js", + "esp_lock.js", + "gcc_treehydra.so", + "/gcc_treehydra.so", + "gcc_treehydra.so", + "test_locks_good2.js", + "locks_good2.cc", + "libs", + "treehydra.js", + "/libs/treehydra.js", + "libs/treehydra.js", + "treehydra.js", + "libs", + "treehydra.js", + "/libs/treehydra.js", + "libs/treehydra.js", + "treehydra.js", + "libs", + "unstable", + "esp.js", + "/libs/unstable/esp.js", + "libs/unstable/esp.js", + "unstable/esp.js", + "esp.js", + "esp_lock.js", + "/esp_lock.js", + "esp_lock.js", + "gcc_treehydra.so", + "/gcc_treehydra.so", + "gcc_treehydra.so", + "test_locks_bad4.js", + "locks_bad4.cc", + "locks_bad4.cc", + "libs", + "treehydra.js", + "/libs/treehydra.js", + "libs/treehydra.js", + "treehydra.js", + "libs", + "treehydra.js", + "/libs/treehydra.js", + "libs/treehydra.js", + "treehydra.js", + "libs", + "unstable", + "esp.js", + "/libs/unstable/esp.js", + "libs/unstable/esp.js", + "unstable/esp.js", + "esp.js", + "esp_lock.js", + "/esp_lock.js", + "esp_lock.js", + "gcc_treehydra.so", + "/gcc_treehydra.so", + "gcc_treehydra.so", + "test_locks_bad2.js", + "locks_bad2.cc", + "locks_bad2.cc", + "libs", + "treehydra.js", + "/libs/treehydra.js", + "libs/treehydra.js", + "treehydra.js", + "libs", + "treehydra.js", + "/libs/treehydra.js", + "libs/treehydra.js", + "treehydra.js", + "libs", + "unstable", + "esp.js", + "/libs/unstable/esp.js", + "libs/unstable/esp.js", + "unstable/esp.js", + "esp.js", + "esp_lock.js", + "/esp_lock.js", + "esp_lock.js", + "gcc_treehydra.so", + "/gcc_treehydra.so", + "gcc_treehydra.so", + "test_locks_bad1.js", + "locks_bad1.cc", + "locks_bad1.cc", + "libs", + "treehydra.js", + "/libs/treehydra.js", + "libs/treehydra.js", + "treehydra.js", + "libs", + "treehydra.js", + "/libs/treehydra.js", + "libs/treehydra.js", + "treehydra.js", + "libs", + "unstable", + "esp.js", + "/libs/unstable/esp.js", + "libs/unstable/esp.js", + "unstable/esp.js", + "esp.js", + "esp_lock.js", + "/esp_lock.js", + "esp_lock.js", + ], + ] + + read("", FilePaths, expected_results, use_inline_data=True, inline_data=inline_data) From deadc1811370625d42dbe6b53244aba3d23af051 Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Fri, 18 Oct 2024 11:59:04 -0400 Subject: [PATCH 19/41] Fixed test errors --- bugbug/bug_features.py | 2 +- tests/test_bug_features.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/bugbug/bug_features.py b/bugbug/bug_features.py index b98eb50d00..721de5298b 100644 --- a/bugbug/bug_features.py +++ b/bugbug/bug_features.py @@ -957,7 +957,7 @@ def __init__(self): ) psl = PublicSuffixList() - tlds = set(tlds=set(f".{entry}" for entry in psl.tlds if "." not in entry)) + tlds = set(f".{entry}" for entry in psl.tlds if "." not in entry) filtered_tlds = [tld for tld in tlds if tld[1:] not in self.valid_extensions] self.non_file_path_keywords.extend(filtered_tlds) diff --git a/tests/test_bug_features.py b/tests/test_bug_features.py index ea3a5486b0..3be991c39c 100644 --- a/tests/test_bug_features.py +++ b/tests/test_bug_features.py @@ -55,7 +55,9 @@ def _read( else: path = get_fixture_path(os.path.join("bug_features", path)) with open(path, "r") as f: - results = (feature_extractor(json.loads(line)) for line in f) + lines = f.readlines() + + results = (feature_extractor(json.loads(line)) for line in lines) for result, expected_result in zip(results, expected_results): assert result == expected_result From a3f0ede7b541112356c6301801b558f00ecff4ee Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Fri, 18 Oct 2024 12:16:23 -0400 Subject: [PATCH 20/41] Added custom delimiter --- tests/test_bug_features.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_bug_features.py b/tests/test_bug_features.py index 3be991c39c..f27a94c0da 100644 --- a/tests/test_bug_features.py +++ b/tests/test_bug_features.py @@ -50,7 +50,8 @@ def _read( if use_inline_data: results = ( - feature_extractor(json.loads(line)) for line in inline_data.splitlines() + feature_extractor(json.loads(line.strip())) + for line in inline_data.split("###") ) else: path = get_fixture_path(os.path.join("bug_features", path)) @@ -205,6 +206,7 @@ def test_BugTypes(read) -> None: def test_FilePaths(read): inline_data = """{"summary": " cleanup", "comments": [{"text": "Fix for\n{{ \nanthonyd (2 warnings)\n1.\tlayout/html/base/src/nsFrame.cpp:3879 (See build log excerpt)\n\t`nsIFrame*thisBlock' might be used uninitialized in this function\n2.\tlayout/html/base/src/nsFrame.cpp:3908 (See build log excerpt)\n\t`nsIFrame*thisBlock' might be used uninitialized in this function\n}} (NB: lines should be lines - 2, due to checkin \"in progress\")\nwill be included."}]} + ### {"summary": "Spidermonkey regression causes treehydra trunk to fail 6 tests", "comments": [{"text": "Today I'm trying to get callgraph stuff hooked into dxr, and I'm unable to get a working treehydra. I've updated tried updating just dehydra, then I updated gcc w/plugins using the new stuff in the patch queue, and it doesn't matter. Running make check_treehydra fails like this:\n\nTest Failure: \n Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_bad3.js locks_bad3.cc\n Failure msg: Expected 'locks_bad3.cc:10: error: precondition not met' in error output; not found. stderr:../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0: #0: Error(\"No case_val in this lazy object\")\n../libs/treehydra.js:12: #1: unhandledLazyProperty(\"case_val\")\n../libs/unstable/esp.js:481: #2: ()\n./esp_lock.js:41: #3: process_tree([object GCCNode])\n\nTest Failure: \n Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_good.js locks_good.cc\n Failure msg: Expected no error output, got error output :../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0: #0: Error(\"No case_val in this lazy object\")\n../libs/treehydra.js:12: #1: unhandledLazyProperty(\"case_val\")\n../libs/unstable/esp.js:481: #2: ()\n./esp_lock.js:41: #3: process_tree([object GCCNode])\n\nTest Failure: \n Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_good2.js locks_good2.cc\n Failure msg: Expected no error output, got error output :../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0: #0: Error(\"No case_val in this lazy object\")\n../libs/treehydra.js:12: #1: unhandledLazyProperty(\"case_val\")\n../libs/unstable/esp.js:481: #2: ()\n./esp_lock.js:41: #3: process_tree([object GCCNode])\n\nTest Failure: \n Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_bad4.js locks_bad4.cc\n Failure msg: Expected 'locks_bad4.cc:13: error: precondition not met' in error output; not found. stderr:../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0: #0: Error(\"No case_val in this lazy object\")\n../libs/treehydra.js:12: #1: unhandledLazyProperty(\"case_val\")\n../libs/unstable/esp.js:481: #2: ()\n./esp_lock.js:41: #3: process_tree([object GCCNode])\n\nTest Failure: \n Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_bad2.js locks_bad2.cc\n Failure msg: Expected 'locks_bad2.cc:12: error: precondition not met' in error output; not found. stderr:../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0: #0: Error(\"No case_val in this lazy object\")\n../libs/treehydra.js:12: #1: unhandledLazyProperty(\"case_val\")\n../libs/unstable/esp.js:481: #2: ()\n./esp_lock.js:41: #3: process_tree([object GCCNode])\n\nTest Failure: \n Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_bad1.js locks_bad1.cc\n Failure msg: Expected 'locks_bad1.cc:11: error: precondition not met' in error output; not found. stderr:../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0: #0: Error(\"No case_val in this lazy object\")\n../libs/treehydra.js:12: #1: unhandledLazyProperty(\"case_val\")\n../libs/unstable/esp.js:481: #2: ()\n./esp_lock.js:41: #3: process_tree([object GCCNode])\n\n\nUnit Test Suite Summary:\n 32 passed\n 6 failed\n 0 error(s)\nmake[1]: *** [check_treehydra] Error 1\nmake[1]: Leaving directory `/var/www/html/dxr/tools/gcc-dehydra/dehydra/test'\nmake: *** [check] Error 2"}]} """ From a96a1e29a9c0e74c436ba04119710d4b016b80dd Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Fri, 18 Oct 2024 12:49:39 -0400 Subject: [PATCH 21/41] Fixed json input --- tests/test_bug_features.py | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/tests/test_bug_features.py b/tests/test_bug_features.py index f27a94c0da..117536a13a 100644 --- a/tests/test_bug_features.py +++ b/tests/test_bug_features.py @@ -49,10 +49,7 @@ def _read( feature_extractor = feature_extractor_class() if use_inline_data: - results = ( - feature_extractor(json.loads(line.strip())) - for line in inline_data.split("###") - ) + results = (feature_extractor(item) for item in inline_data) else: path = get_fixture_path(os.path.join("bug_features", path)) with open(path, "r") as f: @@ -205,11 +202,24 @@ def test_BugTypes(read) -> None: def test_FilePaths(read): - inline_data = """{"summary": " cleanup", "comments": [{"text": "Fix for\n{{ \nanthonyd (2 warnings)\n1.\tlayout/html/base/src/nsFrame.cpp:3879 (See build log excerpt)\n\t`nsIFrame*thisBlock' might be used uninitialized in this function\n2.\tlayout/html/base/src/nsFrame.cpp:3908 (See build log excerpt)\n\t`nsIFrame*thisBlock' might be used uninitialized in this function\n}} (NB: lines should be lines - 2, due to checkin \"in progress\")\nwill be included."}]} - ### - {"summary": "Spidermonkey regression causes treehydra trunk to fail 6 tests", "comments": [{"text": "Today I'm trying to get callgraph stuff hooked into dxr, and I'm unable to get a working treehydra. I've updated tried updating just dehydra, then I updated gcc w/plugins using the new stuff in the patch queue, and it doesn't matter. Running make check_treehydra fails like this:\n\nTest Failure: \n Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_bad3.js locks_bad3.cc\n Failure msg: Expected 'locks_bad3.cc:10: error: precondition not met' in error output; not found. stderr:../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0: #0: Error(\"No case_val in this lazy object\")\n../libs/treehydra.js:12: #1: unhandledLazyProperty(\"case_val\")\n../libs/unstable/esp.js:481: #2: ()\n./esp_lock.js:41: #3: process_tree([object GCCNode])\n\nTest Failure: \n Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_good.js locks_good.cc\n Failure msg: Expected no error output, got error output :../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0: #0: Error(\"No case_val in this lazy object\")\n../libs/treehydra.js:12: #1: unhandledLazyProperty(\"case_val\")\n../libs/unstable/esp.js:481: #2: ()\n./esp_lock.js:41: #3: process_tree([object GCCNode])\n\nTest Failure: \n Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_good2.js locks_good2.cc\n Failure msg: Expected no error output, got error output :../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0: #0: Error(\"No case_val in this lazy object\")\n../libs/treehydra.js:12: #1: unhandledLazyProperty(\"case_val\")\n../libs/unstable/esp.js:481: #2: ()\n./esp_lock.js:41: #3: process_tree([object GCCNode])\n\nTest Failure: \n Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_bad4.js locks_bad4.cc\n Failure msg: Expected 'locks_bad4.cc:13: error: precondition not met' in error output; not found. stderr:../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0: #0: Error(\"No case_val in this lazy object\")\n../libs/treehydra.js:12: #1: unhandledLazyProperty(\"case_val\")\n../libs/unstable/esp.js:481: #2: ()\n./esp_lock.js:41: #3: process_tree([object GCCNode])\n\nTest Failure: \n Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_bad2.js locks_bad2.cc\n Failure msg: Expected 'locks_bad2.cc:12: error: precondition not met' in error output; not found. stderr:../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0: #0: Error(\"No case_val in this lazy object\")\n../libs/treehydra.js:12: #1: unhandledLazyProperty(\"case_val\")\n../libs/unstable/esp.js:481: #2: ()\n./esp_lock.js:41: #3: process_tree([object GCCNode])\n\nTest Failure: \n Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_bad1.js locks_bad1.cc\n Failure msg: Expected 'locks_bad1.cc:11: error: precondition not met' in error output; not found. stderr:../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0: #0: Error(\"No case_val in this lazy object\")\n../libs/treehydra.js:12: #1: unhandledLazyProperty(\"case_val\")\n../libs/unstable/esp.js:481: #2: ()\n./esp_lock.js:41: #3: process_tree([object GCCNode])\n\n\nUnit Test Suite Summary:\n 32 passed\n 6 failed\n 0 error(s)\nmake[1]: *** [check_treehydra] Error 1\nmake[1]: Leaving directory `/var/www/html/dxr/tools/gcc-dehydra/dehydra/test'\nmake: *** [check] Error 2"}]} - """ - + inline_data = [ + { + "summary": " cleanup", + "comments": [ + { + "text": "Fix for\n{{ \nanthonyd (2 warnings)\n1.\tlayout/html/base/src/nsFrame.cpp:3879 (See build log excerpt)\n\t`nsIFrame*thisBlock' might be used uninitialized in this function\n2.\tlayout/html/base/src/nsFrame.cpp:3908 (See build log excerpt)\n\t`nsIFrame*thisBlock' might be used uninitialized in this function\n}} (NB: lines should be lines - 2, due to checkin \"in progress\")\nwill be included." + } + ], + }, + { + "summary": "Spidermonkey regression causes treehydra trunk to fail 6 tests", + "comments": [ + { + "text": 'Today I\'m trying to get callgraph stuff hooked into dxr, and I\'m unable to get a working treehydra. I\'ve updated tried updating just dehydra, then I updated gcc w/plugins using the new stuff in the patch queue, and it doesn\'t matter. Running make check_treehydra fails like this:\n\nTest Failure: \n Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_bad3.js locks_bad3.cc\n Failure msg: Expected \'locks_bad3.cc:10: error: precondition not met\' in error output; not found. stderr:../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0: #0: Error("No case_val in this lazy object")\n../libs/treehydra.js:12: #1: unhandledLazyProperty("case_val")\n../libs/unstable/esp.js:481: #2: ()\n./esp_lock.js:41: #3: process_tree([object GCCNode])\n\nTest Failure: \n Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_good.js locks_good.cc\n Failure msg: Expected no error output, got error output :../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0: #0: Error("No case_val in this lazy object")\n../libs/treehydra.js:12: #1: unhandledLazyProperty("case_val")\n../libs/unstable/esp.js:481: #2: ()\n./esp_lock.js:41: #3: process_tree([object GCCNode])\n\nTest Failure: \n Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_good2.js locks_good2.cc\n Failure msg: Expected no error output, got error output :../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0: #0: Error("No case_val in this lazy object")\n../libs/treehydra.js:12: #1: unhandledLazyProperty("case_val")\n../libs/unstable/esp.js:481: #2: ()\n./esp_lock.js:41: #3: process_tree([object GCCNode])\n\nTest Failure: \n Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_bad4.js locks_bad4.cc\n Failure msg: Expected \'locks_bad4.cc:13: error: precondition not met\' in error output; not found. stderr:../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0: #0: Error("No case_val in this lazy object")\n../libs/treehydra.js:12: #1: unhandledLazyProperty("case_val")\n../libs/unstable/esp.js:481: #2: ()\n./esp_lock.js:41: #3: process_tree([object GCCNode])\n\nTest Failure: \n Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_bad2.js locks_bad2.cc\n Failure msg: Expected \'locks_bad2.cc:12: error: precondition not met\' in error output; not found. stderr:../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0: #0: Error("No case_val in this lazy object")\n../libs/treehydra.js:12: #1: unhandledLazyProperty("case_val")\n../libs/unstable/esp.js:481: #2: ()\n./esp_lock.js:41: #3: process_tree([object GCCNode])\n\nTest Failure: \n Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_bad1.js locks_bad1.cc\n Failure msg: Expected \'locks_bad1.cc:11: error: precondition not met\' in error output; not found. stderr:../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0: #0: Error("No case_val in this lazy object")\n../libs/treehydra.js:12: #1: unhandledLazyProperty("case_val")\n../libs/unstable/esp.js:481: #2: ()\n./esp_lock.js:41: #3: process_tree([object GCCNode])\n\n\nUnit Test Suite Summary:\n 32 passed\n 6 failed\n 0 error(s)\nmake[1]: *** [check_treehydra] Error 1\nmake[1]: Leaving directory `/var/www/html/dxr/tools/gcc-dehydra/dehydra/test\'\nmake: *** [check] Error 2' + } + ], + }, + ] expected_results = [ [ "nsFrame.cpp", From 176079c5cd981d2ae2cb6d3dbfeb541932339959 Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Fri, 18 Oct 2024 13:06:56 -0400 Subject: [PATCH 22/41] Deleted fixture for file paths --- tests/fixtures/bug_features/file_paths.json | 2 -- 1 file changed, 2 deletions(-) delete mode 100644 tests/fixtures/bug_features/file_paths.json diff --git a/tests/fixtures/bug_features/file_paths.json b/tests/fixtures/bug_features/file_paths.json deleted file mode 100644 index 65c9a7f73e..0000000000 --- a/tests/fixtures/bug_features/file_paths.json +++ /dev/null @@ -1,2 +0,0 @@ -{"summary": " cleanup", "comments": [{"text": "Fix for\n{{ \nanthonyd (2 warnings)\n1.\tlayout/html/base/src/nsFrame.cpp:3879 (See build log excerpt)\n\t`nsIFrame*thisBlock' might be used uninitialized in this function\n2.\tlayout/html/base/src/nsFrame.cpp:3908 (See build log excerpt)\n\t`nsIFrame*thisBlock' might be used uninitialized in this function\n}} (NB: lines should be lines - 2, due to checkin \"in progress\")\nwill be included."}]} -{"summary": "Spidermonkey regression causes treehydra trunk to fail 6 tests", "comments": [{"text": "Today I'm trying to get callgraph stuff hooked into dxr, and I'm unable to get a working treehydra. I've updated tried updating just dehydra, then I updated gcc w/plugins using the new stuff in the patch queue, and it doesn't matter. Running make check_treehydra fails like this:\n\nTest Failure: \n Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_bad3.js locks_bad3.cc\n Failure msg: Expected 'locks_bad3.cc:10: error: precondition not met' in error output; not found. stderr:../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0: #0: Error(\"No case_val in this lazy object\")\n../libs/treehydra.js:12: #1: unhandledLazyProperty(\"case_val\")\n../libs/unstable/esp.js:481: #2: ()\n./esp_lock.js:41: #3: process_tree([object GCCNode])\n\nTest Failure: \n Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_good.js locks_good.cc\n Failure msg: Expected no error output, got error output :../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0: #0: Error(\"No case_val in this lazy object\")\n../libs/treehydra.js:12: #1: unhandledLazyProperty(\"case_val\")\n../libs/unstable/esp.js:481: #2: ()\n./esp_lock.js:41: #3: process_tree([object GCCNode])\n\nTest Failure: \n Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_good2.js locks_good2.cc\n Failure msg: Expected no error output, got error output :../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0: #0: Error(\"No case_val in this lazy object\")\n../libs/treehydra.js:12: #1: unhandledLazyProperty(\"case_val\")\n../libs/unstable/esp.js:481: #2: ()\n./esp_lock.js:41: #3: process_tree([object GCCNode])\n\nTest Failure: \n Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_bad4.js locks_bad4.cc\n Failure msg: Expected 'locks_bad4.cc:13: error: precondition not met' in error output; not found. stderr:../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0: #0: Error(\"No case_val in this lazy object\")\n../libs/treehydra.js:12: #1: unhandledLazyProperty(\"case_val\")\n../libs/unstable/esp.js:481: #2: ()\n./esp_lock.js:41: #3: process_tree([object GCCNode])\n\nTest Failure: \n Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_bad2.js locks_bad2.cc\n Failure msg: Expected 'locks_bad2.cc:12: error: precondition not met' in error output; not found. stderr:../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0: #0: Error(\"No case_val in this lazy object\")\n../libs/treehydra.js:12: #1: unhandledLazyProperty(\"case_val\")\n../libs/unstable/esp.js:481: #2: ()\n./esp_lock.js:41: #3: process_tree([object GCCNode])\n\nTest Failure: \n Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_bad1.js locks_bad1.cc\n Failure msg: Expected 'locks_bad1.cc:11: error: precondition not met' in error output; not found. stderr:../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0: #0: Error(\"No case_val in this lazy object\")\n../libs/treehydra.js:12: #1: unhandledLazyProperty(\"case_val\")\n../libs/unstable/esp.js:481: #2: ()\n./esp_lock.js:41: #3: process_tree([object GCCNode])\n\n\nUnit Test Suite Summary:\n 32 passed\n 6 failed\n 0 error(s)\nmake[1]: *** [check_treehydra] Error 1\nmake[1]: Leaving directory `/var/www/html/dxr/tools/gcc-dehydra/dehydra/test'\nmake: *** [check] Error 2"}]} \ No newline at end of file From 19a289c15eed3bd3bdd38555159c95c016a18677 Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Fri, 18 Oct 2024 15:42:30 -0400 Subject: [PATCH 23/41] Pre-compile regex --- bugbug/bug_features.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/bugbug/bug_features.py b/bugbug/bug_features.py index 721de5298b..f396143ec3 100644 --- a/bugbug/bug_features.py +++ b/bugbug/bug_features.py @@ -962,11 +962,13 @@ def __init__(self): filtered_tlds = [tld for tld in tlds if tld[1:] not in self.valid_extensions] self.non_file_path_keywords.extend(filtered_tlds) + keyword_pattern_string = "|".join( + re.escape(keyword) for keyword in self.non_file_path_keywords + ) + self.keyword_pattern = re.compile(rf"\S*({keyword_pattern_string})\S*") + def remove_urls(self, text: str) -> str: - for keyword in self.non_file_path_keywords: - if keyword in text: - text = re.sub(rf"\S*{re.escape(keyword)}\S*", "", text) - return text + return self.keyword_pattern.sub("", text) def extract_valid_file_path(self, word: str) -> str: match = self.extension_pattern.search(word) From 4fd1f045cba23e96fd85dbd4dfa863e487d64f37 Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Fri, 18 Oct 2024 15:52:01 -0400 Subject: [PATCH 24/41] Removed comment --- bugbug/bug_features.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/bugbug/bug_features.py b/bugbug/bug_features.py index f396143ec3..52ac4863e2 100644 --- a/bugbug/bug_features.py +++ b/bugbug/bug_features.py @@ -908,21 +908,6 @@ class BugType(SingleBugFeature): def __call__(self, bug, **kwargs): return bug["type"] -# class ExtractFilePaths(SingleBugFeature): -# """Extract file paths (partial and full) from bug data.""" - -# name = "Extract File Paths" - -# def __call__(self, bug: bugzilla.BugDict, **kwargs) -> list[str]: -# text = ( -# bug.get("summary", "") -# + " " -# + " ".join(comment["text"] for comment in bug.get("comments", [])) -# ) - -# paths = re.findall(r"\b[\w/\\]+/\w+\.\w+\b", text) - -# return sorted(set(paths)) class FilePaths(SingleBugFeature): From 0d8d9ddf7aeb3e843bcd6f20e56f0f7be3b337ee Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Mon, 21 Oct 2024 10:35:53 -0400 Subject: [PATCH 25/41] Changed default value of `inline_data` to `None` --- tests/test_bug_features.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_bug_features.py b/tests/test_bug_features.py index 117536a13a..853cd630db 100644 --- a/tests/test_bug_features.py +++ b/tests/test_bug_features.py @@ -44,7 +44,7 @@ def _read( feature_extractor_class, expected_results, use_inline_data=False, - inline_data="", + inline_data=None, ): feature_extractor = feature_extractor_class() From c28ad975859b90d3d26d40e4ef36d7bb70a8794f Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Mon, 21 Oct 2024 10:51:26 -0400 Subject: [PATCH 26/41] Removed inline data boolean --- tests/test_bug_features.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/test_bug_features.py b/tests/test_bug_features.py index 853cd630db..69b0dbeedc 100644 --- a/tests/test_bug_features.py +++ b/tests/test_bug_features.py @@ -43,12 +43,11 @@ def _read( path, feature_extractor_class, expected_results, - use_inline_data=False, inline_data=None, ): feature_extractor = feature_extractor_class() - if use_inline_data: + if inline_data: results = (feature_extractor(item) for item in inline_data) else: path = get_fixture_path(os.path.join("bug_features", path)) @@ -402,4 +401,4 @@ def test_FilePaths(read): ], ] - read("", FilePaths, expected_results, use_inline_data=True, inline_data=inline_data) + read("", FilePaths, expected_results, inline_data=inline_data) From 24a5375697184c447fe2263b8156d2aecb6dca01 Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Mon, 21 Oct 2024 11:14:42 -0400 Subject: [PATCH 27/41] Removed `readlines()` --- tests/test_bug_features.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/test_bug_features.py b/tests/test_bug_features.py index 69b0dbeedc..4569d9758d 100644 --- a/tests/test_bug_features.py +++ b/tests/test_bug_features.py @@ -52,9 +52,7 @@ def _read( else: path = get_fixture_path(os.path.join("bug_features", path)) with open(path, "r") as f: - lines = f.readlines() - - results = (feature_extractor(json.loads(line)) for line in lines) + results = (feature_extractor(json.loads(line)) for line in f) for result, expected_result in zip(results, expected_results): assert result == expected_result From 2bcfb18ba5a0ff69be133b2278e11428156b480a Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Mon, 21 Oct 2024 13:50:50 -0400 Subject: [PATCH 28/41] Converted results into a list --- tests/test_bug_features.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_bug_features.py b/tests/test_bug_features.py index 4569d9758d..841abba2f1 100644 --- a/tests/test_bug_features.py +++ b/tests/test_bug_features.py @@ -52,7 +52,7 @@ def _read( else: path = get_fixture_path(os.path.join("bug_features", path)) with open(path, "r") as f: - results = (feature_extractor(json.loads(line)) for line in f) + results = list(feature_extractor(json.loads(line)) for line in f) for result, expected_result in zip(results, expected_results): assert result == expected_result From 9bed4a17fcea87d84d991ecba6607c883e6a8870 Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Mon, 21 Oct 2024 15:32:10 -0400 Subject: [PATCH 29/41] Moved FilePaths test to function --- tests/test_bug_features.py | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/tests/test_bug_features.py b/tests/test_bug_features.py index 841abba2f1..c4b39b0ced 100644 --- a/tests/test_bug_features.py +++ b/tests/test_bug_features.py @@ -39,25 +39,17 @@ @pytest.fixture def read(get_fixture_path): - def _read( - path, - feature_extractor_class, - expected_results, - inline_data=None, - ): + def _read(path, feature_extractor_class, expected_results): feature_extractor = feature_extractor_class() - if inline_data: - results = (feature_extractor(item) for item in inline_data) - else: - path = get_fixture_path(os.path.join("bug_features", path)) - with open(path, "r") as f: - results = list(feature_extractor(json.loads(line)) for line in f) + path = get_fixture_path(os.path.join("bug_features", path)) - for result, expected_result in zip(results, expected_results): - assert result == expected_result + with open(path, "r") as f: + results = (feature_extractor(json.loads(line)) for line in f) + for result, expected_result in zip(results, expected_results): + assert result == expected_result - return _read + return _read def test_has_str(read): @@ -399,4 +391,7 @@ def test_FilePaths(read): ], ] - read("", FilePaths, expected_results, inline_data=inline_data) + results = (FilePaths(item) for item in inline_data) + + for result, expected_result in zip(results, expected_results): + assert result == expected_result From e76c0d044f0190a978f6e5aa7b853fbe5961da65 Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Mon, 21 Oct 2024 15:33:20 -0400 Subject: [PATCH 30/41] Fixed indentation --- tests/test_bug_features.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_bug_features.py b/tests/test_bug_features.py index c4b39b0ced..a0849d3883 100644 --- a/tests/test_bug_features.py +++ b/tests/test_bug_features.py @@ -49,7 +49,7 @@ def _read(path, feature_extractor_class, expected_results): for result, expected_result in zip(results, expected_results): assert result == expected_result - return _read + return _read def test_has_str(read): From 1c00a106ad6e658f8e7c5539da85cab38beb9d04 Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Mon, 21 Oct 2024 15:52:07 -0400 Subject: [PATCH 31/41] Fixed assertion --- tests/test_bug_features.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/test_bug_features.py b/tests/test_bug_features.py index a0849d3883..fdc405aa11 100644 --- a/tests/test_bug_features.py +++ b/tests/test_bug_features.py @@ -391,7 +391,6 @@ def test_FilePaths(read): ], ] - results = (FilePaths(item) for item in inline_data) - - for result, expected_result in zip(results, expected_results): - assert result == expected_result + feature_extractor = FilePaths() + results = [feature_extractor(item) for item in inline_data] + assert results == expected_results From f2a9d39115df07aeafea1f9a93edd4a0e96a6b7d Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Wed, 23 Oct 2024 09:45:49 -0400 Subject: [PATCH 32/41] Changed `valid_extensions` to a local variable instead of an attribute --- bugbug/bug_features.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/bugbug/bug_features.py b/bugbug/bug_features.py index 52ac4863e2..1f35909f45 100644 --- a/bugbug/bug_features.py +++ b/bugbug/bug_features.py @@ -923,19 +923,15 @@ def __init__(self): "@", ] - self.valid_extensions = set( - ext.lstrip(".") for ext in mimetypes.types_map.keys() - ) + valid_extensions = set(ext.lstrip(".") for ext in mimetypes.types_map.keys()) lexers = get_all_lexers() lexer_extensions = set(ext[2:] for lexer in lexers for ext in lexer[2]) - self.valid_extensions.update(lexer_extensions) - self.valid_extensions = sorted(self.valid_extensions, key=len, reverse=True) + valid_extensions.update(lexer_extensions) + valid_extensions = sorted(valid_extensions, key=len, reverse=True) - extension_pattern_string = "|".join( - re.escape(ext) for ext in self.valid_extensions - ) + extension_pattern_string = "|".join(re.escape(ext) for ext in valid_extensions) self.extension_pattern = re.compile( rf"\.({extension_pattern_string})(?![a-zA-Z])" @@ -944,7 +940,7 @@ def __init__(self): psl = PublicSuffixList() tlds = set(f".{entry}" for entry in psl.tlds if "." not in entry) - filtered_tlds = [tld for tld in tlds if tld[1:] not in self.valid_extensions] + filtered_tlds = [tld for tld in tlds if tld[1:] not in valid_extensions] self.non_file_path_keywords.extend(filtered_tlds) keyword_pattern_string = "|".join( From b677ccbc710126d89117393404f7be401f2c7a7b Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Wed, 23 Oct 2024 17:52:34 -0400 Subject: [PATCH 33/41] Converted `non_file_path_keywords` from attribute to local variable --- bugbug/bug_features.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bugbug/bug_features.py b/bugbug/bug_features.py index 1f35909f45..93e12f705a 100644 --- a/bugbug/bug_features.py +++ b/bugbug/bug_features.py @@ -916,7 +916,7 @@ class FilePaths(SingleBugFeature): name = "Extract File Paths" def __init__(self): - self.non_file_path_keywords = [ + non_file_path_keywords = [ "http://", "https://", "www.", @@ -941,10 +941,10 @@ def __init__(self): tlds = set(f".{entry}" for entry in psl.tlds if "." not in entry) filtered_tlds = [tld for tld in tlds if tld[1:] not in valid_extensions] - self.non_file_path_keywords.extend(filtered_tlds) + non_file_path_keywords.extend(filtered_tlds) keyword_pattern_string = "|".join( - re.escape(keyword) for keyword in self.non_file_path_keywords + re.escape(keyword) for keyword in non_file_path_keywords ) self.keyword_pattern = re.compile(rf"\S*({keyword_pattern_string})\S*") From 70f72f5fedcd99e6e92d9cb8d431ac4c010bd6e2 Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Wed, 23 Oct 2024 17:55:15 -0400 Subject: [PATCH 34/41] Added comment explaining sorting `valid_extensions` --- bugbug/bug_features.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bugbug/bug_features.py b/bugbug/bug_features.py index 93e12f705a..45a19cb3e5 100644 --- a/bugbug/bug_features.py +++ b/bugbug/bug_features.py @@ -929,6 +929,8 @@ def __init__(self): lexer_extensions = set(ext[2:] for lexer in lexers for ext in lexer[2]) valid_extensions.update(lexer_extensions) + + # Sorted from longest to shortest length to avoid partial matches (e.g. ".css" over ".c") valid_extensions = sorted(valid_extensions, key=len, reverse=True) extension_pattern_string = "|".join(re.escape(ext) for ext in valid_extensions) From 836d42d9c084530d878958800b6eb5f009fa37b0 Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Wed, 23 Oct 2024 18:27:52 -0400 Subject: [PATCH 35/41] Removed deletion of URLs from string --- bugbug/bug_features.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/bugbug/bug_features.py b/bugbug/bug_features.py index 45a19cb3e5..ace80265c3 100644 --- a/bugbug/bug_features.py +++ b/bugbug/bug_features.py @@ -950,10 +950,13 @@ def __init__(self): ) self.keyword_pattern = re.compile(rf"\S*({keyword_pattern_string})\S*") - def remove_urls(self, text: str) -> str: - return self.keyword_pattern.sub("", text) + def is_valid_file_path_candidate(self, word: str) -> bool: + return not self.keyword_pattern.search(word) def extract_valid_file_path(self, word: str) -> str: + if not self.is_valid_file_path_candidate(word): + return "" + match = self.extension_pattern.search(word) if match: ext = match.group(1) @@ -965,9 +968,7 @@ def extract_valid_file_path(self, word: str) -> str: return "" def __call__(self, bug: bugzilla.BugDict, **kwargs) -> list[str]: - text = self.remove_urls( - f"{bug.get('summary', '')} {bug['comments'][0]['text']}" - ) + text = f"{bug.get('summary', '')} {bug['comments'][0]['text']}" file_paths = [ path From d6f800219fd86927076c7eea128807dd61aeb125 Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Fri, 25 Oct 2024 09:37:15 -0400 Subject: [PATCH 36/41] Removed sorting (test) --- bugbug/bug_features.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bugbug/bug_features.py b/bugbug/bug_features.py index ace80265c3..368fc4e94e 100644 --- a/bugbug/bug_features.py +++ b/bugbug/bug_features.py @@ -931,7 +931,7 @@ def __init__(self): valid_extensions.update(lexer_extensions) # Sorted from longest to shortest length to avoid partial matches (e.g. ".css" over ".c") - valid_extensions = sorted(valid_extensions, key=len, reverse=True) + # valid_extensions = sorted(valid_extensions, key=len, reverse=True) extension_pattern_string = "|".join(re.escape(ext) for ext in valid_extensions) From e11be5ba003d4d7b09a65cb17d3a328bd9c42232 Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Fri, 25 Oct 2024 09:49:05 -0400 Subject: [PATCH 37/41] Removed sorting comment --- bugbug/bug_features.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/bugbug/bug_features.py b/bugbug/bug_features.py index 368fc4e94e..d3487d3f02 100644 --- a/bugbug/bug_features.py +++ b/bugbug/bug_features.py @@ -930,9 +930,6 @@ def __init__(self): valid_extensions.update(lexer_extensions) - # Sorted from longest to shortest length to avoid partial matches (e.g. ".css" over ".c") - # valid_extensions = sorted(valid_extensions, key=len, reverse=True) - extension_pattern_string = "|".join(re.escape(ext) for ext in valid_extensions) self.extension_pattern = re.compile( From 38432cf1eb58b68eac8af55474231af4ec84a6a7 Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Fri, 25 Oct 2024 09:52:22 -0400 Subject: [PATCH 38/41] Simplified updating valid extensions set with lexers --- bugbug/bug_features.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/bugbug/bug_features.py b/bugbug/bug_features.py index d3487d3f02..c6857ab376 100644 --- a/bugbug/bug_features.py +++ b/bugbug/bug_features.py @@ -925,10 +925,9 @@ def __init__(self): valid_extensions = set(ext.lstrip(".") for ext in mimetypes.types_map.keys()) - lexers = get_all_lexers() - lexer_extensions = set(ext[2:] for lexer in lexers for ext in lexer[2]) - - valid_extensions.update(lexer_extensions) + valid_extensions.update( + ext[2:] for (_, _, exts) in get_all_lexers() for ext in exts + ) extension_pattern_string = "|".join(re.escape(ext) for ext in valid_extensions) From 9373c0b8ff46584b93c25597ba6960e5b4cd9c6a Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Fri, 25 Oct 2024 10:28:04 -0400 Subject: [PATCH 39/41] Fixed ValueError --- bugbug/bug_features.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bugbug/bug_features.py b/bugbug/bug_features.py index c6857ab376..facfafa165 100644 --- a/bugbug/bug_features.py +++ b/bugbug/bug_features.py @@ -925,8 +925,8 @@ def __init__(self): valid_extensions = set(ext.lstrip(".") for ext in mimetypes.types_map.keys()) - valid_extensions.update( - ext[2:] for (_, _, exts) in get_all_lexers() for ext in exts + valid_extensions = set( + ext[2:] for (_, _, exts, *_) in get_all_lexers() for ext in exts ) extension_pattern_string = "|".join(re.escape(ext) for ext in valid_extensions) From 2022eb4d235de28c36900f55ffc1a5a334e33f3b Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Fri, 25 Oct 2024 10:47:51 -0400 Subject: [PATCH 40/41] Fixed ValueError --- bugbug/bug_features.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bugbug/bug_features.py b/bugbug/bug_features.py index facfafa165..92bf841a6f 100644 --- a/bugbug/bug_features.py +++ b/bugbug/bug_features.py @@ -925,7 +925,7 @@ def __init__(self): valid_extensions = set(ext.lstrip(".") for ext in mimetypes.types_map.keys()) - valid_extensions = set( + valid_extensions.update( ext[2:] for (_, _, exts, *_) in get_all_lexers() for ext in exts ) From 64e5c3b7f0f70929797ceea285963c124043272e Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Mon, 28 Oct 2024 09:55:10 -0400 Subject: [PATCH 41/41] Removed tracking --- requirements.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index aba9ceef58..c2ee32bc89 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,10 +21,8 @@ numpy==1.26.4 orjson==3.10.9 ortools==9.11.4210 pandas==2.2.3 -<<<<<<< HEAD psutil==6.1.0 publicsuffix2==2.20191221 ->>>>>>> 0f27b3cb (Added `publicsuffix2` to generate list of tlds) pydriller==1.12 pyOpenSSL>=0.14 # Could not find a version that satisfies the requirement pyOpenSSL>=0.14; extra == "security" (from requests[security]>=2.7.0->libmozdata==0.1.43) python-dateutil==2.9.0.post0