From 63bcd8f80a026cc4af89d0b8bee55a9574c78ac6 Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Tue, 25 Jun 2024 16:39:54 -0400 Subject: [PATCH 1/2] Added SMOTE oversampling and random undersampling --- bugbug/models/component.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/bugbug/models/component.py b/bugbug/models/component.py index e8fa9f8eb1..4cbd867e4d 100644 --- a/bugbug/models/component.py +++ b/bugbug/models/component.py @@ -10,6 +10,9 @@ import dateutil.parser import xgboost from dateutil.relativedelta import relativedelta +from imblearn.over_sampling import SMOTE +from imblearn.pipeline import Pipeline as ImblearnPipeline +from imblearn.under_sampling import RandomUnderSampler from sklearn.compose import ColumnTransformer from sklearn.feature_extraction import DictVectorizer from sklearn.pipeline import Pipeline @@ -103,7 +106,7 @@ def __init__(self, lemmatization=False): ] ) - self.clf = Pipeline( + self.clf = ImblearnPipeline( [ ( "union", @@ -119,6 +122,8 @@ def __init__(self, lemmatization=False): ] ), ), + ("smote", SMOTE(random_state=42)), + ("undersample", RandomUnderSampler(random_state=42)), ( "estimator", xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count()), From f45094704fca7448789a4ba9c6e5217f43cd4375 Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Fri, 28 Jun 2024 08:19:01 -0400 Subject: [PATCH 2/2] Added SMOTE --- bugbug/models/component.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/bugbug/models/component.py b/bugbug/models/component.py index 4cbd867e4d..fb171e6f73 100644 --- a/bugbug/models/component.py +++ b/bugbug/models/component.py @@ -12,7 +12,6 @@ from dateutil.relativedelta import relativedelta from imblearn.over_sampling import SMOTE from imblearn.pipeline import Pipeline as ImblearnPipeline -from imblearn.under_sampling import RandomUnderSampler from sklearn.compose import ColumnTransformer from sklearn.feature_extraction import DictVectorizer from sklearn.pipeline import Pipeline @@ -122,8 +121,7 @@ def __init__(self, lemmatization=False): ] ), ), - ("smote", SMOTE(random_state=42)), - ("undersample", RandomUnderSampler(random_state=42)), + ("sampler", SMOTE(random_state=1, sampling_strategy="all")), ( "estimator", xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count()),