Mirror of https://github.com/twitter/the-algorithm.git
synced 2025-06-10 14:48:16 -05:00
Twitter Recommendation Algorithm
Please note we have force-pushed a new initial commit in order to remove some publicly-available Twitter user information. Note that this process may be required in the future.
0    trust_and_safety_models/toxicity/data/__init__.py    Normal file
118  trust_and_safety_models/toxicity/data/data_preprocessing.py    Normal file
@@ -0,0 +1,118 @@
from abc import ABC
import re

from toxicity_ml_pipeline.settings.hcomp_settings import TOXIC_35

import numpy as np

# Annotation values from the v3.5 task that are mapped to the toxic class.
TOXIC_35_set = set(TOXIC_35)

# Regexes for URLs, @mentions, runs of newlines, and HTML-escaped ampersands.
url_group = r"(\bhttps?:\/\/\S+)"
mention_group = r"(\B@\S+)"
urls_mentions_re = re.compile(url_group + r"|" + mention_group, re.IGNORECASE)
url_re = re.compile(url_group, re.IGNORECASE)
mention_re = re.compile(mention_group, re.IGNORECASE)
newline_re = re.compile(r"\n+", re.IGNORECASE)
and_re = re.compile(r"&\s?amp\s?;", re.IGNORECASE)


class DataframeCleaner(ABC):
  """Base cleaner: keep the raw text, apply subclass-specific cleaning, drop media
  tweets and duplicate texts, then post-process."""

  def __init__(self):
    pass

  def _clean(self, df):
    return df

  def _systematic_preprocessing(self, df):
    df.reset_index(inplace=True, drop=True)
    if "media_url" in df.columns:
      print(".... removing tweets with media")
      df.drop(df[~df.media_url.isna()].index, inplace=True, axis=0)
    else:
      print("WARNING: you are not removing tweets with media to train a BERT model.")

    print(".... deleting duplicates")
    df.drop_duplicates("text", inplace=True, keep="last")
    print(f"Got {df.shape[0]} after cleaning")

    return df.reset_index(inplace=False, drop=True)

  def _postprocess(self, df, *args, **kwargs):
    return df

  def __call__(self, df, *args, **kwargs):
    print(f"Got {df.shape[0]} before cleaning")

    df["raw_text"] = df.text
    df = self._clean(df)

    df = self._systematic_preprocessing(df)

    return self._postprocess(df, *args, **kwargs)


def mapping_func(el):
  # 2 if the v3.5 annotation is toxic, 1 if the v3 label is toxic, 0 otherwise.
  if el.aggregated_content in TOXIC_35_set:
    return 2
  if el.label == 1:
    return 1
  return 0


class DefaultENNoPreprocessor(DataframeCleaner):
  """Computes annotator agreement and remaps labels without altering the tweet text."""

  def _postprocess(self, df, *args, **kwargs):
    if "toxic_count" in df.columns and "non_toxic_count" in df.columns:
      df["vote"] = df.toxic_count / (df.toxic_count + df.non_toxic_count)
      df["agreement_rate"] = np.max((df.vote, 1 - df.vote), axis=0)

    if "label_column" in kwargs and kwargs["label_column"] != "label":
      if kwargs["label_column"] == "aggregated_content":
        print("Replacing v3 label by v3.5 label.")
        if "num_classes" in kwargs and kwargs["num_classes"] < 3:
          df["label"] = np.where(df.aggregated_content.isin(TOXIC_35_set), 1, 0)
        elif "num_classes" in kwargs and kwargs["num_classes"] == 3:
          print("Making it a 3-class problem")
          df["label"] = df.apply(mapping_func, axis=1)
        else:
          raise NotImplementedError
      elif kwargs["label_column"] in df.columns:
        df["label"] = df[kwargs["label_column"]]
        if kwargs["class_weight"] is not None:
          df["class_weight"] = np.where(
            df["label"] == 1, 1 - kwargs["class_weight"], kwargs["class_weight"]
          )
      else:
        raise NotImplementedError

    if "filter_low_agreements" in kwargs and kwargs["filter_low_agreements"]:
      df.drop(df[(df.agreement_rate <= 0.6)].index, axis=0, inplace=True)
      raise NotImplementedError

    return df


class DefaultENPreprocessor(DefaultENNoPreprocessor):
  """Additionally replaces URLs and @mentions with placeholders and normalizes
  newlines and HTML-escaped ampersands."""

  def _clean(self, adhoc_df):
    print(
      ".... removing \\n and replacing @mentions and URLs by placeholders. "
      "Emoji filtering is not done."
    )
    adhoc_df["text"] = [url_re.sub("URL", tweet) for tweet in adhoc_df.raw_text.values]
    adhoc_df["text"] = [mention_re.sub("MENTION", tweet) for tweet in adhoc_df.text.values]
    adhoc_df["text"] = [
      newline_re.sub(" ", tweet).lstrip(" ").rstrip(" ") for tweet in adhoc_df.text.values
    ]
    adhoc_df["text"] = [and_re.sub("&", tweet) for tweet in adhoc_df.text.values]

    return adhoc_df


class Defaulti18nPreprocessor(DataframeCleaner):
  """Removes URLs, @mentions and newlines outright, without placeholders."""

  def _clean(self, adhoc_df):
    print(".... removing @mentions, \\n and URLs. Emoji filtering is not done.")
    adhoc_df["text"] = [urls_mentions_re.sub("", tweet) for tweet in adhoc_df.raw_text.values]
    adhoc_df["text"] = [
      newline_re.sub(" ", tweet).lstrip(" ").rstrip(" ") for tweet in adhoc_df.text.values
    ]

    return adhoc_df
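A quick way to see what these cleaners do end to end is to call one on a toy DataFrame. The sketch below is illustrative and not part of the commit; it assumes the module above is importable as toxicity_ml_pipeline.data.data_preprocessing (so that hcomp_settings resolves) and uses made-up tweets:

# Usage sketch (illustrative only, not part of the original file).
# Assumes toxicity_ml_pipeline and its settings modules are importable.
import pandas as pd

from toxicity_ml_pipeline.data.data_preprocessing import DefaultENPreprocessor

toy_df = pd.DataFrame(
  {
    "text": [
      "@someone check https://t.co/abc &amp; tell me\nwhat you think",
      "a plain tweet with no mentions or links",
    ]
  }
)

cleaner = DefaultENPreprocessor()
# Keeps the original text in raw_text, replaces URLs/@mentions with placeholders,
# collapses newlines, rewrites "&amp;" to "&", and drops duplicate texts.
clean_df = cleaner(toy_df)
print(clean_df[["raw_text", "text"]])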
348  trust_and_safety_models/toxicity/data/dataframe_loader.py    Normal file
@@ -0,0 +1,348 @@
from abc import ABC, abstractmethod
from datetime import date
from importlib import import_module
import pickle

from toxicity_ml_pipeline.settings.default_settings_tox import (
  CLIENT,
  EXISTING_TASK_VERSIONS,
  GCS_ADDRESS,
  TRAINING_DATA_LOCATION,
)
from toxicity_ml_pipeline.utils.helpers import execute_command, execute_query
from toxicity_ml_pipeline.utils.queries import (
  FULL_QUERY,
  FULL_QUERY_W_TWEET_TYPES,
  PARSER_UDF,
  QUERY_SETTINGS,
)

import numpy as np
import pandas


class DataframeLoader(ABC):
  """Abstract base class for loaders that assemble a training DataFrame."""

  def __init__(self, project):
    self.project = project

  @abstractmethod
  def produce_query(self):
    pass

  @abstractmethod
  def load_data(self, test=False):
    pass


class ENLoader(DataframeLoader):
  """Loads English training data from BigQuery, with a GCS pickle fallback for
  intermediate files."""

  def __init__(self, project, setting_file):
    super(ENLoader, self).__init__(project=project)
    self.date_begin = setting_file.DATE_BEGIN
    self.date_end = setting_file.DATE_END
    TASK_VERSION = setting_file.TASK_VERSION
    if TASK_VERSION not in EXISTING_TASK_VERSIONS:
      raise ValueError
    self.task_version = TASK_VERSION
    self.query_settings = dict(QUERY_SETTINGS)
    self.full_query = FULL_QUERY

  def produce_query(self, date_begin, date_end, task_version=None, **keys):
    task_version = self.task_version if task_version is None else task_version

    if task_version in keys["table"]:
      table_name = keys["table"][task_version]
      print(f"Loading {table_name}")

      main_query = keys["main"].format(
        table=table_name,
        parser_udf=PARSER_UDF[task_version],
        date_begin=date_begin,
        date_end=date_end,
      )

      return self.full_query.format(
        main_table_query=main_query, date_begin=date_begin, date_end=date_end
      )
    return ""

  def _reload(self, test, file_keyword):
    query = f"SELECT * from `{TRAINING_DATA_LOCATION.format(project=self.project)}_{file_keyword}`"

    if test:
      query += " ORDER BY RAND() LIMIT 1000"
    try:
      df = execute_query(client=CLIENT, query=query)
    except Exception:
      print(
        "Loading from BQ failed, trying to load from GCS. "
        "NB: use this option only for intermediate files, which will be deleted at the end of "
        "the project."
      )
      copy_cmd = f"gsutil cp {GCS_ADDRESS.format(project=self.project)}/training_data/{file_keyword}.pkl ."
      execute_command(copy_cmd)
      try:
        with open(f"{file_keyword}.pkl", "rb") as file:
          df = pickle.load(file)
      except Exception:
        return None

    if test:
      df = df.sample(frac=1)
      return df.iloc[:1000]

    return df

  def load_data(self, test=False, **kwargs):
    if "reload" in kwargs and kwargs["reload"]:
      df = self._reload(test, kwargs["reload"])
      if df is not None and df.shape[0] > 0:
        return df

    df = None
    query_settings = self.query_settings
    if test:
      query_settings = {"fairness": self.query_settings["fairness"]}
      query_settings["fairness"]["main"] += " LIMIT 500"

    for table, query_info in query_settings.items():
      curr_query = self.produce_query(
        date_begin=self.date_begin, date_end=self.date_end, **query_info
      )
      if curr_query == "":
        continue
      curr_df = execute_query(client=CLIENT, query=curr_query)
      curr_df["origin"] = table
      df = curr_df if df is None else pandas.concat((df, curr_df))

    df["loading_date"] = date.today()
    df["date"] = pandas.to_datetime(df.date)
    return df

  def load_precision_set(
    self, begin_date="...", end_date="...", with_tweet_types=False, task_version=3.5
  ):
    if with_tweet_types:
      self.full_query = FULL_QUERY_W_TWEET_TYPES

    query_settings = self.query_settings
    curr_query = self.produce_query(
      date_begin=begin_date,
      date_end=end_date,
      task_version=task_version,
      **query_settings["precision"],
    )
    curr_df = execute_query(client=CLIENT, query=curr_query)

    curr_df.rename(columns={"media_url": "media_presence"}, inplace=True)
    return curr_df


class ENLoaderWithSampling(ENLoader):
  """ENLoader plus weighted and keyword-based subsampling of the training pool."""

  keywords = {
    "politics": [
      ...
    ],
    "insults": [
      ...
    ],
    "race": [
      ...
    ],
  }
  n = ...
  N = ...

  def __init__(self, project):
    self.raw_loader = ENLoader(project=project)
    if project == ...:
      self.project = project
    else:
      raise ValueError

  def sample_with_weights(self, df, n):
    w = df["label"].value_counts(normalize=True)[1]
    dist = np.full((df.shape[0],), w)
    sampled_df = df.sample(n=n, weights=dist, replace=False)
    return sampled_df

  def sample_keywords(self, df, N, group):
    print("\nmatching", group, "keywords...")

    keyword_list = self.keywords[group]
    match_df = df.loc[df.text.str.lower().str.contains("|".join(keyword_list), regex=True)]

    print("sampling N/3 from", group)
    if match_df.shape[0] <= N / 3:
      print(
        "WARNING: Sampling only",
        match_df.shape[0],
        "instead of",
        N / 3,
        "examples from",
        group,
        "focused tweets due to insufficient data",
      )
      sample_df = match_df

    else:
      print(
        "sampling",
        group,
        "at",
        round(match_df["label"].value_counts(normalize=True)[1], 3),
        "% action rate",
      )
      sample_df = self.sample_with_weights(match_df, int(N / 3))
      print(sample_df.shape)
      print(sample_df.label.value_counts(normalize=True))

    print("\nshape of df before dropping sampled rows after", group, "matching..", df.shape[0])
    df = df.loc[
      df.index.difference(sample_df.index),
    ]
    print("\nshape of df after dropping sampled rows after", group, "matching..", df.shape[0])

    return df, sample_df

  def sample_first_set_helper(self, train_df, first_set, new_n):
    if first_set == "prev":
      fset = train_df.loc[train_df["origin"].isin(["prevalence", "causal prevalence"])]
      print(
        "sampling prev at", round(fset["label"].value_counts(normalize=True)[1], 3), "% action rate"
      )
    else:
      fset = train_df

    n_fset = self.sample_with_weights(fset, new_n)
    print("len of sampled first set", n_fset.shape[0])
    print(n_fset.label.value_counts(normalize=True))

    return n_fset

  def sample(self, df, first_set, second_set, keyword_sampling, n, N):
    train_df = df[df.origin != "precision"]
    val_test_df = df[df.origin == "precision"]

    print("\nsampling first set of data")
    new_n = n - N if second_set is not None else n
    n_fset = self.sample_first_set_helper(train_df, first_set, new_n)

    print("\nsampling second set of data")
    train_df = train_df.loc[
      train_df.index.difference(n_fset.index),
    ]

    if second_set is None:
      print("no second set sampling being done")
      # NB: relies on DataFrame.append, which was removed in pandas 2.0.
      df = n_fset.append(val_test_df)
      return df

    if second_set == "prev":
      sset = train_df.loc[train_df["origin"].isin(["prevalence", "causal prevalence"])]

    elif second_set == "fdr":
      sset = train_df.loc[train_df["origin"] == "fdr"]

    else:
      sset = train_df

    if keyword_sampling:
      print("sampling based off of keywords defined...")
      print("second set is", second_set, "with length", sset.shape[0])

      sset, n_politics = self.sample_keywords(sset, N, "politics")
      sset, n_insults = self.sample_keywords(sset, N, "insults")
      sset, n_race = self.sample_keywords(sset, N, "race")

      n_sset = n_politics.append([n_insults, n_race])
      print("len of sampled second set", n_sset.shape[0])

    else:
      print(
        "No keyword sampling. Instead random sampling from",
        second_set,
        "at",
        round(sset["label"].value_counts(normalize=True)[1], 3),
        "% action rate",
      )
      n_sset = self.sample_with_weights(sset, N)
      print("len of sampled second set", n_sset.shape[0])
      print(n_sset.label.value_counts(normalize=True))

    df = n_fset.append([n_sset, val_test_df])
    df = df.sample(frac=1).reset_index(drop=True)

    return df

  def load_data(
    self, first_set="prev", second_set=None, keyword_sampling=False, test=False, **kwargs
  ):
    n = kwargs.get("n", self.n)
    N = kwargs.get("N", self.N)

    df = self.raw_loader.load_data(test=test, **kwargs)
    return self.sample(df, first_set, second_set, keyword_sampling, n, N)


class I18nLoader(DataframeLoader):
  """Loads adhoc and prevalence data for the supported non-English languages."""

  def __init__(self):
    super().__init__(project=...)
    from archive.settings.... import ACCEPTED_LANGUAGES, QUERY_SETTINGS

    self.accepted_languages = ACCEPTED_LANGUAGES
    self.query_settings = dict(QUERY_SETTINGS)

  def produce_query(self, language, query, dataset, table, lang):
    query = query.format(dataset=dataset, table=table)
    add_query = f"AND reviewed.{lang}='{language}'"
    query += add_query

    return query

  def query_keys(self, language, task=2, size="50"):
    if task == 2:
      if language == "ar":
        self.query_settings["adhoc_v2"]["table"] = "..."
      elif language == "tr":
        self.query_settings["adhoc_v2"]["table"] = "..."
      elif language == "es":
        self.query_settings["adhoc_v2"]["table"] = f"..."
      else:
        self.query_settings["adhoc_v2"]["table"] = "..."

      return self.query_settings["adhoc_v2"]

    if task == 3:
      return self.query_settings["adhoc_v3"]

    raise ValueError(f"There are no other tasks than 2 or 3. {task} does not exist.")

  def load_data(self, language, test=False, task=2):
    if language not in self.accepted_languages:
      raise ValueError(
        f"Language not in the data {language}. Accepted values are " f"{self.accepted_languages}"
      )

    print(".... adhoc data")
    key_dict = self.query_keys(language=language, task=task)
    query_adhoc = self.produce_query(language=language, **key_dict)
    if test:
      query_adhoc += " LIMIT 500"
    adhoc_df = execute_query(CLIENT, query_adhoc)

    if not (test or language == "tr" or task == 3):
      if language == "es":
        print(".... additional adhoc data")
        key_dict = self.query_keys(language=language, size="100")
        query_adhoc = self.produce_query(language=language, **key_dict)
        adhoc_df = pandas.concat(
          (adhoc_df, execute_query(CLIENT, query_adhoc)), axis=0, ignore_index=True
        )

      print(".... prevalence data")
      query_prev = self.produce_query(language=language, **self.query_settings["prevalence_v2"])
      prev_df = execute_query(CLIENT, query_prev)
      prev_df["description"] = "Prevalence"
      adhoc_df = pandas.concat((adhoc_df, prev_df), axis=0, ignore_index=True)

    return self.clean(adhoc_df)
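ENLoader.produce_query builds the final SQL in two stages: the per-table "main" template from QUERY_SETTINGS is filled with the table name, parser UDF and date range, and the result is then spliced into the full query as main_table_query. Below is a standalone sketch of that assembly; the templates and table name are made up for illustration, standing in for the real FULL_QUERY / QUERY_SETTINGS / PARSER_UDF defined in toxicity_ml_pipeline.utils.queries (not shown in this commit):

# Standalone sketch of the two-stage query assembly done by ENLoader.produce_query.
# All templates and names below are illustrative, not the real query settings.
FULL_QUERY_SKETCH = (
  "WITH main_table AS ({main_table_query}) "
  "SELECT * FROM main_table WHERE date BETWEEN '{date_begin}' AND '{date_end}'"
)
query_info = {
  "table": {3.5: "project.dataset.labels_v35"},
  "main": (
    "SELECT {parser_udf}(payload) AS label, date FROM `{table}` "
    "WHERE date BETWEEN '{date_begin}' AND '{date_end}'"
  ),
}
parser_udf = {3.5: "parse_v35"}

task_version = 3.5
main_query = query_info["main"].format(
  table=query_info["table"][task_version],
  parser_udf=parser_udf[task_version],
  date_begin="2022-01-01",
  date_end="2022-06-30",
)
# The filled-in main query is nested inside the outer query template.
print(FULL_QUERY_SKETCH.format(
  main_table_query=main_query, date_begin="2022-01-01", date_end="2022-06-30"
))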
284  trust_and_safety_models/toxicity/data/mb_generator.py    Normal file
@@ -0,0 +1,284 @@
from importlib import import_module
import os

from toxicity_ml_pipeline.settings.default_settings_tox import (
  INNER_CV,
  LOCAL_DIR,
  MAX_SEQ_LENGTH,
  NUM_PREFETCH,
  NUM_WORKERS,
  OUTER_CV,
  TARGET_POS_PER_EPOCH,
)
from toxicity_ml_pipeline.utils.helpers import execute_command

import numpy as np
import pandas
from sklearn.model_selection import StratifiedKFold
import tensorflow as tf


try:
  from transformers import AutoTokenizer, DataCollatorWithPadding
except ModuleNotFoundError:
  print("...")
else:
  from datasets import Dataset


class BalancedMiniBatchLoader(object):
  """Produces class-balanced tf.data minibatches; the outer split is either
  stratified cross-validation or time-based, the inner split is stratified CV."""

  def __init__(
    self,
    fold,
    mb_size,
    seed,
    perc_training_tox,
    scope="TOX",
    project=...,
    dual_head=None,
    n_outer_splits=None,
    n_inner_splits=None,
    sample_weights=None,
    huggingface=False,
  ):
    if 0 >= perc_training_tox or perc_training_tox > 0.5:
      raise ValueError("Perc_training_tox should be in ]0; 0.5]")

    self.perc_training_tox = perc_training_tox
    if not n_outer_splits:
      n_outer_splits = OUTER_CV
    if isinstance(n_outer_splits, int):
      self.n_outer_splits = n_outer_splits
      self.get_outer_fold = self._get_outer_cv_fold
      if fold < 0 or fold >= self.n_outer_splits or int(fold) != fold:
        raise ValueError(f"Number of fold should be an integer in [0 ; {self.n_outer_splits} [.")

    elif n_outer_splits == "time":
      self.get_outer_fold = self._get_time_fold
      if fold != "time":
        raise ValueError(
          "To avoid repeating the same run many times, the external fold "
          "should be time when test data is split according to dates."
        )
      try:
        setting_file = import_module(f"toxicity_ml_pipeline.settings.{scope.lower()}{project}_settings")
      except ModuleNotFoundError:
        raise ValueError(f"You need to define a setting file for your project {project}.")
      self.test_begin_date = setting_file.TEST_BEGIN_DATE
      self.test_end_date = setting_file.TEST_END_DATE

    else:
      raise ValueError(
        f"Argument n_outer_splits should either be an integer or 'time'. Provided: {n_outer_splits}"
      )

    self.n_inner_splits = n_inner_splits if n_inner_splits is not None else INNER_CV

    self.seed = seed
    self.mb_size = mb_size
    self.fold = fold

    self.sample_weights = sample_weights
    self.dual_head = dual_head
    self.huggingface = huggingface
    if self.huggingface:
      self._load_tokenizer()

  def _load_tokenizer(self):
    print("Making a local copy of Bertweet-base model")
    local_model_dir = os.path.join(LOCAL_DIR, "models")
    cmd = f"mkdir {local_model_dir} ; gsutil -m cp -r gs://... {local_model_dir}"
    execute_command(cmd)

    self.tokenizer = AutoTokenizer.from_pretrained(
      os.path.join(local_model_dir, "bertweet-base"), normalization=True
    )

  def tokenize_function(self, el):
    return self.tokenizer(
      el["text"],
      max_length=MAX_SEQ_LENGTH,
      padding="max_length",
      truncation=True,
      add_special_tokens=True,
      return_token_type_ids=False,
      return_attention_mask=False,
    )

  def _get_stratified_kfold(self, n_splits):
    return StratifiedKFold(shuffle=True, n_splits=n_splits, random_state=self.seed)

  def _get_time_fold(self, df):
    test_begin_date = pandas.to_datetime(self.test_begin_date).date()
    test_end_date = pandas.to_datetime(self.test_end_date).date()
    print(f"Test is going from {test_begin_date} to {test_end_date}.")
    test_data = df.query("@test_begin_date <= date <= @test_end_date")

    query = "date < @test_begin_date"
    other_set = df.query(query)
    return other_set, test_data

  def _get_outer_cv_fold(self, df):
    labels = df.int_label
    stratifier = self._get_stratified_kfold(n_splits=self.n_outer_splits)

    k = 0
    for train_index, test_index in stratifier.split(np.zeros(len(labels)), labels):
      if k == self.fold:
        break
      k += 1

    train_data = df.iloc[train_index].copy()
    test_data = df.iloc[test_index].copy()

    return train_data, test_data

  def get_steps_per_epoch(self, nb_pos_examples):
    return int(max(TARGET_POS_PER_EPOCH, nb_pos_examples) / self.mb_size / self.perc_training_tox)

  def make_huggingface_tensorflow_ds(self, group, mb_size=None, shuffle=True):
    huggingface_ds = Dataset.from_pandas(group).map(self.tokenize_function, batched=True)
    data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer, return_tensors="tf")
    tensorflow_ds = huggingface_ds.to_tf_dataset(
      columns=["input_ids"],
      label_cols=["labels"],
      shuffle=shuffle,
      batch_size=self.mb_size if mb_size is None else mb_size,
      collate_fn=data_collator,
    )

    if shuffle:
      return tensorflow_ds.repeat()
    return tensorflow_ds

  def make_pure_tensorflow_ds(self, df, nb_samples):
    buffer_size = nb_samples * 2

    if self.sample_weights is not None:
      if self.sample_weights not in df.columns:
        raise ValueError
      ds = tf.data.Dataset.from_tensor_slices(
        (df.text.values, df.label.values, df[self.sample_weights].values)
      )
    elif self.dual_head:
      label_d = {f"{e}_output": df[f"{e}_label"].values for e in self.dual_head}
      label_d["content_output"] = tf.keras.utils.to_categorical(label_d["content_output"], num_classes=3)
      ds = tf.data.Dataset.from_tensor_slices((df.text.values, label_d))

    else:
      ds = tf.data.Dataset.from_tensor_slices((df.text.values, df.label.values))
    ds = ds.shuffle(buffer_size, seed=self.seed, reshuffle_each_iteration=True).repeat()
    return ds

  def get_balanced_dataset(self, training_data, size_limit=None, return_as_batch=True):
    training_data = training_data.sample(frac=1, random_state=self.seed)
    nb_samples = training_data.shape[0] if not size_limit else size_limit

    num_classes = training_data.int_label.nunique()
    toxic_class = training_data.int_label.max()
    if size_limit:
      training_data = training_data[: size_limit * num_classes]

    print(
      ".... {} examples, incl. {:.2f}% tox in train, {} classes".format(
        nb_samples,
        100 * training_data[training_data.int_label == toxic_class].shape[0] / nb_samples,
        num_classes,
      )
    )
    label_groups = training_data.groupby("int_label")
    if self.huggingface:
      label_datasets = {
        label: self.make_huggingface_tensorflow_ds(group) for label, group in label_groups
      }

    else:
      label_datasets = {
        label: self.make_pure_tensorflow_ds(group, nb_samples=nb_samples * 2)
        for label, group in label_groups
      }

    datasets = [label_datasets[0], label_datasets[1]]
    weights = [1 - self.perc_training_tox, self.perc_training_tox]
    if num_classes == 3:
      datasets.append(label_datasets[2])
      weights = [1 - self.perc_training_tox, self.perc_training_tox / 2, self.perc_training_tox / 2]
    elif num_classes != 2:
      raise ValueError("Currently it should not be possible to get other than 2 or 3 classes")
    resampled_ds = tf.data.experimental.sample_from_datasets(datasets, weights, seed=self.seed)

    if return_as_batch and not self.huggingface:
      return resampled_ds.batch(
        self.mb_size, drop_remainder=True, num_parallel_calls=NUM_WORKERS, deterministic=True
      ).prefetch(NUM_PREFETCH)

    return resampled_ds

  @staticmethod
  def _compute_int_labels(full_df):
    if full_df.label.dtype == int:
      full_df["int_label"] = full_df.label

    elif "int_label" not in full_df.columns:
      if full_df.label.max() > 1:
        raise ValueError("Binarizing labels that should not be.")
      full_df["int_label"] = np.where(full_df.label >= 0.5, 1, 0)

    return full_df

  def __call__(self, full_df, *args, **kwargs):
    full_df = self._compute_int_labels(full_df)

    train_data, test_data = self.get_outer_fold(df=full_df)

    stratifier = self._get_stratified_kfold(n_splits=self.n_inner_splits)
    for train_index, val_index in stratifier.split(
      np.zeros(train_data.shape[0]), train_data.int_label
    ):
      curr_train_data = train_data.iloc[train_index]

      mini_batches = self.get_balanced_dataset(curr_train_data)

      steps_per_epoch = self.get_steps_per_epoch(
        nb_pos_examples=curr_train_data[curr_train_data.int_label != 0].shape[0]
      )

      val_data = train_data.iloc[val_index].copy()

      yield mini_batches, steps_per_epoch, val_data, test_data

  def simple_cv_load(self, full_df):
    full_df = self._compute_int_labels(full_df)

    train_data, test_data = self.get_outer_fold(df=full_df)
    if test_data.shape[0] == 0:
      test_data = train_data.iloc[:500]

    mini_batches = self.get_balanced_dataset(train_data)
    steps_per_epoch = self.get_steps_per_epoch(
      nb_pos_examples=train_data[train_data.int_label != 0].shape[0]
    )

    return mini_batches, test_data, steps_per_epoch

  def no_cv_load(self, full_df):
    full_df = self._compute_int_labels(full_df)

    val_test = full_df[full_df.origin == "precision"].copy(deep=True)
    val_data, test_data = self.get_outer_fold(df=val_test)

    train_data = full_df.drop(full_df[full_df.origin == "precision"].index, axis=0)
    if test_data.shape[0] == 0:
      test_data = train_data.iloc[:500]

    mini_batches = self.get_balanced_dataset(train_data)
    if train_data.int_label.nunique() == 1:
      raise ValueError("Should be at least two labels")

    num_examples = train_data[train_data.int_label == 1].shape[0]
    if train_data.int_label.nunique() > 2:
      second_most_frequent_label = train_data.loc[train_data.int_label != 0, "int_label"].mode().values[0]
      num_examples = train_data[train_data.int_label == second_most_frequent_label].shape[0] * 2
    steps_per_epoch = self.get_steps_per_epoch(nb_pos_examples=num_examples)

    return mini_batches, steps_per_epoch, val_data, test_data
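The core of get_balanced_dataset is the per-label resampling: each label gets its own infinitely repeating tf.data pipeline, and sample_from_datasets draws from them with fixed weights so that minibatches contain roughly perc_training_tox positive examples regardless of the raw class imbalance. A standalone sketch of that mechanism on toy data (illustrative only; requires tensorflow):

# Standalone sketch of the class-balancing used above (toy data, not part of the file).
import tensorflow as tf

perc_training_tox = 0.25
# Positives are ~6% of the raw data, but each label dataset repeats forever.
neg = tf.data.Dataset.from_tensor_slices((["ok tweet"] * 80, [0] * 80)).shuffle(80, seed=0).repeat()
pos = tf.data.Dataset.from_tensor_slices((["toxic tweet"] * 5, [1] * 5)).shuffle(5, seed=0).repeat()

balanced = tf.data.experimental.sample_from_datasets(
  [neg, pos], weights=[1 - perc_training_tox, perc_training_tox], seed=0
).batch(16, drop_remainder=True)

for _texts, labels in balanced.take(1):
  # Roughly 25% of the labels in each batch are 1, matching perc_training_tox.
  print(labels.numpy())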