Twitter Recommendation Algorithm

Please note we have force-pushed a new initial commit in order to remove some publicly available Twitter user information. Note that this process may need to be repeated in the future.
commit ef4c5eb65e
Author: twitter-team
Date:   2023-03-31 17:36:31 -05:00

5364 changed files with 460239 additions and 0 deletions

@@ -0,0 +1,118 @@
from abc import ABC
import re
from toxicity_ml_pipeline.settings.hcomp_settings import TOXIC_35
import numpy as np
TOXIC_35_set = set(TOXIC_35)
url_group = r"(\bhttps?:\/\/\S+)"
mention_group = r"(\B@\S+)"
urls_mentions_re = re.compile(url_group + r"|" + mention_group, re.IGNORECASE)
url_re = re.compile(url_group, re.IGNORECASE)
mention_re = re.compile(mention_group, re.IGNORECASE)
newline_re = re.compile(r"\n+", re.IGNORECASE)
and_re = re.compile(r"&\s?amp\s?;", re.IGNORECASE)
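# Illustrative behaviour of these patterns (made-up examples):
#   url_re.sub("URL", "check https://t.co/abc out")  -> "check URL out"
#   mention_re.sub("MENTION", "thanks @user")        -> "thanks MENTION"
#   and_re.sub("&", "tom &amp; jerry")               -> "tom & jerry"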
class DataframeCleaner(ABC):
def __init__(self):
pass
def _clean(self, df):
return df
def _systematic_preprocessing(self, df):
df.reset_index(inplace=True, drop=True)
if "media_url" in df.columns:
print(".... removing tweets with media")
df.drop(df[~df.media_url.isna()].index, inplace=True, axis=0)
else:
print("WARNING you are not removing tweets with media to train a BERT model.")
print(".... deleting duplicates")
df.drop_duplicates("text", inplace=True, keep="last")
print(f"Got {df.shape[0]} after cleaning")
return df.reset_index(inplace=False, drop=True)
def _postprocess(self, df, *args, **kwargs):
return df
def __call__(self, df, *args, **kwargs):
print(f"Got {df.shape[0]} before cleaning")
df["raw_text"] = df.text
df = self._clean(df)
df = self._systematic_preprocessing(df)
return self._postprocess(df, *args, **kwargs)
def mapping_func(el):
if el.aggregated_content in TOXIC_35_set:
return 2
if el.label == 1:
return 1
return 0
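# mapping_func assigns each annotated row to one of three classes: 2 when the v3.5
# aggregated annotation is in the TOXIC_35 set, 1 when the binary v3 label is 1,
# and 0 otherwise.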
class DefaultENNoPreprocessor(DataframeCleaner):
def _postprocess(self, df, *args, **kwargs):
if "toxic_count" in df.columns and "non_toxic_count" in df.columns:
df["vote"] = df.toxic_count / (df.toxic_count + df.non_toxic_count)
df["agreement_rate"] = np.max((df.vote, 1 - df.vote), axis=0)
if "label_column" in kwargs and kwargs["label_column"] != "label":
if kwargs["label_column"] == "aggregated_content":
print("Replacing v3 label by v3.5 label.")
if "num_classes" in kwargs and kwargs["num_classes"] < 3:
df["label"] = np.where(df.aggregated_content.isin(TOXIC_35_set), 1, 0)
elif "num_classes" in kwargs and kwargs["num_classes"] == 3:
print("Making it a 3-class pb")
df["label"] = df.apply(mapping_func, axis=1)
else:
raise NotImplementedError
elif kwargs['label_column'] in df.columns:
df['label'] = df[kwargs['label_column']]
if kwargs['class_weight'] is not None:
df["class_weight"] = np.where(df['label'] == 1, 1-kwargs['class_weight'],
kwargs['class_weight'])
else:
raise NotImplementedError
if "filter_low_agreements" in kwargs and kwargs["filter_low_agreements"] == True:
df.drop(df[(df.agreement_rate <= 0.6)].index, axis=0, inplace=True)
raise NotImplementedError
return df
class DefaultENPreprocessor(DefaultENNoPreprocessor):
def _clean(self, adhoc_df):
print(
".... removing \\n and replacing @mentions and URLs by placeholders. "
"Emoji filtering is not done."
)
adhoc_df["text"] = [url_re.sub("URL", tweet) for tweet in adhoc_df.raw_text.values]
adhoc_df["text"] = [mention_re.sub("MENTION", tweet) for tweet in adhoc_df.text.values]
adhoc_df["text"] = [
newline_re.sub(" ", tweet).lstrip(" ").rstrip(" ") for tweet in adhoc_df.text.values
]
adhoc_df["text"] = [and_re.sub("&", tweet) for tweet in adhoc_df.text.values]
return adhoc_df
class Defaulti18nPreprocessor(DataframeCleaner):
def _clean(self, adhoc_df):
print(".... removing @mentions, \\n and URLs. Emoji filtering is not done.")
adhoc_df["text"] = [urls_mentions_re.sub("", tweet) for tweet in adhoc_df.raw_text.values]
adhoc_df["text"] = [
newline_re.sub(" ", tweet).lstrip(" ").rstrip(" ") for tweet in adhoc_df.text.values
]
return adhoc_df
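
For illustration, a minimal sketch of how one of these cleaners might be invoked on a toy dataframe; the module path, column names, and keyword arguments below are assumptions inferred from the code above rather than anything this commit confirms.

# Hypothetical usage sketch; module path, columns and kwargs are assumed, not confirmed.
import pandas as pd

from toxicity_ml_pipeline.data.data_preprocessing import DefaultENPreprocessor  # assumed path

raw_df = pd.DataFrame(
    {
        "text": ["thanks @user, see https://t.co/abc &amp; reply\n\nplease", "another tweet"],
        "label": [1, 0],
        "media_url": [None, None],
    }
)

cleaner = DefaultENPreprocessor()
# label_column/class_weight/num_classes/filter_low_agreements mirror the kwargs read in _postprocess.
clean_df = cleaner(
    raw_df, label_column="label", class_weight=None, num_classes=2, filter_low_agreements=False
)
print(clean_df[["raw_text", "text", "label"]])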

@@ -0,0 +1,348 @@
from abc import ABC, abstractmethod
from datetime import date
from importlib import import_module
import pickle
from toxicity_ml_pipeline.settings.default_settings_tox import (
CLIENT,
EXISTING_TASK_VERSIONS,
GCS_ADDRESS,
TRAINING_DATA_LOCATION,
)
from toxicity_ml_pipeline.utils.helpers import execute_command, execute_query
from toxicity_ml_pipeline.utils.queries import (
FULL_QUERY,
FULL_QUERY_W_TWEET_TYPES,
PARSER_UDF,
QUERY_SETTINGS,
)
import numpy as np
import pandas
class DataframeLoader(ABC):
def __init__(self, project):
self.project = project
@abstractmethod
def produce_query(self):
pass
@abstractmethod
def load_data(self, test=False):
pass
class ENLoader(DataframeLoader):
def __init__(self, project, setting_file):
super(ENLoader, self).__init__(project=project)
self.date_begin = setting_file.DATE_BEGIN
self.date_end = setting_file.DATE_END
TASK_VERSION = setting_file.TASK_VERSION
if TASK_VERSION not in EXISTING_TASK_VERSIONS:
raise ValueError
self.task_version = TASK_VERSION
self.query_settings = dict(QUERY_SETTINGS)
self.full_query = FULL_QUERY
def produce_query(self, date_begin, date_end, task_version=None, **keys):
task_version = self.task_version if task_version is None else task_version
if task_version in keys["table"]:
table_name = keys["table"][task_version]
print(f"Loading {table_name}")
main_query = keys["main"].format(
table=table_name,
parser_udf=PARSER_UDF[task_version],
date_begin=date_begin,
date_end=date_end,
)
return self.full_query.format(
main_table_query=main_query, date_begin=date_begin, date_end=date_end
)
return ""
def _reload(self, test, file_keyword):
query = f"SELECT * from `{TRAINING_DATA_LOCATION.format(project=self.project)}_{file_keyword}`"
if test:
query += " ORDER BY RAND() LIMIT 1000"
try:
df = execute_query(client=CLIENT, query=query)
except Exception:
print(
"Loading from BQ failed, trying to load from GCS. "
"NB: use this option only for intermediate files, which will be deleted at the end of "
"the project."
)
copy_cmd = f"gsutil cp {GCS_ADDRESS.format(project=self.project)}/training_data/{file_keyword}.pkl ."
execute_command(copy_cmd)
try:
with open(f"{file_keyword}.pkl", "rb") as file:
df = pickle.load(file)
except Exception:
return None
if test:
df = df.sample(frac=1)
return df.iloc[:1000]
return df
def load_data(self, test=False, **kwargs):
if "reload" in kwargs and kwargs["reload"]:
df = self._reload(test, kwargs["reload"])
if df is not None and df.shape[0] > 0:
return df
df = None
query_settings = self.query_settings
if test:
query_settings = {"fairness": self.query_settings["fairness"]}
query_settings["fairness"]["main"] += " LIMIT 500"
for table, query_info in query_settings.items():
curr_query = self.produce_query(
date_begin=self.date_begin, date_end=self.date_end, **query_info
)
if curr_query == "":
continue
curr_df = execute_query(client=CLIENT, query=curr_query)
curr_df["origin"] = table
df = curr_df if df is None else pandas.concat((df, curr_df))
df["loading_date"] = date.today()
df["date"] = pandas.to_datetime(df.date)
return df
def load_precision_set(
self, begin_date="...", end_date="...", with_tweet_types=False, task_version=3.5
):
if with_tweet_types:
self.full_query = FULL_QUERY_W_TWEET_TYPES
query_settings = self.query_settings
curr_query = self.produce_query(
date_begin=begin_date,
date_end=end_date,
task_version=task_version,
**query_settings["precision"],
)
curr_df = execute_query(client=CLIENT, query=curr_query)
curr_df.rename(columns={"media_url": "media_presence"}, inplace=True)
return curr_df
class ENLoaderWithSampling(ENLoader):
keywords = {
"politics": [
...
],
"insults": [
...
],
"race": [
...
],
}
n = ...
N = ...
def __init__(self, project):
self.raw_loader = ENLoader(project=project)
if project == ...:
self.project = project
else:
raise ValueError
def sample_with_weights(self, df, n):
w = df["label"].value_counts(normalize=True)[1]
dist = np.full((df.shape[0],), w)
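# NB: every row receives the same weight w, so this weighted draw is effectively uniform.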
sampled_df = df.sample(n=n, weights=dist, replace=False)
return sampled_df
def sample_keywords(self, df, N, group):
print("\nmatching", group, "keywords...")
keyword_list = self.keywords[group]
match_df = df.loc[df.text.str.lower().str.contains("|".join(keyword_list), regex=True)]
print("sampling N/3 from", group)
if match_df.shape[0] <= N / 3:
print(
"WARNING: Sampling only",
match_df.shape[0],
"instead of",
N / 3,
"examples from race focused tweets due to insufficient data",
)
sample_df = match_df
else:
print(
"sampling",
group,
"at",
round(match_df["label"].value_counts(normalize=True)[1], 3),
"% action rate",
)
sample_df = self.sample_with_weights(match_df, int(N / 3))
print(sample_df.shape)
print(sample_df.label.value_counts(normalize=True))
print("\nshape of df before dropping sampled rows after", group, "matching..", df.shape[0])
df = df.loc[
df.index.difference(sample_df.index),
]
print("\nshape of df after dropping sampled rows after", group, "matching..", df.shape[0])
return df, sample_df
def sample_first_set_helper(self, train_df, first_set, new_n):
if first_set == "prev":
fset = train_df.loc[train_df["origin"].isin(["prevalence", "causal prevalence"])]
print(
"sampling prev at", round(fset["label"].value_counts(normalize=True)[1], 3), "% action rate"
)
else:
fset = train_df
n_fset = self.sample_with_weights(fset, new_n)
print("len of sampled first set", n_fset.shape[0])
print(n_fset.label.value_counts(normalize=True))
return n_fset
def sample(self, df, first_set, second_set, keyword_sampling, n, N):
train_df = df[df.origin != "precision"]
val_test_df = df[df.origin == "precision"]
print("\nsampling first set of data")
new_n = n - N if second_set is not None else n
n_fset = self.sample_first_set_helper(train_df, first_set, new_n)
print("\nsampling second set of data")
train_df = train_df.loc[
train_df.index.difference(n_fset.index),
]
if second_set is None:
print("no second set sampling being done")
df = n_fset.append(val_test_df)
return df
if second_set == "prev":
sset = train_df.loc[train_df["origin"].isin(["prevalence", "causal prevalence"])]
elif second_set == "fdr":
sset = train_df.loc[train_df["origin"] == "fdr"]
else:
sset = train_df
if keyword_sampling == True:
print("sampling based off of keywords defined...")
print("second set is", second_set, "with length", sset.shape[0])
sset, n_politics = self.sample_keywords(sset, N, "politics")
sset, n_insults = self.sample_keywords(sset, N, "insults")
sset, n_race = self.sample_keywords(sset, N, "race")
n_sset = n_politics.append([n_insults, n_race])
print("len of sampled second set", n_sset.shape[0])
else:
print(
"No keyword sampling. Instead random sampling from",
second_set,
"at",
round(sset["label"].value_counts(normalize=True)[1], 3),
"% action rate",
)
n_sset = self.sample_with_weights(sset, N)
print("len of sampled second set", n_sset.shape[0])
print(n_sset.label.value_counts(normalize=True))
df = n_fset.append([n_sset, val_test_df])
df = df.sample(frac=1).reset_index(drop=True)
return df
def load_data(
self, first_set="prev", second_set=None, keyword_sampling=False, test=False, **kwargs
):
n = kwargs.get("n", self.n)
N = kwargs.get("N", self.N)
df = self.raw_loader.load_data(test=test, **kwargs)
return self.sample(df, first_set, second_set, keyword_sampling, n, N)
class I18nLoader(DataframeLoader):
def __init__(self):
super().__init__(project=...)
from archive.settings.... import ACCEPTED_LANGUAGES, QUERY_SETTINGS
self.accepted_languages = ACCEPTED_LANGUAGES
self.query_settings = dict(QUERY_SETTINGS)
def produce_query(self, language, query, dataset, table, lang):
query = query.format(dataset=dataset, table=table)
add_query = f"AND reviewed.{lang}='{language}'"
query += add_query
return query
def query_keys(self, language, task=2, size="50"):
if task == 2:
if language == "ar":
self.query_settings["adhoc_v2"]["table"] = "..."
elif language == "tr":
self.query_settings["adhoc_v2"]["table"] = "..."
elif language == "es":
self.query_settings["adhoc_v2"]["table"] = f"..."
else:
self.query_settings["adhoc_v2"]["table"] = "..."
return self.query_settings["adhoc_v2"]
if task == 3:
return self.query_settings["adhoc_v3"]
raise ValueError(f"There are no other tasks than 2 or 3. {task} does not exist.")
def load_data(self, language, test=False, task=2):
if language not in self.accepted_languages:
raise ValueError(
f"Language not in the data {language}. Accepted values are " f"{self.accepted_languages}"
)
print(".... adhoc data")
key_dict = self.query_keys(language=language, task=task)
query_adhoc = self.produce_query(language=language, **key_dict)
if test:
query_adhoc += " LIMIT 500"
adhoc_df = execute_query(CLIENT, query_adhoc)
if not (test or language == "tr" or task == 3):
if language == "es":
print(".... additional adhoc data")
key_dict = self.query_keys(language=language, size="100")
query_adhoc = self.produce_query(language=language, **key_dict)
adhoc_df = pandas.concat(
(adhoc_df, execute_query(CLIENT, query_adhoc)), axis=0, ignore_index=True
)
print(".... prevalence data")
query_prev = self.produce_query(language=language, **self.query_settings["prevalence_v2"])
prev_df = execute_query(CLIENT, query_prev)
prev_df["description"] = "Prevalence"
adhoc_df = pandas.concat((adhoc_df, prev_df), axis=0, ignore_index=True)
return self.clean(adhoc_df)
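
For illustration, a hypothetical driver for ENLoader; the project id, settings module, dates, and module path below are placeholders, not values taken from this commit.

# Hypothetical driver for ENLoader; project id, settings module, dates and path are placeholders.
from importlib import import_module

from toxicity_ml_pipeline.data.dataframe_loading import ENLoader  # assumed module path

setting_file = import_module("toxicity_ml_pipeline.settings.tox_example_settings")  # hypothetical
loader = ENLoader(project="example-gcp-project", setting_file=setting_file)

train_df = loader.load_data(test=True)  # test=True keeps only the fairness query, with LIMIT 500
precision_df = loader.load_precision_set(begin_date="2023-01-01", end_date="2023-01-31")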

@@ -0,0 +1,284 @@
from importlib import import_module
import os
from toxicity_ml_pipeline.settings.default_settings_tox import (
INNER_CV,
LOCAL_DIR,
MAX_SEQ_LENGTH,
NUM_PREFETCH,
NUM_WORKERS,
OUTER_CV,
TARGET_POS_PER_EPOCH,
)
from toxicity_ml_pipeline.utils.helpers import execute_command
import numpy as np
import pandas
from sklearn.model_selection import StratifiedKFold
import tensorflow as tf
try:
from transformers import AutoTokenizer, DataCollatorWithPadding
except ModuleNotFoundError:
print("...")
else:
from datasets import Dataset
class BalancedMiniBatchLoader(object):
def __init__(
self,
fold,
mb_size,
seed,
perc_training_tox,
scope="TOX",
project=...,
dual_head=None,
n_outer_splits=None,
n_inner_splits=None,
sample_weights=None,
huggingface=False,
):
if 0 >= perc_training_tox or perc_training_tox > 0.5:
raise ValueError("Perc_training_tox should be in ]0; 0.5]")
self.perc_training_tox = perc_training_tox
if not n_outer_splits:
n_outer_splits = OUTER_CV
if isinstance(n_outer_splits, int):
self.n_outer_splits = n_outer_splits
self.get_outer_fold = self._get_outer_cv_fold
if fold < 0 or fold >= self.n_outer_splits or int(fold) != fold:
raise ValueError(f"Number of fold should be an integer in [0 ; {self.n_outer_splits} [.")
elif n_outer_splits == "time":
self.get_outer_fold = self._get_time_fold
if fold != "time":
raise ValueError(
"To avoid repeating the same run many times, the external fold"
"should be time when test data is split according to dates."
)
try:
setting_file = import_module(f"toxicity_ml_pipeline.settings.{scope.lower()}{project}_settings")
except ModuleNotFoundError:
raise ValueError(f"You need to define a setting file for your project {project}.")
self.test_begin_date = setting_file.TEST_BEGIN_DATE
self.test_end_date = setting_file.TEST_END_DATE
else:
raise ValueError(
f"Argument n_outer_splits should either an integer or 'time'. Provided: {n_outer_splits}"
)
self.n_inner_splits = n_inner_splits if n_inner_splits is not None else INNER_CV
self.seed = seed
self.mb_size = mb_size
self.fold = fold
self.sample_weights = sample_weights
self.dual_head = dual_head
self.huggingface = huggingface
if self.huggingface:
self._load_tokenizer()
def _load_tokenizer(self):
print("Making a local copy of Bertweet-base model")
local_model_dir = os.path.join(LOCAL_DIR, "models")
cmd = f"mkdir {local_model_dir} ; gsutil -m cp -r gs://... {local_model_dir}"
execute_command(cmd)
self.tokenizer = AutoTokenizer.from_pretrained(
os.path.join(local_model_dir, "bertweet-base"), normalization=True
)
def tokenize_function(self, el):
return self.tokenizer(
el["text"],
max_length=MAX_SEQ_LENGTH,
padding="max_length",
truncation=True,
add_special_tokens=True,
return_token_type_ids=False,
return_attention_mask=False,
)
def _get_stratified_kfold(self, n_splits):
return StratifiedKFold(shuffle=True, n_splits=n_splits, random_state=self.seed)
def _get_time_fold(self, df):
test_begin_date = pandas.to_datetime(self.test_begin_date).date()
test_end_date = pandas.to_datetime(self.test_end_date).date()
print(f"Test is going from {test_begin_date} to {test_end_date}.")
test_data = df.query("@test_begin_date <= date <= @test_end_date")
query = "date < @test_begin_date"
other_set = df.query(query)
return other_set, test_data
def _get_outer_cv_fold(self, df):
labels = df.int_label
stratifier = self._get_stratified_kfold(n_splits=self.n_outer_splits)
k = 0
for train_index, test_index in stratifier.split(np.zeros(len(labels)), labels):
if k == self.fold:
break
k += 1
train_data = df.iloc[train_index].copy()
test_data = df.iloc[test_index].copy()
return train_data, test_data
def get_steps_per_epoch(self, nb_pos_examples):
return int(max(TARGET_POS_PER_EPOCH, nb_pos_examples) / self.mb_size / self.perc_training_tox)
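# Worked example with illustrative numbers (not the real settings): TARGET_POS_PER_EPOCH=5000,
# mb_size=32, perc_training_tox=0.5 gives int(5000 / 32 / 0.5) = 312 steps, i.e. roughly
# 312 * 32 * 0.5 ≈ 5000 positive examples seen per epoch.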
def make_huggingface_tensorflow_ds(self, group, mb_size=None, shuffle=True):
huggingface_ds = Dataset.from_pandas(group).map(self.tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer, return_tensors="tf")
tensorflow_ds = huggingface_ds.to_tf_dataset(
columns=["input_ids"],
label_cols=["labels"],
shuffle=shuffle,
batch_size=self.mb_size if mb_size is None else mb_size,
collate_fn=data_collator,
)
if shuffle:
return tensorflow_ds.repeat()
return tensorflow_ds
def make_pure_tensorflow_ds(self, df, nb_samples):
buffer_size = nb_samples * 2
if self.sample_weights is not None:
if self.sample_weights not in df.columns:
raise ValueError
ds = tf.data.Dataset.from_tensor_slices(
(df.text.values, df.label.values, df[self.sample_weights].values)
)
elif self.dual_head:
label_d = {f'{e}_output': df[f'{e}_label'].values for e in self.dual_head}
label_d['content_output'] = tf.keras.utils.to_categorical(label_d['content_output'], num_classes=3)
ds = tf.data.Dataset.from_tensor_slices((df.text.values, label_d))
else:
ds = tf.data.Dataset.from_tensor_slices((df.text.values, df.label.values))
ds = ds.shuffle(buffer_size, seed=self.seed, reshuffle_each_iteration=True).repeat()
return ds
def get_balanced_dataset(self, training_data, size_limit=None, return_as_batch=True):
training_data = training_data.sample(frac=1, random_state=self.seed)
nb_samples = training_data.shape[0] if not size_limit else size_limit
num_classes = training_data.int_label.nunique()
toxic_class = training_data.int_label.max()
if size_limit:
training_data = training_data[: size_limit * num_classes]
print(
".... {} examples, incl. {:.2f}% tox in train, {} classes".format(
nb_samples,
100 * training_data[training_data.int_label == toxic_class].shape[0] / nb_samples,
num_classes,
)
)
label_groups = training_data.groupby("int_label")
if self.huggingface:
label_datasets = {
label: self.make_huggingface_tensorflow_ds(group) for label, group in label_groups
}
else:
label_datasets = {
label: self.make_pure_tensorflow_ds(group, nb_samples=nb_samples * 2)
for label, group in label_groups
}
datasets = [label_datasets[0], label_datasets[1]]
weights = [1 - self.perc_training_tox, self.perc_training_tox]
if num_classes == 3:
datasets.append(label_datasets[2])
weights = [1 - self.perc_training_tox, self.perc_training_tox / 2, self.perc_training_tox / 2]
elif num_classes != 2:
raise ValueError("Currently it should not be possible to get other than 2 or 3 classes")
resampled_ds = tf.data.experimental.sample_from_datasets(datasets, weights, seed=self.seed)
if return_as_batch and not self.huggingface:
return resampled_ds.batch(
self.mb_size, drop_remainder=True, num_parallel_calls=NUM_WORKERS, deterministic=True
).prefetch(NUM_PREFETCH)
return resampled_ds
@staticmethod
def _compute_int_labels(full_df):
if full_df.label.dtype == int:
full_df["int_label"] = full_df.label
elif "int_label" not in full_df.columns:
if full_df.label.max() > 1:
raise ValueError("Binarizing labels that should not be.")
full_df["int_label"] = np.where(full_df.label >= 0.5, 1, 0)
return full_df
def __call__(self, full_df, *args, **kwargs):
full_df = self._compute_int_labels(full_df)
train_data, test_data = self.get_outer_fold(df=full_df)
stratifier = self._get_stratified_kfold(n_splits=self.n_inner_splits)
for train_index, val_index in stratifier.split(
np.zeros(train_data.shape[0]), train_data.int_label
):
curr_train_data = train_data.iloc[train_index]
mini_batches = self.get_balanced_dataset(curr_train_data)
steps_per_epoch = self.get_steps_per_epoch(
nb_pos_examples=curr_train_data[curr_train_data.int_label != 0].shape[0]
)
val_data = train_data.iloc[val_index].copy()
yield mini_batches, steps_per_epoch, val_data, test_data
def simple_cv_load(self, full_df):
full_df = self._compute_int_labels(full_df)
train_data, test_data = self.get_outer_fold(df=full_df)
if test_data.shape[0] == 0:
test_data = train_data.iloc[:500]
mini_batches = self.get_balanced_dataset(train_data)
steps_per_epoch = self.get_steps_per_epoch(
nb_pos_examples=train_data[train_data.int_label != 0].shape[0]
)
return mini_batches, test_data, steps_per_epoch
def no_cv_load(self, full_df):
full_df = self._compute_int_labels(full_df)
val_test = full_df[full_df.origin == "precision"].copy(deep=True)
val_data, test_data = self.get_outer_fold(df=val_test)
train_data = full_df.drop(full_df[full_df.origin == "precision"].index, axis=0)
if test_data.shape[0] == 0:
test_data = train_data.iloc[:500]
mini_batches = self.get_balanced_dataset(train_data)
if train_data.int_label.nunique() == 1:
raise ValueError('There should be at least two distinct labels.')
num_examples = train_data[train_data.int_label == 1].shape[0]
if train_data.int_label.nunique() > 2:
second_most_frequent_label = train_data.loc[train_data.int_label != 0, 'int_label'].mode().values[0]
num_examples = train_data[train_data.int_label == second_most_frequent_label].shape[0] * 2
steps_per_epoch = self.get_steps_per_epoch(nb_pos_examples=num_examples)
return mini_batches, steps_per_epoch, val_data, test_data
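
For illustration, a hypothetical cross-validation driver for BalancedMiniBatchLoader; the synthetic dataframe, hyper-parameters, and module path below are placeholders, and the sketch assumes the toxicity_ml_pipeline package and its settings are importable.

# Hypothetical CV driver for BalancedMiniBatchLoader; all values below are placeholders.
import numpy as np
import pandas as pd

from toxicity_ml_pipeline.data.mb_generator import BalancedMiniBatchLoader  # assumed module path

rng = np.random.default_rng(0)
full_df = pd.DataFrame(
    {
        "text": [f"example tweet {i}" for i in range(200)],
        "label": rng.integers(0, 2, size=200),
    }
)

loader = BalancedMiniBatchLoader(
    fold=0, mb_size=32, seed=42, perc_training_tox=0.5, n_outer_splits=5, n_inner_splits=5
)
for mini_batches, steps_per_epoch, val_data, test_data in loader(full_df):
    # model.fit(mini_batches, steps_per_epoch=steps_per_epoch, validation_data=...) would go here.
    break  # a single inner fold is enough for this sketch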