Mirror of https://github.com/twitter/the-algorithm.git (synced 2025-06-10 14:48:16 -05:00)
Twitter Recommendation Algorithm
Please note we have force-pushed a new initial commit in order to remove some publicly-available Twitter user information. Note that this process may be required in the future.
trust_and_safety_models/toxicity/__init__.py (new file, 0 lines)
trust_and_safety_models/toxicity/data/__init__.py (new file, 0 lines)
trust_and_safety_models/toxicity/data/data_preprocessing.py (new file, 118 lines)
@@ -0,0 +1,118 @@
from abc import ABC
import re

from toxicity_ml_pipeline.settings.hcomp_settings import TOXIC_35

import numpy as np


TOXIC_35_set = set(TOXIC_35)

url_group = r"(\bhttps?:\/\/\S+)"
mention_group = r"(\B@\S+)"
urls_mentions_re = re.compile(url_group + r"|" + mention_group, re.IGNORECASE)
url_re = re.compile(url_group, re.IGNORECASE)
mention_re = re.compile(mention_group, re.IGNORECASE)
newline_re = re.compile(r"\n+", re.IGNORECASE)
and_re = re.compile(r"&\s?amp\s?;", re.IGNORECASE)


class DataframeCleaner(ABC):
  def __init__(self):
    pass

  def _clean(self, df):
    return df

  def _systematic_preprocessing(self, df):
    df.reset_index(inplace=True, drop=True)
    if "media_url" in df.columns:
      print(".... removing tweets with media")
      df.drop(df[~df.media_url.isna()].index, inplace=True, axis=0)
    else:
      print("WARNING you are not removing tweets with media to train a BERT model.")

    print(".... deleting duplicates")
    df.drop_duplicates("text", inplace=True, keep="last")
    print(f"Got {df.shape[0]} after cleaning")

    return df.reset_index(inplace=False, drop=True)

  def _postprocess(self, df, *args, **kwargs):
    return df

  def __call__(self, df, *args, **kwargs):
    print(f"Got {df.shape[0]} before cleaning")

    df["raw_text"] = df.text
    df = self._clean(df)

    df = self._systematic_preprocessing(df)

    return self._postprocess(df, *args, **kwargs)


def mapping_func(el):
  if el.aggregated_content in TOXIC_35_set:
    return 2
  if el.label == 1:
    return 1
  return 0


class DefaultENNoPreprocessor(DataframeCleaner):
  def _postprocess(self, df, *args, **kwargs):
    if "toxic_count" in df.columns and "non_toxic_count" in df.columns:
      df["vote"] = df.toxic_count / (df.toxic_count + df.non_toxic_count)
      df["agreement_rate"] = np.max((df.vote, 1 - df.vote), axis=0)

    if "label_column" in kwargs and kwargs["label_column"] != "label":
      if kwargs["label_column"] == "aggregated_content":
        print("Replacing v3 label by v3.5 label.")
        if "num_classes" in kwargs and kwargs["num_classes"] < 3:
          df["label"] = np.where(df.aggregated_content.isin(TOXIC_35_set), 1, 0)
        elif "num_classes" in kwargs and kwargs["num_classes"] == 3:
          print("Making it a 3-class pb")
          df["label"] = df.apply(mapping_func, axis=1)
        else:
          raise NotImplementedError
      elif kwargs['label_column'] in df.columns:
        df['label'] = df[kwargs['label_column']]
        if kwargs['class_weight'] is not None:
          df["class_weight"] = np.where(df['label'] == 1, 1-kwargs['class_weight'],
                                        kwargs['class_weight'])
      else:
        raise NotImplementedError

    if "filter_low_agreements" in kwargs and kwargs["filter_low_agreements"] == True:
      df.drop(df[(df.agreement_rate <= 0.6)].index, axis=0, inplace=True)
      raise NotImplementedError

    return df


class DefaultENPreprocessor(DefaultENNoPreprocessor):
  def _clean(self, adhoc_df):
    print(
      ".... removing \\n and replacing @mentions and URLs by placeholders. "
      "Emoji filtering is not done."
    )
    adhoc_df["text"] = [url_re.sub("URL", tweet) for tweet in adhoc_df.raw_text.values]
    adhoc_df["text"] = [mention_re.sub("MENTION", tweet) for tweet in adhoc_df.text.values]
    adhoc_df["text"] = [
      newline_re.sub(" ", tweet).lstrip(" ").rstrip(" ") for tweet in adhoc_df.text.values
    ]
    adhoc_df["text"] = [and_re.sub("&", tweet) for tweet in adhoc_df.text.values]

    return adhoc_df


class Defaulti18nPreprocessor(DataframeCleaner):
  def _clean(self, adhoc_df):
    print(".... removing @mentions, \\n and URLs. Emoji filtering is not done.")
    adhoc_df["text"] = [urls_mentions_re.sub("", tweet) for tweet in adhoc_df.raw_text.values]
    adhoc_df["text"] = [
      newline_re.sub(" ", tweet).lstrip(" ").rstrip(" ") for tweet in adhoc_df.text.values
    ]

    return adhoc_df
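A minimal usage sketch of the English preprocessor above (the import path is the one used by train.py below; the toy DataFrame is hypothetical, only a "text" column plus the optional columns checked in _systematic_preprocessing and _postprocess are expected):

  import pandas as pd

  from toxicity_ml_pipeline.data.data_preprocessing import DefaultENPreprocessor

  # Hypothetical toy data; in the pipeline the DataFrame comes from the BigQuery loaders.
  df = pd.DataFrame({
    "text": ["hey @user see https://t.co/abc\nplease", "hey @user see https://t.co/abc\nplease", "a plain tweet"],
    "label": [1, 1, 0],
  })

  cleaner = DefaultENPreprocessor()
  clean_df = cleaner(df)
  # URLs become "URL", @mentions become "MENTION", newlines are collapsed, and exact
  # duplicates on the cleaned "text" are dropped (keeping the last occurrence).
  print(clean_df[["raw_text", "text", "label"]])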
trust_and_safety_models/toxicity/data/dataframe_loader.py (new file, 348 lines)
@@ -0,0 +1,348 @@
from abc import ABC, abstractmethod
from datetime import date
from importlib import import_module
import pickle

from toxicity_ml_pipeline.settings.default_settings_tox import (
  CLIENT,
  EXISTING_TASK_VERSIONS,
  GCS_ADDRESS,
  TRAINING_DATA_LOCATION,
)
from toxicity_ml_pipeline.utils.helpers import execute_command, execute_query
from toxicity_ml_pipeline.utils.queries import (
  FULL_QUERY,
  FULL_QUERY_W_TWEET_TYPES,
  PARSER_UDF,
  QUERY_SETTINGS,
)

import numpy as np
import pandas


class DataframeLoader(ABC):

  def __init__(self, project):
    self.project = project

  @abstractmethod
  def produce_query(self):
    pass

  @abstractmethod
  def load_data(self, test=False):
    pass


class ENLoader(DataframeLoader):
  def __init__(self, project, setting_file):
    super(ENLoader, self).__init__(project=project)
    self.date_begin = setting_file.DATE_BEGIN
    self.date_end = setting_file.DATE_END
    TASK_VERSION = setting_file.TASK_VERSION
    if TASK_VERSION not in EXISTING_TASK_VERSIONS:
      raise ValueError
    self.task_version = TASK_VERSION
    self.query_settings = dict(QUERY_SETTINGS)
    self.full_query = FULL_QUERY

  def produce_query(self, date_begin, date_end, task_version=None, **keys):
    task_version = self.task_version if task_version is None else task_version

    if task_version in keys["table"]:
      table_name = keys["table"][task_version]
      print(f"Loading {table_name}")

      main_query = keys["main"].format(
        table=table_name,
        parser_udf=PARSER_UDF[task_version],
        date_begin=date_begin,
        date_end=date_end,
      )

      return self.full_query.format(
        main_table_query=main_query, date_begin=date_begin, date_end=date_end
      )
    return ""

  def _reload(self, test, file_keyword):
    query = f"SELECT * from `{TRAINING_DATA_LOCATION.format(project=self.project)}_{file_keyword}`"

    if test:
      query += " ORDER BY RAND() LIMIT 1000"
    try:
      df = execute_query(client=CLIENT, query=query)
    except Exception:
      print(
        "Loading from BQ failed, trying to load from GCS. "
        "NB: use this option only for intermediate files, which will be deleted at the end of "
        "the project."
      )
      copy_cmd = f"gsutil cp {GCS_ADDRESS.format(project=self.project)}/training_data/{file_keyword}.pkl ."
      execute_command(copy_cmd)
      try:
        with open(f"{file_keyword}.pkl", "rb") as file:
          df = pickle.load(file)
      except Exception:
        return None

    if test:
      df = df.sample(frac=1)
      return df.iloc[:1000]

    return df

  def load_data(self, test=False, **kwargs):
    if "reload" in kwargs and kwargs["reload"]:
      df = self._reload(test, kwargs["reload"])
      if df is not None and df.shape[0] > 0:
        return df

    df = None
    query_settings = self.query_settings
    if test:
      query_settings = {"fairness": self.query_settings["fairness"]}
      query_settings["fairness"]["main"] += " LIMIT 500"

    for table, query_info in query_settings.items():
      curr_query = self.produce_query(
        date_begin=self.date_begin, date_end=self.date_end, **query_info
      )
      if curr_query == "":
        continue
      curr_df = execute_query(client=CLIENT, query=curr_query)
      curr_df["origin"] = table
      df = curr_df if df is None else pandas.concat((df, curr_df))

    df["loading_date"] = date.today()
    df["date"] = pandas.to_datetime(df.date)
    return df

  def load_precision_set(
    self, begin_date="...", end_date="...", with_tweet_types=False, task_version=3.5
  ):
    if with_tweet_types:
      self.full_query = FULL_QUERY_W_TWEET_TYPES

    query_settings = self.query_settings
    curr_query = self.produce_query(
      date_begin=begin_date,
      date_end=end_date,
      task_version=task_version,
      **query_settings["precision"],
    )
    curr_df = execute_query(client=CLIENT, query=curr_query)

    curr_df.rename(columns={"media_url": "media_presence"}, inplace=True)
    return curr_df


class ENLoaderWithSampling(ENLoader):

  keywords = {
    "politics": [
      ...
    ],
    "insults": [
      ...
    ],
    "race": [
      ...
    ],
  }
  n = ...
  N = ...

  def __init__(self, project):
    self.raw_loader = ENLoader(project=project)
    if project == ...:
      self.project = project
    else:
      raise ValueError

  def sample_with_weights(self, df, n):
    w = df["label"].value_counts(normalize=True)[1]
    dist = np.full((df.shape[0],), w)
    sampled_df = df.sample(n=n, weights=dist, replace=False)
    return sampled_df

  def sample_keywords(self, df, N, group):
    print("\nmatching", group, "keywords...")

    keyword_list = self.keywords[group]
    match_df = df.loc[df.text.str.lower().str.contains("|".join(keyword_list), regex=True)]

    print("sampling N/3 from", group)
    if match_df.shape[0] <= N / 3:
      print(
        "WARNING: Sampling only",
        match_df.shape[0],
        "instead of",
        N / 3,
        "examples from race focused tweets due to insufficient data",
      )
      sample_df = match_df

    else:
      print(
        "sampling",
        group,
        "at",
        round(match_df["label"].value_counts(normalize=True)[1], 3),
        "% action rate",
      )
      sample_df = self.sample_with_weights(match_df, int(N / 3))
      print(sample_df.shape)
      print(sample_df.label.value_counts(normalize=True))

    print("\nshape of df before dropping sampled rows after", group, "matching..", df.shape[0])
    df = df.loc[
      df.index.difference(sample_df.index),
    ]
    print("\nshape of df after dropping sampled rows after", group, "matching..", df.shape[0])

    return df, sample_df

  def sample_first_set_helper(self, train_df, first_set, new_n):
    if first_set == "prev":
      fset = train_df.loc[train_df["origin"].isin(["prevalence", "causal prevalence"])]
      print(
        "sampling prev at", round(fset["label"].value_counts(normalize=True)[1], 3), "% action rate"
      )
    else:
      fset = train_df

    n_fset = self.sample_with_weights(fset, new_n)
    print("len of sampled first set", n_fset.shape[0])
    print(n_fset.label.value_counts(normalize=True))

    return n_fset

  def sample(self, df, first_set, second_set, keyword_sampling, n, N):
    train_df = df[df.origin != "precision"]
    val_test_df = df[df.origin == "precision"]

    print("\nsampling first set of data")
    new_n = n - N if second_set is not None else n
    n_fset = self.sample_first_set_helper(train_df, first_set, new_n)

    print("\nsampling second set of data")
    train_df = train_df.loc[
      train_df.index.difference(n_fset.index),
    ]

    if second_set is None:
      print("no second set sampling being done")
      df = n_fset.append(val_test_df)
      return df

    if second_set == "prev":
      sset = train_df.loc[train_df["origin"].isin(["prevalence", "causal prevalence"])]

    elif second_set == "fdr":
      sset = train_df.loc[train_df["origin"] == "fdr"]

    else:
      sset = train_df

    if keyword_sampling == True:
      print("sampling based off of keywords defined...")
      print("second set is", second_set, "with length", sset.shape[0])

      sset, n_politics = self.sample_keywords(sset, N, "politics")
      sset, n_insults = self.sample_keywords(sset, N, "insults")
      sset, n_race = self.sample_keywords(sset, N, "race")

      n_sset = n_politics.append([n_insults, n_race])
      print("len of sampled second set", n_sset.shape[0])

    else:
      print(
        "No keyword sampling. Instead random sampling from",
        second_set,
        "at",
        round(sset["label"].value_counts(normalize=True)[1], 3),
        "% action rate",
      )
      n_sset = self.sample_with_weights(sset, N)
      print("len of sampled second set", n_sset.shape[0])
      print(n_sset.label.value_counts(normalize=True))

    df = n_fset.append([n_sset, val_test_df])
    df = df.sample(frac=1).reset_index(drop=True)

    return df

  def load_data(
    self, first_set="prev", second_set=None, keyword_sampling=False, test=False, **kwargs
  ):
    n = kwargs.get("n", self.n)
    N = kwargs.get("N", self.N)

    df = self.raw_loader.load_data(test=test, **kwargs)
    return self.sample(df, first_set, second_set, keyword_sampling, n, N)


class I18nLoader(DataframeLoader):
  def __init__(self):
    super().__init__(project=...)
    from archive.settings.... import ACCEPTED_LANGUAGES, QUERY_SETTINGS

    self.accepted_languages = ACCEPTED_LANGUAGES
    self.query_settings = dict(QUERY_SETTINGS)

  def produce_query(self, language, query, dataset, table, lang):
    query = query.format(dataset=dataset, table=table)
    add_query = f"AND reviewed.{lang}='{language}'"
    query += add_query

    return query

  def query_keys(self, language, task=2, size="50"):
    if task == 2:
      if language == "ar":
        self.query_settings["adhoc_v2"]["table"] = "..."
      elif language == "tr":
        self.query_settings["adhoc_v2"]["table"] = "..."
      elif language == "es":
        self.query_settings["adhoc_v2"]["table"] = f"..."
      else:
        self.query_settings["adhoc_v2"]["table"] = "..."

      return self.query_settings["adhoc_v2"]

    if task == 3:
      return self.query_settings["adhoc_v3"]

    raise ValueError(f"There are no other tasks than 2 or 3. {task} does not exist.")

  def load_data(self, language, test=False, task=2):
    if language not in self.accepted_languages:
      raise ValueError(
        f"Language not in the data {language}. Accepted values are " f"{self.accepted_languages}"
      )

    print(".... adhoc data")
    key_dict = self.query_keys(language=language, task=task)
    query_adhoc = self.produce_query(language=language, **key_dict)
    if test:
      query_adhoc += " LIMIT 500"
    adhoc_df = execute_query(CLIENT, query_adhoc)

    if not (test or language == "tr" or task == 3):
      if language == "es":
        print(".... additional adhoc data")
        key_dict = self.query_keys(language=language, size="100")
        query_adhoc = self.produce_query(language=language, **key_dict)
        adhoc_df = pandas.concat(
          (adhoc_df, execute_query(CLIENT, query_adhoc)), axis=0, ignore_index=True
        )

      print(".... prevalence data")
      query_prev = self.produce_query(language=language, **self.query_settings["prevalence_v2"])
      prev_df = execute_query(CLIENT, query_prev)
      prev_df["description"] = "Prevalence"
      adhoc_df = pandas.concat((adhoc_df, prev_df), axis=0, ignore_index=True)

    return self.clean(adhoc_df)
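A hedged sketch of how the English loader is meant to be driven (the settings object, project id and dates below are placeholders; running it for real requires the redacted QUERY_SETTINGS templates and a working BigQuery CLIENT from the settings module):

  from types import SimpleNamespace

  from toxicity_ml_pipeline.data.dataframe_loader import ENLoader

  # Placeholder for the per-project settings module the pipeline normally imports.
  setting_file = SimpleNamespace(DATE_BEGIN="2022-01-01", DATE_END="2022-03-31", TASK_VERSION=3.5)

  loader = ENLoader(project="my-gcp-project", setting_file=setting_file)
  # Runs one BigQuery query per entry in QUERY_SETTINGS, tags each result with an
  # "origin" column, concatenates them, and adds "loading_date" / "date" columns.
  train_df = loader.load_data(test=True)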
trust_and_safety_models/toxicity/data/mb_generator.py (new file, 284 lines)
@@ -0,0 +1,284 @@
from importlib import import_module
import os

from toxicity_ml_pipeline.settings.default_settings_tox import (
  INNER_CV,
  LOCAL_DIR,
  MAX_SEQ_LENGTH,
  NUM_PREFETCH,
  NUM_WORKERS,
  OUTER_CV,
  TARGET_POS_PER_EPOCH,
)
from toxicity_ml_pipeline.utils.helpers import execute_command

import numpy as np
import pandas
from sklearn.model_selection import StratifiedKFold
import tensorflow as tf


try:
  from transformers import AutoTokenizer, DataCollatorWithPadding
except ModuleNotFoundError:
  print("...")
else:
  from datasets import Dataset


class BalancedMiniBatchLoader(object):
  def __init__(
    self,
    fold,
    mb_size,
    seed,
    perc_training_tox,
    scope="TOX",
    project=...,
    dual_head=None,
    n_outer_splits=None,
    n_inner_splits=None,
    sample_weights=None,
    huggingface=False,
  ):
    if 0 >= perc_training_tox or perc_training_tox > 0.5:
      raise ValueError("Perc_training_tox should be in ]0; 0.5]")

    self.perc_training_tox = perc_training_tox
    if not n_outer_splits:
      n_outer_splits = OUTER_CV
    if isinstance(n_outer_splits, int):
      self.n_outer_splits = n_outer_splits
      self.get_outer_fold = self._get_outer_cv_fold
      if fold < 0 or fold >= self.n_outer_splits or int(fold) != fold:
        raise ValueError(f"Number of fold should be an integer in [0 ; {self.n_outer_splits} [.")

    elif n_outer_splits == "time":
      self.get_outer_fold = self._get_time_fold
      if fold != "time":
        raise ValueError(
          "To avoid repeating the same run many times, the external fold"
          "should be time when test data is split according to dates."
        )
      try:
        setting_file = import_module(f"toxicity_ml_pipeline.settings.{scope.lower()}{project}_settings")
      except ModuleNotFoundError:
        raise ValueError(f"You need to define a setting file for your project {project}.")
      self.test_begin_date = setting_file.TEST_BEGIN_DATE
      self.test_end_date = setting_file.TEST_END_DATE

    else:
      raise ValueError(
        f"Argument n_outer_splits should either an integer or 'time'. Provided: {n_outer_splits}"
      )

    self.n_inner_splits = n_inner_splits if n_inner_splits is not None else INNER_CV

    self.seed = seed
    self.mb_size = mb_size
    self.fold = fold

    self.sample_weights = sample_weights
    self.dual_head = dual_head
    self.huggingface = huggingface
    if self.huggingface:
      self._load_tokenizer()

  def _load_tokenizer(self):
    print("Making a local copy of Bertweet-base model")
    local_model_dir = os.path.join(LOCAL_DIR, "models")
    cmd = f"mkdir {local_model_dir} ; gsutil -m cp -r gs://... {local_model_dir}"
    execute_command(cmd)

    self.tokenizer = AutoTokenizer.from_pretrained(
      os.path.join(local_model_dir, "bertweet-base"), normalization=True
    )

  def tokenize_function(self, el):
    return self.tokenizer(
      el["text"],
      max_length=MAX_SEQ_LENGTH,
      padding="max_length",
      truncation=True,
      add_special_tokens=True,
      return_token_type_ids=False,
      return_attention_mask=False,
    )

  def _get_stratified_kfold(self, n_splits):
    return StratifiedKFold(shuffle=True, n_splits=n_splits, random_state=self.seed)

  def _get_time_fold(self, df):
    test_begin_date = pandas.to_datetime(self.test_begin_date).date()
    test_end_date = pandas.to_datetime(self.test_end_date).date()
    print(f"Test is going from {test_begin_date} to {test_end_date}.")
    test_data = df.query("@test_begin_date <= date <= @test_end_date")

    query = "date < @test_begin_date"
    other_set = df.query(query)
    return other_set, test_data

  def _get_outer_cv_fold(self, df):
    labels = df.int_label
    stratifier = self._get_stratified_kfold(n_splits=self.n_outer_splits)

    k = 0
    for train_index, test_index in stratifier.split(np.zeros(len(labels)), labels):
      if k == self.fold:
        break
      k += 1

    train_data = df.iloc[train_index].copy()
    test_data = df.iloc[test_index].copy()

    return train_data, test_data

  def get_steps_per_epoch(self, nb_pos_examples):
    return int(max(TARGET_POS_PER_EPOCH, nb_pos_examples) / self.mb_size / self.perc_training_tox)

  def make_huggingface_tensorflow_ds(self, group, mb_size=None, shuffle=True):
    huggingface_ds = Dataset.from_pandas(group).map(self.tokenize_function, batched=True)
    data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer, return_tensors="tf")
    tensorflow_ds = huggingface_ds.to_tf_dataset(
      columns=["input_ids"],
      label_cols=["labels"],
      shuffle=shuffle,
      batch_size=self.mb_size if mb_size is None else mb_size,
      collate_fn=data_collator,
    )

    if shuffle:
      return tensorflow_ds.repeat()
    return tensorflow_ds

  def make_pure_tensorflow_ds(self, df, nb_samples):
    buffer_size = nb_samples * 2

    if self.sample_weights is not None:
      if self.sample_weights not in df.columns:
        raise ValueError
      ds = tf.data.Dataset.from_tensor_slices(
        (df.text.values, df.label.values, df[self.sample_weights].values)
      )
    elif self.dual_head:
      label_d = {f'{e}_output': df[f'{e}_label'].values for e in self.dual_head}
      label_d['content_output'] = tf.keras.utils.to_categorical(label_d['content_output'], num_classes=3)
      ds = tf.data.Dataset.from_tensor_slices((df.text.values, label_d))

    else:
      ds = tf.data.Dataset.from_tensor_slices((df.text.values, df.label.values))
    ds = ds.shuffle(buffer_size, seed=self.seed, reshuffle_each_iteration=True).repeat()
    return ds

  def get_balanced_dataset(self, training_data, size_limit=None, return_as_batch=True):
    training_data = training_data.sample(frac=1, random_state=self.seed)
    nb_samples = training_data.shape[0] if not size_limit else size_limit

    num_classes = training_data.int_label.nunique()
    toxic_class = training_data.int_label.max()
    if size_limit:
      training_data = training_data[: size_limit * num_classes]

    print(
      ".... {} examples, incl. {:.2f}% tox in train, {} classes".format(
        nb_samples,
        100 * training_data[training_data.int_label == toxic_class].shape[0] / nb_samples,
        num_classes,
      )
    )
    label_groups = training_data.groupby("int_label")
    if self.huggingface:
      label_datasets = {
        label: self.make_huggingface_tensorflow_ds(group) for label, group in label_groups
      }

    else:
      label_datasets = {
        label: self.make_pure_tensorflow_ds(group, nb_samples=nb_samples * 2)
        for label, group in label_groups
      }

    datasets = [label_datasets[0], label_datasets[1]]
    weights = [1 - self.perc_training_tox, self.perc_training_tox]
    if num_classes == 3:
      datasets.append(label_datasets[2])
      weights = [1 - self.perc_training_tox, self.perc_training_tox / 2, self.perc_training_tox / 2]
    elif num_classes != 2:
      raise ValueError("Currently it should not be possible to get other than 2 or 3 classes")
    resampled_ds = tf.data.experimental.sample_from_datasets(datasets, weights, seed=self.seed)

    if return_as_batch and not self.huggingface:
      return resampled_ds.batch(
        self.mb_size, drop_remainder=True, num_parallel_calls=NUM_WORKERS, deterministic=True
      ).prefetch(NUM_PREFETCH)

    return resampled_ds

  @staticmethod
  def _compute_int_labels(full_df):
    if full_df.label.dtype == int:
      full_df["int_label"] = full_df.label

    elif "int_label" not in full_df.columns:
      if full_df.label.max() > 1:
        raise ValueError("Binarizing labels that should not be.")
      full_df["int_label"] = np.where(full_df.label >= 0.5, 1, 0)

    return full_df

  def __call__(self, full_df, *args, **kwargs):
    full_df = self._compute_int_labels(full_df)

    train_data, test_data = self.get_outer_fold(df=full_df)

    stratifier = self._get_stratified_kfold(n_splits=self.n_inner_splits)
    for train_index, val_index in stratifier.split(
      np.zeros(train_data.shape[0]), train_data.int_label
    ):
      curr_train_data = train_data.iloc[train_index]

      mini_batches = self.get_balanced_dataset(curr_train_data)

      steps_per_epoch = self.get_steps_per_epoch(
        nb_pos_examples=curr_train_data[curr_train_data.int_label != 0].shape[0]
      )

      val_data = train_data.iloc[val_index].copy()

      yield mini_batches, steps_per_epoch, val_data, test_data

  def simple_cv_load(self, full_df):
    full_df = self._compute_int_labels(full_df)

    train_data, test_data = self.get_outer_fold(df=full_df)
    if test_data.shape[0] == 0:
      test_data = train_data.iloc[:500]

    mini_batches = self.get_balanced_dataset(train_data)
    steps_per_epoch = self.get_steps_per_epoch(
      nb_pos_examples=train_data[train_data.int_label != 0].shape[0]
    )

    return mini_batches, test_data, steps_per_epoch

  def no_cv_load(self, full_df):
    full_df = self._compute_int_labels(full_df)

    val_test = full_df[full_df.origin == "precision"].copy(deep=True)
    val_data, test_data = self.get_outer_fold(df=val_test)

    train_data = full_df.drop(full_df[full_df.origin == "precision"].index, axis=0)
    if test_data.shape[0] == 0:
      test_data = train_data.iloc[:500]

    mini_batches = self.get_balanced_dataset(train_data)
    if train_data.int_label.nunique() == 1:
      raise ValueError('Should be at least two labels')

    num_examples = train_data[train_data.int_label == 1].shape[0]
    if train_data.int_label.nunique() > 2:
      second_most_frequent_label = train_data.loc[train_data.int_label != 0, 'int_label'].mode().values[0]
      num_examples = train_data[train_data.int_label == second_most_frequent_label].shape[0] * 2
    steps_per_epoch = self.get_steps_per_epoch(nb_pos_examples=num_examples)

    return mini_batches, steps_per_epoch, val_data, test_data
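A hedged sketch of the cross-validation generator above: __call__ yields one rebalanced training stream per inner fold, together with the number of steps per epoch and the validation/test frames. Everything below is a placeholder wiring (the project id, fold choice and `model` are assumptions); `full_df` must carry "text", "label" and, for the time split, "date" columns as produced by the loaders.

  loader = BalancedMiniBatchLoader(
    fold="time",            # paired with n_outer_splits="time": the test set is a date range
    mb_size=32,
    seed=42,
    perc_training_tox=0.5,  # toxic share of each resampled mini-batch, must be in ]0; 0.5]
    n_outer_splits="time",  # requires TEST_BEGIN_DATE / TEST_END_DATE in the project settings module
    scope="TOX",
    project=1,              # placeholder project id
  )
  for mini_batches, steps_per_epoch, val_data, test_data in loader(full_df):
    # mini_batches is an infinite, class-rebalanced tf.data.Dataset of (text, label) batches.
    model.fit(mini_batches, steps_per_epoch=steps_per_epoch, epochs=4)
    break  # only the first inner fold shown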
trust_and_safety_models/toxicity/load_model.py (new file, 227 lines)
@@ -0,0 +1,227 @@
import os

from toxicity_ml_pipeline.settings.default_settings_tox import LOCAL_DIR, MAX_SEQ_LENGTH
try:
  from toxicity_ml_pipeline.optim.losses import MaskedBCE
except ImportError:
  print('No MaskedBCE loss')
from toxicity_ml_pipeline.utils.helpers import execute_command

import tensorflow as tf


try:
  from twitter.cuad.representation.models.text_encoder import TextEncoder
except ModuleNotFoundError:
  print("No TextEncoder package")

try:
  from transformers import TFAutoModelForSequenceClassification
except ModuleNotFoundError:
  print("No HuggingFace package")

LOCAL_MODEL_DIR = os.path.join(LOCAL_DIR, "models")


def reload_model_weights(weights_dir, language, **kwargs):
  optimizer = tf.keras.optimizers.Adam(0.01)
  model_type = (
    "twitter_bert_base_en_uncased_mlm"
    if language == "en"
    else "twitter_multilingual_bert_base_cased_mlm"
  )
  model = load(optimizer=optimizer, seed=42, model_type=model_type, **kwargs)
  model.load_weights(weights_dir)

  return model


def _locally_copy_models(model_type):
  if model_type == "twitter_multilingual_bert_base_cased_mlm":
    preprocessor = "bert_multi_cased_preprocess_3"
  elif model_type == "twitter_bert_base_en_uncased_mlm":
    preprocessor = "bert_en_uncased_preprocess_3"
  else:
    raise NotImplementedError

  copy_cmd = """mkdir {local_dir}
  gsutil cp -r ...
  gsutil cp -r ..."""
  execute_command(
    copy_cmd.format(model_type=model_type, preprocessor=preprocessor, local_dir=LOCAL_MODEL_DIR)
  )

  return preprocessor


def load_encoder(model_type, trainable):
  try:
    model = TextEncoder(
      max_seq_lengths=MAX_SEQ_LENGTH,
      model_type=model_type,
      cluster="gcp",
      trainable=trainable,
      enable_dynamic_shapes=True,
    )
  except (OSError, tf.errors.AbortedError) as e:
    print(e)
    preprocessor = _locally_copy_models(model_type)

    model = TextEncoder(
      max_seq_lengths=MAX_SEQ_LENGTH,
      local_model_path=f"models/{model_type}",
      local_preprocessor_path=f"models/{preprocessor}",
      cluster="gcp",
      trainable=trainable,
      enable_dynamic_shapes=True,
    )

  return model


def get_loss(loss_name, from_logits, **kwargs):
  loss_name = loss_name.lower()
  if loss_name == "bce":
    print("Binary CE loss")
    return tf.keras.losses.BinaryCrossentropy(from_logits=from_logits)

  if loss_name == "cce":
    print("Categorical cross-entropy loss")
    return tf.keras.losses.CategoricalCrossentropy(from_logits=from_logits)

  if loss_name == "scce":
    print("Sparse categorical cross-entropy loss")
    return tf.keras.losses.SparseCategoricalCrossentropy(from_logits=from_logits)

  if loss_name == "focal_bce":
    gamma = kwargs.get("gamma", 2)
    print("Focal binary CE loss", gamma)
    return tf.keras.losses.BinaryFocalCrossentropy(gamma=gamma, from_logits=from_logits)

  if loss_name == 'masked_bce':
    multitask = kwargs.get("multitask", False)
    if from_logits or multitask:
      raise NotImplementedError
    print(f'Masked Binary Cross Entropy')
    return MaskedBCE()

  if loss_name == "inv_kl_loss":
    raise NotImplementedError

  raise ValueError(
    f"This loss name is not valid: {loss_name}. Accepted loss names: BCE, masked BCE, CCE, sCCE, "
    f"Focal_BCE, inv_KL_loss"
  )


def _add_additional_embedding_layer(doc_embedding, glorot, seed):
  doc_embedding = tf.keras.layers.Dense(768, activation="tanh", kernel_initializer=glorot)(doc_embedding)
  doc_embedding = tf.keras.layers.Dropout(rate=0.1, seed=seed)(doc_embedding)
  return doc_embedding


def _get_bias(**kwargs):
  smart_bias_value = kwargs.get('smart_bias_value', 0)
  print('Smart bias init to ', smart_bias_value)
  output_bias = tf.keras.initializers.Constant(smart_bias_value)
  return output_bias


def load_inhouse_bert(model_type, trainable, seed, **kwargs):
  inputs = tf.keras.layers.Input(shape=(), dtype=tf.string)
  encoder = load_encoder(model_type=model_type, trainable=trainable)
  doc_embedding = encoder([inputs])["pooled_output"]
  doc_embedding = tf.keras.layers.Dropout(rate=0.1, seed=seed)(doc_embedding)

  glorot = tf.keras.initializers.glorot_uniform(seed=seed)
  if kwargs.get("additional_layer", False):
    doc_embedding = _add_additional_embedding_layer(doc_embedding, glorot, seed)

  if kwargs.get('content_num_classes', None):
    probs = get_last_layer(glorot=glorot, last_layer_name='target_output', **kwargs)(doc_embedding)
    second_probs = get_last_layer(num_classes=kwargs['content_num_classes'],
                                  last_layer_name='content_output',
                                  glorot=glorot)(doc_embedding)
    probs = [probs, second_probs]
  else:
    probs = get_last_layer(glorot=glorot, **kwargs)(doc_embedding)
  model = tf.keras.models.Model(inputs=inputs, outputs=probs)

  return model, False


def get_last_layer(**kwargs):
  output_bias = _get_bias(**kwargs)
  if 'glorot' in kwargs:
    glorot = kwargs['glorot']
  else:
    glorot = tf.keras.initializers.glorot_uniform(seed=kwargs['seed'])
  layer_name = kwargs.get('last_layer_name', 'dense_1')

  if kwargs.get('num_classes', 1) > 1:
    last_layer = tf.keras.layers.Dense(
      kwargs["num_classes"], activation="softmax", kernel_initializer=glorot,
      bias_initializer=output_bias, name=layer_name
    )

  elif kwargs.get('num_raters', 1) > 1:
    if kwargs.get('multitask', False):
      raise NotImplementedError
    last_layer = tf.keras.layers.Dense(
      kwargs['num_raters'], activation="sigmoid", kernel_initializer=glorot,
      bias_initializer=output_bias, name='probs')

  else:
    last_layer = tf.keras.layers.Dense(
      1, activation="sigmoid", kernel_initializer=glorot,
      bias_initializer=output_bias, name=layer_name
    )

  return last_layer


def load_bertweet(**kwargs):
  bert = TFAutoModelForSequenceClassification.from_pretrained(
    os.path.join(LOCAL_MODEL_DIR, "bertweet-base"),
    num_labels=1,
    classifier_dropout=0.1,
    hidden_size=768,
  )
  if "num_classes" in kwargs and kwargs["num_classes"] > 2:
    raise NotImplementedError

  return bert, True


def load(
  optimizer,
  seed,
  model_type="twitter_multilingual_bert_base_cased_mlm",
  loss_name="BCE",
  trainable=True,
  **kwargs,
):
  if model_type == "bertweet-base":
    model, from_logits = load_bertweet()
  else:
    model, from_logits = load_inhouse_bert(model_type, trainable, seed, **kwargs)

  pr_auc = tf.keras.metrics.AUC(curve="PR", name="pr_auc", from_logits=from_logits)
  roc_auc = tf.keras.metrics.AUC(curve="ROC", name="roc_auc", from_logits=from_logits)

  loss = get_loss(loss_name, from_logits, **kwargs)
  if kwargs.get('content_num_classes', None):
    second_loss = get_loss(loss_name=kwargs['content_loss_name'], from_logits=from_logits)
    loss_weights = {'content_output': kwargs['content_loss_weight'], 'target_output': 1}
    model.compile(
      optimizer=optimizer,
      loss={'content_output': second_loss, 'target_output': loss},
      loss_weights=loss_weights,
      metrics=[pr_auc, roc_auc],
    )

  else:
    model.compile(
      optimizer=optimizer,
      loss=loss,
      metrics=[pr_auc, roc_auc],
    )
  print(model.summary(), "logits: ", from_logits)

  return model
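A short sketch of building and reloading a model with the helpers above. This is a sketch only: the in-house TextEncoder package must be importable for this model type, and the checkpoint path is a placeholder.

  import tensorflow as tf

  from toxicity_ml_pipeline.load_model import load, reload_model_weights

  optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
  model = load(
    optimizer,
    seed=42,
    model_type="twitter_bert_base_en_uncased_mlm",
    loss_name="BCE",
    trainable=True,
    smart_bias_value=0.0,  # forwarded to _get_bias for the last layer's bias initializer
  )

  # Later, to rescore with saved weights (placeholder checkpoint directory):
  # model = reload_model_weights("path/to/checkpoint_dir", language="en")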
trust_and_safety_models/toxicity/optim/__init__.py (new file, 0 lines)
trust_and_safety_models/toxicity/optim/callbacks.py (new file, 220 lines)
@@ -0,0 +1,220 @@
from collections import defaultdict
import os

from toxicity_ml_pipeline.settings.default_settings_tox import REMOTE_LOGDIR
from toxicity_ml_pipeline.settings.default_settings_abs import LABEL_NAMES
from toxicity_ml_pipeline.utils.absv_utils import parse_labeled_data
from toxicity_ml_pipeline.utils.helpers import compute_precision_fixed_recall, execute_command

from sklearn.metrics import average_precision_score, roc_auc_score
import tensorflow as tf
import wandb


class NothingCallback(tf.keras.callbacks.Callback):
  def on_epoch_begin(self, epoch, logs=None):
    print("ici, ", epoch)

  def on_epoch_end(self, epoch, logs=None):
    print("fin ", epoch)

  def on_train_batch_end(self, batch, logs=None):
    print("fin de batch ", batch)


class ControlledStoppingCheckpointCallback(tf.keras.callbacks.ModelCheckpoint):
  def __init__(self, stopping_epoch, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.stopping_epoch = stopping_epoch

  def on_epoch_end(self, epoch, logs=None):
    super().on_epoch_end(epoch, logs)
    if epoch == self.stopping_epoch:
      self.model.stop_training = True


class SyncingTensorBoard(tf.keras.callbacks.TensorBoard):
  def __init__(self, remote_logdir=None, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.remote_logdir = remote_logdir if remote_logdir is not None else REMOTE_LOGDIR

  def on_epoch_end(self, epoch, logs=None):
    super().on_epoch_end(epoch, logs=logs)
    self.synchronize()

  def synchronize(self):
    base_dir = os.path.dirname(self.log_dir)
    cmd = f"gsutil -m rsync -r {base_dir} {self.remote_logdir}"
    execute_command(cmd)


class GradientLoggingTensorBoard(SyncingTensorBoard):
  def __init__(self, loader, val_data, freq, *args, **kwargs):
    super().__init__(*args, **kwargs)
    val_dataset = loader.get_balanced_dataset(
      training_data=val_data, size_limit=50, return_as_batch=False
    )
    data_args = list(val_dataset.batch(32).take(1))[0]
    self.x_batch, self.y_batch = data_args[0], data_args[1]
    self.freq = freq
    self.counter = 0

  def _log_gradients(self):
    writer = self._train_writer

    with writer.as_default():
      with tf.GradientTape() as tape:
        y_pred = self.model(self.x_batch)
        loss = self.model.compiled_loss(y_true=self.y_batch, y_pred=y_pred)
      gradient_norm = tf.linalg.global_norm(tape.gradient(loss, self.model.trainable_weights))

      tf.summary.scalar("gradient_norm", data=gradient_norm, step=self.counter)
    writer.flush()

  def on_train_batch_end(self, batch, logs=None):
    super().on_batch_end(batch, logs=logs)
    self.counter += 1
    if batch % self.freq == 0:
      self._log_gradients()


class AdditionalResultLogger(tf.keras.callbacks.Callback):
  def __init__(
    self,
    data,
    set_,
    fixed_recall=0.85,
    from_logits=False,
    dataset_transform_func=None,
    batch_size=64,
    dual_head=None,
    *args,
    **kwargs,
  ):
    super().__init__(*args, **kwargs)
    self.set_ = set_
    if data is None:
      return None

    self.single_head = True
    try:
      self.labels = data.int_label.values
    except AttributeError:
      self.labels = data.to_dataframe()[LABEL_NAMES].values.astype('int')
      self.data = data.to_tf_dataset().map(parse_labeled_data).batch(batch_size)
      self.label_names = LABEL_NAMES
    else:
      self.label_names = ['']
      if dual_head:
        self.label_names = [f'{e}_label' for e in dual_head]
        self.labels = {f'{e}_output': data[f'{e}_label'].values for e in dual_head}
        self.single_head = False
      if dataset_transform_func is None:
        self.data = data.text.values
      else:
        self.data = dataset_transform_func(data, mb_size=batch_size, shuffle=False)

    finally:
      if len(self.label_names) == 1:
        self.metric_kw = {}
      else:
        self.metric_kw = {'average': None}

    self.counter = 0
    self.best_metrics = defaultdict(float)
    self.from_logits = from_logits
    print(f"Loaded callback for {set_}, from_logits: {from_logits}, labels {self.label_names}")

    if 1 < fixed_recall <= 100:
      fixed_recall = fixed_recall / 100
    elif not (0 < fixed_recall <= 100):
      raise ValueError("Threshold should be between 0 and 1, or 0 and 100")
    self.fixed_recall = fixed_recall
    self.batch_size = batch_size

  def compute_precision_fixed_recall(self, labels, preds):
    result, _ = compute_precision_fixed_recall(labels=labels, preds=preds,
                                               fixed_recall=self.fixed_recall)

    return result

  def on_epoch_end(self, epoch, logs=None):
    self.additional_evaluations(step=epoch, eval_time="epoch")

  def on_train_batch_end(self, batch, logs=None):
    self.counter += 1
    if self.counter % 2000 == 0:
      self.additional_evaluations(step=self.counter, eval_time="batch")

  def _binary_evaluations(self, preds, label_name=None, class_index=None):
    mask = None
    curr_labels = self.labels
    if label_name is not None:
      curr_labels = self.labels[label_name]
      if class_index is not None:
        curr_labels = (curr_labels == class_index).astype(int)

    if -1 in curr_labels:
      mask = curr_labels != -1
      curr_labels = curr_labels[mask]
      preds = preds[mask]

    return {
      f"precision_recall{self.fixed_recall}": self.compute_precision_fixed_recall(
        labels=curr_labels, preds=preds
      ),
      "pr_auc": average_precision_score(y_true=curr_labels, y_score=preds),
      "roc_auc": roc_auc_score(y_true=curr_labels, y_score=preds),
    }

  def _multiclass_evaluations(self, preds):
    pr_auc_l = average_precision_score(y_true=self.labels, y_score=preds, **self.metric_kw)
    roc_auc_l = roc_auc_score(y_true=self.labels, y_score=preds, **self.metric_kw)
    metrics = {}
    for i, label in enumerate(self.label_names):
      metrics[f'pr_auc_{label}'] = pr_auc_l[i]
      metrics[f'roc_auc_{label}'] = roc_auc_l[i]

    return metrics

  def additional_evaluations(self, step, eval_time):
    print("Evaluating ", self.set_, eval_time, step)

    preds = self.model.predict(x=self.data, batch_size=self.batch_size)
    if self.from_logits:
      preds = tf.keras.activations.sigmoid(preds.logits).numpy()

    if self.single_head:
      if len(self.label_names) == 1:
        metrics = self._binary_evaluations(preds)
      else:
        metrics = self._multiclass_evaluations(preds)
    else:
      if preds[0].shape[1] == 1:
        binary_preds = preds[0]
        multic_preds = preds[1]
      else:
        binary_preds = preds[1]
        multic_preds = preds[0]

      binary_metrics = self._binary_evaluations(binary_preds, label_name='target_output')
      metrics = {f'{k}_target': v for k, v in binary_metrics.items()}
      num_classes = multic_preds.shape[1]
      for class_ in range(num_classes):
        binary_metrics = self._binary_evaluations(multic_preds[:, class_], label_name='content_output', class_index=class_)
        metrics.update({f'{k}_content_{class_}': v for k, v in binary_metrics.items()})

    for k, v in metrics.items():
      self.best_metrics[f"max_{k}"] = max(v, self.best_metrics[f"max_{k}"])

    self.log_metrics(metrics, step=step, eval_time=eval_time)

  def log_metrics(self, metrics_d, step, eval_time):
    commit = False if self.set_ == "validation" else True
    to_report = {self.set_: {**metrics_d, **self.best_metrics}}

    if eval_time == "epoch":
      to_report["epoch"] = step

    wandb.log(to_report, commit=commit)
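A hedged sketch of attaching the extra evaluation callback to a Keras training loop. `val_df`, `model`, `train_ds` and `steps_per_epoch` are placeholders; `val_df` needs "text" and "int_label" columns, and wandb.init(...) is assumed to have been called by the training driver since the metrics are logged to Weights & Biases.

  from toxicity_ml_pipeline.optim.callbacks import AdditionalResultLogger

  val_logger = AdditionalResultLogger(data=val_df, set_="validation", fixed_recall=0.85)
  model.fit(train_ds, steps_per_epoch=steps_per_epoch, epochs=4, callbacks=[val_logger])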
trust_and_safety_models/toxicity/optim/losses.py (new file, 56 lines)
@@ -0,0 +1,56 @@
import tensorflow as tf
from keras.utils import tf_utils
from keras.utils import losses_utils
from keras import backend


def inv_kl_divergence(y_true, y_pred):
  y_pred = tf.convert_to_tensor(y_pred)
  y_true = tf.cast(y_true, y_pred.dtype)
  y_true = backend.clip(y_true, backend.epsilon(), 1)
  y_pred = backend.clip(y_pred, backend.epsilon(), 1)
  return tf.reduce_sum(y_pred * tf.math.log(y_pred / y_true), axis=-1)


def masked_bce(y_true, y_pred):
  y_true = tf.cast(y_true, dtype=tf.float32)
  mask = y_true != -1

  return tf.keras.metrics.binary_crossentropy(tf.boolean_mask(y_true, mask),
                                              tf.boolean_mask(y_pred, mask))


class LossFunctionWrapper(tf.keras.losses.Loss):
  def __init__(self,
               fn,
               reduction=losses_utils.ReductionV2.AUTO,
               name=None,
               **kwargs):
    super().__init__(reduction=reduction, name=name)
    self.fn = fn
    self._fn_kwargs = kwargs

  def call(self, y_true, y_pred):
    if tf.is_tensor(y_pred) and tf.is_tensor(y_true):
      y_pred, y_true = losses_utils.squeeze_or_expand_dimensions(y_pred, y_true)

    ag_fn = tf.__internal__.autograph.tf_convert(self.fn, tf.__internal__.autograph.control_status_ctx())
    return ag_fn(y_true, y_pred, **self._fn_kwargs)

  def get_config(self):
    config = {}
    for k, v in self._fn_kwargs.items():
      config[k] = backend.eval(v) if tf_utils.is_tensor_or_variable(v) else v
    base_config = super().get_config()
    return dict(list(base_config.items()) + list(config.items()))


class InvKLD(LossFunctionWrapper):
  def __init__(self,
               reduction=losses_utils.ReductionV2.AUTO,
               name='inv_kl_divergence'):
    super().__init__(inv_kl_divergence, name=name, reduction=reduction)


class MaskedBCE(LossFunctionWrapper):
  def __init__(self,
               reduction=losses_utils.ReductionV2.AUTO,
               name='masked_bce'):
    super().__init__(masked_bce, name=name, reduction=reduction)
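A small sanity check of the masking behaviour, using toy tensors rather than pipeline data: entries labelled -1 are dropped before the binary cross-entropy is computed.

  import tensorflow as tf

  y_true = tf.constant([1.0, -1.0, 0.0])    # -1 marks "no label" and is masked out
  y_pred = tf.constant([0.9, 0.1, 0.2])
  print(float(masked_bce(y_true, y_pred)))   # BCE over the first and third entries only
  print(float(MaskedBCE()(y_true, y_pred)))  # same masking through the Keras Loss wrapper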
trust_and_safety_models/toxicity/optim/schedulers.py (new file, 44 lines)
@@ -0,0 +1,44 @@
from typing import Callable

import tensorflow as tf


class WarmUp(tf.keras.optimizers.schedules.LearningRateSchedule):
  def __init__(
    self,
    initial_learning_rate: float,
    decay_schedule_fn: Callable,
    warmup_steps: int,
    power: float = 1.0,
    name: str = "",
  ):
    super().__init__()
    self.initial_learning_rate = initial_learning_rate
    self.warmup_steps = warmup_steps
    self.power = power
    self.decay_schedule_fn = decay_schedule_fn
    self.name = name

  def __call__(self, step):
    with tf.name_scope(self.name or "WarmUp") as name:
      global_step_float = tf.cast(step, tf.float32)
      warmup_steps_float = tf.cast(self.warmup_steps, tf.float32)
      warmup_percent_done = global_step_float / warmup_steps_float
      warmup_learning_rate = self.initial_learning_rate * tf.math.pow(
        warmup_percent_done, self.power
      )
      return tf.cond(
        global_step_float < warmup_steps_float,
        lambda: warmup_learning_rate,
        lambda: self.decay_schedule_fn(step - self.warmup_steps),
        name=name,
      )

  def get_config(self):
    return {
      "initial_learning_rate": self.initial_learning_rate,
      "decay_schedule_fn": self.decay_schedule_fn,
      "warmup_steps": self.warmup_steps,
      "power": self.power,
      "name": self.name,
    }
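A brief sketch of wiring this warm-up schedule to a linear decay, mirroring what train.py's get_lr_schedule does further down (the step counts and peak learning rate here are hypothetical):

  import tensorflow as tf

  total_steps, warmup_steps, peak_lr = 10_000, 1_000, 1e-3
  decay = tf.keras.optimizers.schedules.PolynomialDecay(
    peak_lr, total_steps - warmup_steps, end_learning_rate=0.0, power=1.0
  )
  schedule = WarmUp(initial_learning_rate=peak_lr, decay_schedule_fn=decay, warmup_steps=warmup_steps)
  optimizer = tf.keras.optimizers.Adam(learning_rate=schedule)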
trust_and_safety_models/toxicity/rescoring.py (new file, 54 lines)
@@ -0,0 +1,54 @@
from toxicity_ml_pipeline.load_model import reload_model_weights
from toxicity_ml_pipeline.utils.helpers import load_inference_func, upload_model

import numpy as np
import tensorflow as tf


def score(language, df, gcs_model_path, batch_size=64, text_col="text", kw="", **kwargs):
  if language != "en":
    raise NotImplementedError(
      "Data preprocessing not implemented here, needs to be added for i18n models"
    )
  model_folder = upload_model(full_gcs_model_path=gcs_model_path)
  try:
    inference_func = load_inference_func(model_folder)
  except OSError:
    model = reload_model_weights(model_folder, language, **kwargs)
    preds = model.predict(x=df[text_col], batch_size=batch_size)
    if type(preds) != list:
      if len(preds.shape) > 1 and preds.shape[1] > 1:
        if 'num_classes' in kwargs and kwargs['num_classes'] > 1:
          raise NotImplementedError
        preds = np.mean(preds, 1)

      df[f"prediction_{kw}"] = preds
    else:
      if len(preds) > 2:
        raise NotImplementedError
      for preds_arr in preds:
        if preds_arr.shape[1] == 1:
          df[f"prediction_{kw}_target"] = preds_arr
        else:
          for ind in range(preds_arr.shape[1]):
            df[f"prediction_{kw}_content_{ind}"] = preds_arr[:, ind]

    return df
  else:
    return _get_score(inference_func, df, kw=kw, batch_size=batch_size, text_col=text_col)


def _get_score(inference_func, df, text_col="text", kw="", batch_size=64):
  score_col = f"prediction_{kw}"
  beginning = 0
  end = df.shape[0]
  predictions = np.zeros(shape=end, dtype=float)

  while beginning < end:
    mb = df[text_col].values[beginning : beginning + batch_size]
    res = inference_func(input_1=tf.constant(mb))
    predictions[beginning : beginning + batch_size] = list(res.values())[0].numpy()[:, 0]
    beginning += batch_size

  df[score_col] = predictions
  return df
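A hedged illustration of the batching contract _get_score expects: an inference function that takes an `input_1` string tensor and returns a dict whose first value is an (N, 1) score tensor. The dummy function and DataFrame below are purely illustrative stand-ins for the SavedModel signature returned by load_inference_func.

  import pandas as pd
  import tensorflow as tf

  def dummy_inference_func(input_1):
    n = input_1.shape[0]
    return {"probs": tf.zeros((n, 1))}  # stand-in for the real model's output dict

  df = pd.DataFrame({"text": ["tweet one", "tweet two", "tweet three"]})
  scored = _get_score(dummy_inference_func, df, kw="demo", batch_size=2)
  print(scored["prediction_demo"].tolist())  # [0.0, 0.0, 0.0]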
trust_and_safety_models/toxicity/settings/default_settings_tox.py (new file, 38 lines)
@@ -0,0 +1,38 @@
import os


TEAM_PROJECT = "twttr-toxicity-prod"
try:
  from google.cloud import bigquery
except (ModuleNotFoundError, ImportError):
  print("No Google packages")
  CLIENT = None
else:
  from google.auth.exceptions import DefaultCredentialsError

  try:
    CLIENT = bigquery.Client(project=TEAM_PROJECT)
  except DefaultCredentialsError as e:
    CLIENT = None
    print("Issue at logging time", e)

TRAINING_DATA_LOCATION = f"..."
GCS_ADDRESS = "..."
LOCAL_DIR = os.getcwd()
REMOTE_LOGDIR = "{GCS_ADDRESS}/logs"
MODEL_DIR = "{GCS_ADDRESS}/models"

EXISTING_TASK_VERSIONS = {3, 3.5}

RANDOM_SEED = ...
TRAIN_EPOCHS = 4
MINI_BATCH_SIZE = 32
TARGET_POS_PER_EPOCH = 5000
PERC_TRAINING_TOX = ...
MAX_SEQ_LENGTH = 100

WARM_UP_PERC = 0.1
OUTER_CV = 5
INNER_CV = 5
NUM_PREFETCH = 5
NUM_WORKERS = 10
trust_and_safety_models/toxicity/train.py (new file, 401 lines)
@@ -0,0 +1,401 @@
|
||||
from datetime import datetime
|
||||
from importlib import import_module
|
||||
import os
|
||||
|
||||
from toxicity_ml_pipeline.data.data_preprocessing import (
|
||||
DefaultENNoPreprocessor,
|
||||
DefaultENPreprocessor,
|
||||
)
|
||||
from toxicity_ml_pipeline.data.dataframe_loader import ENLoader, ENLoaderWithSampling
|
||||
from toxicity_ml_pipeline.data.mb_generator import BalancedMiniBatchLoader
|
||||
from toxicity_ml_pipeline.load_model import load, get_last_layer
|
||||
from toxicity_ml_pipeline.optim.callbacks import (
|
||||
AdditionalResultLogger,
|
||||
ControlledStoppingCheckpointCallback,
|
||||
GradientLoggingTensorBoard,
|
||||
SyncingTensorBoard,
|
||||
)
|
||||
from toxicity_ml_pipeline.optim.schedulers import WarmUp
|
||||
from toxicity_ml_pipeline.settings.default_settings_abs import GCS_ADDRESS as ABS_GCS
|
||||
from toxicity_ml_pipeline.settings.default_settings_tox import (
|
||||
GCS_ADDRESS as TOX_GCS,
|
||||
MODEL_DIR,
|
||||
RANDOM_SEED,
|
||||
REMOTE_LOGDIR,
|
||||
WARM_UP_PERC,
|
||||
)
|
||||
from toxicity_ml_pipeline.utils.helpers import check_gpu, set_seeds, upload_model
|
||||
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
|
||||
|
||||
try:
|
||||
from tensorflow_addons.optimizers import AdamW
|
||||
except ModuleNotFoundError:
|
||||
print("No TFA")
|
||||
|
||||
|
||||
class Trainer(object):
|
||||
OPTIMIZERS = ["Adam", "AdamW"]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
optimizer_name,
|
||||
weight_decay,
|
||||
learning_rate,
|
||||
mb_size,
|
||||
train_epochs,
|
||||
content_loss_weight=1,
|
||||
language="en",
|
||||
scope='TOX',
|
||||
project=...,
|
||||
experiment_id="default",
|
||||
gradient_clipping=None,
|
||||
fold="time",
|
||||
seed=RANDOM_SEED,
|
||||
log_gradients=False,
|
||||
kw="",
|
||||
stopping_epoch=None,
|
||||
test=False,
|
||||
):
|
||||
self.seed = seed
|
||||
self.weight_decay = weight_decay
|
||||
self.learning_rate = learning_rate
|
||||
self.mb_size = mb_size
|
||||
self.train_epochs = train_epochs
|
||||
self.gradient_clipping = gradient_clipping
|
||||
|
||||
if optimizer_name not in self.OPTIMIZERS:
|
||||
raise ValueError(
|
||||
f"Optimizer {optimizer_name} not implemented. Accepted values {self.OPTIMIZERS}."
|
||||
)
|
||||
self.optimizer_name = optimizer_name
|
||||
self.log_gradients = log_gradients
|
||||
self.test = test
|
||||
self.fold = fold
|
||||
self.stopping_epoch = stopping_epoch
|
||||
self.language = language
|
||||
if scope == 'TOX':
|
||||
GCS_ADDRESS = TOX_GCS.format(project=project)
|
||||
elif scope == 'ABS':
|
||||
GCS_ADDRESS = ABS_GCS
|
||||
else:
|
||||
raise ValueError
|
||||
GCS_ADDRESS = GCS_ADDRESS.format(project=project)
|
||||
try:
|
||||
self.setting_file = import_module(f"toxicity_ml_pipeline.settings.{scope.lower()}{project}_settings")
|
||||
except ModuleNotFoundError:
|
||||
raise ValueError(f"You need to define a setting file for your project {project}.")
|
||||
experiment_settings = self.setting_file.experiment_settings
|
||||
|
||||
self.project = project
|
||||
self.remote_logdir = REMOTE_LOGDIR.format(GCS_ADDRESS=GCS_ADDRESS, project=project)
|
||||
self.model_dir = MODEL_DIR.format(GCS_ADDRESS=GCS_ADDRESS, project=project)
|
||||
|
||||
if experiment_id not in experiment_settings:
|
||||
raise ValueError("This is not an experiment id as defined in the settings file.")
|
||||
|
||||
for var, default_value in experiment_settings["default"].items():
|
||||
override_val = experiment_settings[experiment_id].get(var, default_value)
|
||||
print("Setting ", var, override_val)
|
||||
self.__setattr__(var, override_val)
|
||||
|
||||
self.content_loss_weight = content_loss_weight if self.dual_head else None
|
||||
|
||||
self.mb_loader = BalancedMiniBatchLoader(
|
||||
fold=self.fold,
|
||||
seed=self.seed,
|
||||
perc_training_tox=self.perc_training_tox,
|
||||
mb_size=self.mb_size,
|
||||
n_outer_splits="time",
|
||||
scope=scope,
|
||||
project=project,
|
||||
dual_head=self.dual_head,
|
||||
sample_weights=self.sample_weights,
|
||||
huggingface=("bertweet" in self.model_type),
|
||||
)
|
||||
self._init_dirnames(kw=kw, experiment_id=experiment_id)
|
||||
print("------- Checking there is a GPU")
|
||||
check_gpu()
|
||||
|
||||
  def _init_dirnames(self, kw, experiment_id):
    kw = "test" if self.test else kw
    hyper_param_kw = ""
    if self.optimizer_name == "AdamW":
      hyper_param_kw += f"{self.weight_decay}_"
    if self.gradient_clipping:
      hyper_param_kw += f"{self.gradient_clipping}_"
    if self.content_loss_weight:
      hyper_param_kw += f"{self.content_loss_weight}_"
    experiment_name = (
      f"{self.language}{str(datetime.now()).replace(' ', '')[:-7]}{kw}_{experiment_id}{self.fold}_"
      f"{self.optimizer_name}_"
      f"{self.learning_rate}_"
      f"{hyper_param_kw}"
      f"{self.mb_size}_"
      f"{self.perc_training_tox}_"
      f"{self.train_epochs}_seed{self.seed}"
    )
    print("------- Experiment name: ", experiment_name)
    self.logdir = (
      f"..."
      if self.test
      else f"..."
    )
    self.checkpoint_path = f"{self.model_dir}/{experiment_name}"

  @staticmethod
  def _additional_writers(logdir, metric_name):
    return tf.summary.create_file_writer(os.path.join(logdir, metric_name))

  def get_callbacks(self, fold, val_data, test_data):
    fold_logdir = self.logdir + f"_fold{fold}"
    fold_checkpoint_path = self.checkpoint_path + f"_fold{fold}/{{epoch:02d}}"

    tb_args = {
      "log_dir": fold_logdir,
      "histogram_freq": 0,
      "update_freq": 500,
      "embeddings_freq": 0,
      "remote_logdir": f"{self.remote_logdir}_{self.language}"
      if not self.test
      else f"{self.remote_logdir}_test",
    }
    tensorboard_callback = (
      GradientLoggingTensorBoard(loader=self.mb_loader, val_data=val_data, freq=10, **tb_args)
      if self.log_gradients
      else SyncingTensorBoard(**tb_args)
    )

    callbacks = [tensorboard_callback]
    if "bertweet" in self.model_type:
      from_logits = True
      dataset_transform_func = self.mb_loader.make_huggingface_tensorflow_ds
    else:
      from_logits = False
      dataset_transform_func = None

    fixed_recall = 0.85 if not self.dual_head else 0.5
    val_callback = AdditionalResultLogger(
      data=val_data,
      set_="validation",
      from_logits=from_logits,
      dataset_transform_func=dataset_transform_func,
      dual_head=self.dual_head,
      fixed_recall=fixed_recall,
    )
    if val_callback is not None:
      callbacks.append(val_callback)

    test_callback = AdditionalResultLogger(
      data=test_data,
      set_="test",
      from_logits=from_logits,
      dataset_transform_func=dataset_transform_func,
      dual_head=self.dual_head,
      fixed_recall=fixed_recall,
    )
    callbacks.append(test_callback)

    checkpoint_args = {
      "filepath": fold_checkpoint_path,
      "verbose": 0,
      "monitor": "val_pr_auc",
      "save_weights_only": True,
      "mode": "max",
      "save_freq": "epoch",
    }
    if self.stopping_epoch:
      checkpoint_callback = ControlledStoppingCheckpointCallback(
        **checkpoint_args,
        stopping_epoch=self.stopping_epoch,
        save_best_only=False,
      )
      callbacks.append(checkpoint_callback)

    return callbacks

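The checkpoint_args above correspond one-to-one to the arguments of tf.keras.callbacks.ModelCheckpoint; ControlledStoppingCheckpointCallback is a pipeline-specific callback layered on top. For readers who want the plain Keras equivalent of the checkpointing behaviour, a minimal sketch with a hypothetical file path:

import tensorflow as tf

checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
  filepath="/tmp/tox_example/{epoch:02d}",  # hypothetical; the trainer builds this from model_dir
  monitor="val_pr_auc",                     # assumed to be logged during training
  mode="max",                               # keep the epoch with the highest PR-AUC
  save_weights_only=True,
  save_freq="epoch",
  verbose=0,
)
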
  def get_lr_schedule(self, steps_per_epoch):
    total_num_steps = steps_per_epoch * self.train_epochs

    warm_up_perc = WARM_UP_PERC if self.learning_rate >= 1e-3 else 0
    warm_up_steps = int(total_num_steps * warm_up_perc)
    if self.linear_lr_decay:
      learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
        self.learning_rate,
        total_num_steps - warm_up_steps,
        end_learning_rate=0.0,
        power=1.0,
        cycle=False,
      )
    else:
      print('Constant learning rate')
      learning_rate_fn = self.learning_rate

    if warm_up_perc > 0:
      print(f".... using warm-up for {warm_up_steps} steps")
      warm_up_schedule = WarmUp(
        initial_learning_rate=self.learning_rate,
        decay_schedule_fn=learning_rate_fn,
        warmup_steps=warm_up_steps,
      )
      return warm_up_schedule
    return learning_rate_fn

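With linear decay enabled, the schedule above is a PolynomialDecay with power=1.0, i.e. a straight line from the initial learning rate down to zero over the post-warm-up steps. A self-contained sketch with made-up numbers showing the values it produces (the pipeline's WarmUp wrapper is omitted here):

import tensorflow as tf

initial_lr, decay_steps = 1e-5, 1000  # made-up values for illustration
schedule = tf.keras.optimizers.schedules.PolynomialDecay(
  initial_lr, decay_steps, end_learning_rate=0.0, power=1.0, cycle=False
)
for step in (0, 250, 500, 1000):
  print(step, float(schedule(step)))  # 1e-05, 7.5e-06, 5e-06, 0.0
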
  def get_optimizer(self, schedule):
    optim_args = {
      "learning_rate": schedule,
      "beta_1": 0.9,
      "beta_2": 0.999,
      "epsilon": 1e-6,
      "amsgrad": False,
    }
    if self.gradient_clipping:
      optim_args["global_clipnorm"] = self.gradient_clipping

    print(f".... {self.optimizer_name} w global clipnorm {self.gradient_clipping}")
    if self.optimizer_name == "Adam":
      return tf.keras.optimizers.Adam(**optim_args)

    if self.optimizer_name == "AdamW":
      optim_args["weight_decay"] = self.weight_decay
      return AdamW(**optim_args)
    raise NotImplementedError

  def get_training_actors(self, steps_per_epoch, val_data, test_data, fold):
    callbacks = self.get_callbacks(fold=fold, val_data=val_data, test_data=test_data)
    schedule = self.get_lr_schedule(steps_per_epoch=steps_per_epoch)

    optimizer = self.get_optimizer(schedule)

    return optimizer, callbacks

  def load_data(self):
    if self.project in (435, 211):
      if self.dataset_type is None:
        data_loader = ENLoader(project=self.project, setting_file=self.setting_file)
        dataset_type_args = {}
      else:
        data_loader = ENLoaderWithSampling(project=self.project, setting_file=self.setting_file)
        dataset_type_args = self.dataset_type

    df = data_loader.load_data(
      language=self.language, test=self.test, reload=self.dataset_reload, **dataset_type_args
    )

    return df

  def preprocess(self, df):
    if self.project in (435, 211):
      if self.preprocessing is None:
        data_prepro = DefaultENNoPreprocessor()
      elif self.preprocessing == "default":
        data_prepro = DefaultENPreprocessor()
      else:
        raise NotImplementedError

    return data_prepro(
      df=df,
      label_column=self.label_column,
      class_weight=self.perc_training_tox if self.sample_weights == 'class_weight' else None,
      filter_low_agreements=self.filter_low_agreements,
      num_classes=self.num_classes,
    )

  def load_model(self, optimizer):
    smart_bias_value = (
      np.log(self.perc_training_tox / (1 - self.perc_training_tox)) if self.smart_bias_init else 0
    )
    model = load(
      optimizer,
      seed=self.seed,
      trainable=self.trainable,
      model_type=self.model_type,
      loss_name=self.loss_name,
      num_classes=self.num_classes,
      additional_layer=self.additional_layer,
      smart_bias_value=smart_bias_value,
      content_num_classes=self.content_num_classes,
      content_loss_name=self.content_loss_name,
      content_loss_weight=self.content_loss_weight,
    )

    if self.model_reload is not False:
      model_folder = upload_model(full_gcs_model_path=os.path.join(self.model_dir, self.model_reload))
      model.load_weights(model_folder)
      if self.scratch_last_layer:
        print('Putting the last layer back to scratch')
        model.layers[-1] = get_last_layer(
          seed=self.seed,
          num_classes=self.num_classes,
          smart_bias_value=smart_bias_value,
        )

    return model

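The smart bias initialisation above sets the output-layer bias to the log-odds of the positive class, so the untrained model already predicts the toxic prevalence seen in training instead of 0.5. A quick worked example with an assumed 10% toxicity rate:

import numpy as np

perc_training_tox = 0.1  # assumed prevalence, for illustration only
bias = np.log(perc_training_tox / (1 - perc_training_tox))
print(bias)                      # approx. -2.197
print(1 / (1 + np.exp(-bias)))   # sigmoid(bias) is approx. 0.1, i.e. the prevalence
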
  def _train_single_fold(self, mb_generator, test_data, steps_per_epoch, fold, val_data=None):
    steps_per_epoch = 100 if self.test else steps_per_epoch

    optimizer, callbacks = self.get_training_actors(
      steps_per_epoch=steps_per_epoch, val_data=val_data, test_data=test_data, fold=fold
    )
    print("Loading model")
    model = self.load_model(optimizer)
    print(f"Nb of steps per epoch: {steps_per_epoch} ---- launching training")
    training_args = {
      "epochs": self.train_epochs,
      "steps_per_epoch": steps_per_epoch,
      "batch_size": self.mb_size,
      "callbacks": callbacks,
      "verbose": 2,
    }

    model.fit(mb_generator, **training_args)
    return

  def train_full_model(self):
    print("Setting up random seed.")
    set_seeds(self.seed)

    print(f"Loading {self.language} data")
    df = self.load_data()
    df = self.preprocess(df=df)

    print("Going to train on everything but the test dataset")
    mini_batches, test_data, steps_per_epoch = self.mb_loader.simple_cv_load(df)

    self._train_single_fold(
      mb_generator=mini_batches, test_data=test_data, steps_per_epoch=steps_per_epoch, fold="full"
    )

  def train(self):
    print("Setting up random seed.")
    set_seeds(self.seed)

    print(f"Loading {self.language} data")
    df = self.load_data()
    df = self.preprocess(df=df)

    print("Loading MB generator")
    i = 0
    if self.project in (435, 211):
      mb_generator, steps_per_epoch, val_data, test_data = self.mb_loader.no_cv_load(full_df=df)
      self._train_single_fold(
        mb_generator=mb_generator,
        val_data=val_data,
        test_data=test_data,
        steps_per_epoch=steps_per_epoch,
        fold=i,
      )
    else:
      raise ValueError("Are you sure you want to do multiple-fold training?")
      # Multi-fold path, unreachable unless the raise above is removed;
      # it trains at most three folds.
      for mb_generator, steps_per_epoch, val_data, test_data in self.mb_loader(full_df=df):
        self._train_single_fold(
          mb_generator=mb_generator,
          val_data=val_data,
          test_data=test_data,
          steps_per_epoch=steps_per_epoch,
          fold=i,
        )
        i += 1
        if i == 3:
          break

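For orientation, a hypothetical end-to-end invocation of this trainer; the class name (Trainer) and all argument values are assumptions for illustration, not taken from this diff:

trainer = Trainer(              # assumed class name
  optimizer_name="Adam",
  weight_decay=0.01,
  learning_rate=1e-5,
  mb_size=32,
  train_epochs=4,
  project=435,
  experiment_id="default",
)
trainer.train()                 # single-fold training with validation/test logging
# trainer.train_full_model()    # or: train on everything except the test set
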
0
trust_and_safety_models/toxicity/utils/__init__.py
Normal file
99
trust_and_safety_models/toxicity/utils/helpers.py
Normal file
@ -0,0 +1,99 @@
import bisect
import os
import random as python_random
import subprocess

from toxicity_ml_pipeline.settings.default_settings_tox import LOCAL_DIR

import numpy as np
from sklearn.metrics import precision_recall_curve

try:
  import tensorflow as tf
except ModuleNotFoundError:
  pass


def upload_model(full_gcs_model_path):
  """Fetches model weights from GCS into the local models directory and returns the local path."""
  folder_name = full_gcs_model_path
  if folder_name[:5] != "gs://":
    folder_name = "gs://" + folder_name

  dirname = os.path.dirname(folder_name)
  epoch = os.path.basename(folder_name)

  model_dir = os.path.join(LOCAL_DIR, "models")
  cmd = f"mkdir {model_dir}"
  try:
    execute_command(cmd)
  except subprocess.CalledProcessError:
    pass
  model_dir = os.path.join(model_dir, os.path.basename(dirname))
  cmd = f"mkdir {model_dir}"
  try:
    execute_command(cmd)
  except subprocess.CalledProcessError:
    pass

  try:
    _ = int(epoch)
  except ValueError:
    # Path does not end in an epoch number: sync the whole folder.
    cmd = f"gsutil rsync -r '{folder_name}' {model_dir}"
    weights_dir = model_dir
  else:
    # Path ends in an epoch number: copy only that checkpoint.
    cmd = f"gsutil cp '{dirname}/checkpoint' {model_dir}/"
    execute_command(cmd)
    cmd = f"gsutil cp '{os.path.join(dirname, epoch)}*' {model_dir}/"
    weights_dir = f"{model_dir}/{epoch}"

  execute_command(cmd)
  return weights_dir

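A hedged usage sketch for the helper above; the GCS path is hypothetical and gsutil must be available on the machine:

# Hypothetical path: a checkpoint the trainer saved at epoch 03.
weights_dir = upload_model("gs://my-bucket/models/en_experiment/03")
# weights_dir now points at the local copy under LOCAL_DIR/models and can be
# passed to model.load_weights(weights_dir).
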
def compute_precision_fixed_recall(labels, preds, fixed_recall):
  precision_values, recall_values, thresholds = precision_recall_curve(y_true=labels, probas_pred=preds)
  # recall_values is non-increasing, so bisect on its negation to find the
  # operating point whose recall is just above the requested fixed recall.
  index_recall = bisect.bisect_left(-recall_values, -1 * fixed_recall)
  result = precision_values[index_recall - 1]
  print(f"Precision at {recall_values[index_recall-1]} recall: {result}")

  return result, thresholds[index_recall - 1]

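A minimal, self-contained example of the precision-at-fixed-recall helper above, with toy labels and scores:

import numpy as np

labels = np.array([0, 0, 1, 1, 1, 0, 1, 0])
scores = np.array([0.1, 0.4, 0.35, 0.8, 0.7, 0.2, 0.9, 0.6])
precision, threshold = compute_precision_fixed_recall(labels, scores, fixed_recall=0.75)
# precision is measured at the last operating point whose recall is still above 0.75,
# and threshold is the score cut-off that achieves it.
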
def load_inference_func(model_folder):
  model = tf.saved_model.load(model_folder, ["serve"])
  inference_func = model.signatures["serving_default"]
  return inference_func


def execute_query(client, query):
  job = client.query(query)
  df = job.result().to_dataframe()
  return df


def execute_command(cmd, print_=True):
  s = subprocess.run(cmd, shell=True, capture_output=print_, check=True)
  if print_:
    print(s.stderr.decode("utf-8"))
    print(s.stdout.decode("utf-8"))


def check_gpu():
  try:
    execute_command("nvidia-smi")
  except subprocess.CalledProcessError:
    print("There is no GPU when there should be one.")
    raise AttributeError

  l = tf.config.list_physical_devices("GPU")
  if len(l) == 0:
    raise ModuleNotFoundError("Tensorflow has not found the GPU. Check your installation")
  print(l)


def set_seeds(seed):
  np.random.seed(seed)

  python_random.seed(seed)

  tf.random.set_seed(seed)