improvements from external prs

-fix corner case where dr converter failed when initializing

Closes twitter/the-algorithm#550
This commit is contained in:
twitter-team
2023-04-05 16:08:19 -07:00
parent 23fa75d406
commit 31e82d6474
31 changed files with 305 additions and 244 deletions

View File

@ -44,5 +44,6 @@ pub struct RenamedFeatures {
}
pub fn parse(json_str: &str) -> Result<AllConfig, Error> {
serde_json::from_str(json_str)
let all_config: AllConfig = serde_json::from_str(json_str)?;
Ok(all_config)
}

View File

@ -2,6 +2,9 @@ use std::collections::BTreeSet;
use std::fmt::{self, Debug, Display};
use std::fs;
use crate::all_config;
use crate::all_config::AllConfig;
use anyhow::{bail, Context};
use bpr_thrift::data::DataRecord;
use bpr_thrift::prediction_service::BatchPredictionRequest;
use bpr_thrift::tensor::GeneralTensor;
@ -16,8 +19,6 @@ use segdense::util;
use thrift::protocol::{TBinaryInputProtocol, TSerializable};
use thrift::transport::TBufferChannel;
use crate::{all_config, all_config::AllConfig};
pub fn log_feature_match(
dr: &DataRecord,
seg_dense_config: &DensificationTransformSpec,
@ -28,20 +29,24 @@ pub fn log_feature_match(
for (feature_id, feature_value) in dr.continuous_features.as_ref().unwrap() {
debug!(
"{dr_type} - Continuous Datarecord => Feature ID: {feature_id}, Feature value: {feature_value}"
"{} - Continous Datarecord => Feature ID: {}, Feature value: {}",
dr_type, feature_id, feature_value
);
for input_feature in &seg_dense_config.cont.input_features {
if input_feature.feature_id == *feature_id {
debug!("Matching input feature: {input_feature:?}")
debug!("Matching input feature: {:?}", input_feature)
}
}
}
for feature_id in dr.binary_features.as_ref().unwrap() {
debug!("{dr_type} - Binary Datarecord => Feature ID: {feature_id}");
debug!(
"{} - Binary Datarecord => Feature ID: {}",
dr_type, feature_id
);
for input_feature in &seg_dense_config.binary.input_features {
if input_feature.feature_id == *feature_id {
debug!("Found input feature: {input_feature:?}")
debug!("Found input feature: {:?}", input_feature)
}
}
}
@ -90,18 +95,19 @@ impl BatchPredictionRequestToTorchTensorConverter {
model_version: &str,
reporting_feature_ids: Vec<(i64, &str)>,
register_metric_fn: Option<impl Fn(&HistogramVec)>,
) -> BatchPredictionRequestToTorchTensorConverter {
let all_config_path = format!("{model_dir}/{model_version}/all_config.json");
let seg_dense_config_path =
format!("{model_dir}/{model_version}/segdense_transform_spec_home_recap_2022.json");
let seg_dense_config = util::load_config(&seg_dense_config_path);
) -> anyhow::Result<BatchPredictionRequestToTorchTensorConverter> {
let all_config_path = format!("{}/{}/all_config.json", model_dir, model_version);
let seg_dense_config_path = format!(
"{}/{}/segdense_transform_spec_home_recap_2022.json",
model_dir, model_version
);
let seg_dense_config = util::load_config(&seg_dense_config_path)?;
let all_config = all_config::parse(
&fs::read_to_string(&all_config_path)
.unwrap_or_else(|error| panic!("error loading all_config.json - {error}")),
)
.unwrap();
.with_context(|| "error loading all_config.json - ")?,
)?;
let feature_mapper = util::load_from_parsed_config_ref(&seg_dense_config);
let feature_mapper = util::load_from_parsed_config(seg_dense_config.clone())?;
let user_embedding_feature_id = Self::get_feature_id(
&all_config
@ -131,11 +137,11 @@ impl BatchPredictionRequestToTorchTensorConverter {
let (discrete_feature_metrics, continuous_feature_metrics) = METRICS.get_or_init(|| {
let discrete = HistogramVec::new(
HistogramOpts::new(":navi:feature_id:discrete", "Discrete Feature ID values")
.buckets(Vec::from([
0.0f64, 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0, 110.0,
.buckets(Vec::from(&[
0.0, 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0, 110.0,
120.0, 130.0, 140.0, 150.0, 160.0, 170.0, 180.0, 190.0, 200.0, 250.0,
300.0, 500.0, 1000.0, 10000.0, 100000.0,
])),
] as &'static [f64])),
&["feature_id"],
)
.expect("metric cannot be created");
@ -144,18 +150,18 @@ impl BatchPredictionRequestToTorchTensorConverter {
":navi:feature_id:continuous",
"continuous Feature ID values",
)
.buckets(Vec::from([
0.0f64, 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0, 110.0,
120.0, 130.0, 140.0, 150.0, 160.0, 170.0, 180.0, 190.0, 200.0, 250.0, 300.0,
500.0, 1000.0, 10000.0, 100000.0,
])),
.buckets(Vec::from(&[
0.0, 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0, 110.0, 120.0,
130.0, 140.0, 150.0, 160.0, 170.0, 180.0, 190.0, 200.0, 250.0, 300.0, 500.0,
1000.0, 10000.0, 100000.0,
] as &'static [f64])),
&["feature_id"],
)
.expect("metric cannot be created");
if let Some(r) = register_metric_fn {
register_metric_fn.map(|r| {
r(&discrete);
r(&continuous);
}
});
(discrete, continuous)
});
@ -164,13 +170,16 @@ impl BatchPredictionRequestToTorchTensorConverter {
for (feature_id, feature_type) in reporting_feature_ids.iter() {
match *feature_type {
"discrete" => discrete_features_to_report.insert(*feature_id),
"continuous" => continuous_features_to_report.insert(*feature_id),
_ => panic!("Invalid feature type {feature_type} for reporting metrics!"),
"discrete" => discrete_features_to_report.insert(feature_id.clone()),
"continuous" => continuous_features_to_report.insert(feature_id.clone()),
_ => bail!(
"Invalid feature type {} for reporting metrics!",
feature_type
),
};
}
BatchPredictionRequestToTorchTensorConverter {
Ok(BatchPredictionRequestToTorchTensorConverter {
all_config,
seg_dense_config,
all_config_path,
@ -183,7 +192,7 @@ impl BatchPredictionRequestToTorchTensorConverter {
continuous_features_to_report,
discrete_feature_metrics,
continuous_feature_metrics,
}
})
}
fn get_feature_id(feature_name: &str, seg_dense_config: &Root) -> i64 {
@ -218,43 +227,45 @@ impl BatchPredictionRequestToTorchTensorConverter {
let mut working_set = vec![0 as f32; total_size];
let mut bpr_start = 0;
for (bpr, &bpr_end) in bprs.iter().zip(batch_size) {
if bpr.common_features.is_some()
&& bpr.common_features.as_ref().unwrap().tensors.is_some()
&& bpr
.common_features
.as_ref()
.unwrap()
.tensors
.as_ref()
.unwrap()
.contains_key(&feature_id)
{
let source_tensor = bpr
.common_features
.as_ref()
.unwrap()
.tensors
.as_ref()
.unwrap()
.get(&feature_id)
.unwrap();
let tensor = match source_tensor {
GeneralTensor::FloatTensor(float_tensor) =>
//Tensor::of_slice(
if bpr.common_features.is_some() {
if bpr.common_features.as_ref().unwrap().tensors.is_some() {
if bpr
.common_features
.as_ref()
.unwrap()
.tensors
.as_ref()
.unwrap()
.contains_key(&feature_id)
{
float_tensor
.floats
.iter()
.map(|x| x.into_inner() as f32)
.collect::<Vec<_>>()
}
_ => vec![0 as f32; cols],
};
let source_tensor = bpr
.common_features
.as_ref()
.unwrap()
.tensors
.as_ref()
.unwrap()
.get(&feature_id)
.unwrap();
let tensor = match source_tensor {
GeneralTensor::FloatTensor(float_tensor) =>
//Tensor::of_slice(
{
float_tensor
.floats
.iter()
.map(|x| x.into_inner() as f32)
.collect::<Vec<_>>()
}
_ => vec![0 as f32; cols],
};
// since the tensor is found in common feature, add it in all batches
for row in bpr_start..bpr_end {
for col in 0..cols {
working_set[row * cols + col] = tensor[col];
// since the tensor is found in common feature, add it in all batches
for row in bpr_start..bpr_end {
for col in 0..cols {
working_set[row * cols + col] = tensor[col];
}
}
}
}
}
@ -298,9 +309,9 @@ impl BatchPredictionRequestToTorchTensorConverter {
// (INT64 --> INT64, DataRecord.discrete_feature)
fn get_continuous(&self, bprs: &[BatchPredictionRequest], batch_ends: &[usize]) -> InputTensor {
// These need to be part of model schema
let rows = batch_ends[batch_ends.len() - 1];
let cols = 5293;
let full_size = rows * cols;
let rows: usize = batch_ends[batch_ends.len() - 1];
let cols: usize = 5293;
let full_size: usize = rows * cols;
let default_val = f32::NAN;
let mut tensor = vec![default_val; full_size];
@ -325,15 +336,18 @@ impl BatchPredictionRequestToTorchTensorConverter {
.unwrap();
for feature in common_features {
if let Some(f_info) = self.feature_mapper.get(feature.0) {
let idx = f_info.index_within_tensor as usize;
if idx < cols {
// Set value in each row
for r in bpr_start..bpr_end {
let flat_index = r * cols + idx;
tensor[flat_index] = feature.1.into_inner() as f32;
match self.feature_mapper.get(feature.0) {
Some(f_info) => {
let idx = f_info.index_within_tensor as usize;
if idx < cols {
// Set value in each row
for r in bpr_start..bpr_end {
let flat_index: usize = r * cols + idx;
tensor[flat_index] = feature.1.into_inner() as f32;
}
}
}
None => (),
}
if self.continuous_features_to_report.contains(feature.0) {
self.continuous_feature_metrics
@ -349,24 +363,28 @@ impl BatchPredictionRequestToTorchTensorConverter {
// Process the batch of datarecords
for r in bpr_start..bpr_end {
let dr: &DataRecord = &bpr.individual_features_list[r - bpr_start];
let dr: &DataRecord =
&bpr.individual_features_list[usize::try_from(r - bpr_start).unwrap()];
if dr.continuous_features.is_some() {
for feature in dr.continuous_features.as_ref().unwrap() {
if let Some(f_info) = self.feature_mapper.get(feature.0) {
let idx = f_info.index_within_tensor as usize;
let flat_index = r * cols + idx;
if flat_index < tensor.len() && idx < cols {
tensor[flat_index] = feature.1.into_inner() as f32;
match self.feature_mapper.get(&feature.0) {
Some(f_info) => {
let idx = f_info.index_within_tensor as usize;
let flat_index: usize = r * cols + idx;
if flat_index < tensor.len() && idx < cols {
tensor[flat_index] = feature.1.into_inner() as f32;
}
}
None => (),
}
if self.continuous_features_to_report.contains(feature.0) {
self.continuous_feature_metrics
.with_label_values(&[feature.0.to_string().as_str()])
.observe(feature.1.into_inner())
.observe(feature.1.into_inner() as f64)
} else if self.discrete_features_to_report.contains(feature.0) {
self.discrete_feature_metrics
.with_label_values(&[feature.0.to_string().as_str()])
.observe(feature.1.into_inner())
.observe(feature.1.into_inner() as f64)
}
}
}
@ -383,10 +401,10 @@ impl BatchPredictionRequestToTorchTensorConverter {
fn get_binary(&self, bprs: &[BatchPredictionRequest], batch_ends: &[usize]) -> InputTensor {
// These need to be part of model schema
let rows = batch_ends[batch_ends.len() - 1];
let cols = 149;
let full_size = rows * cols;
let default_val = 0;
let rows: usize = batch_ends[batch_ends.len() - 1];
let cols: usize = 149;
let full_size: usize = rows * cols;
let default_val: i64 = 0;
let mut v = vec![default_val; full_size];
@ -410,15 +428,18 @@ impl BatchPredictionRequestToTorchTensorConverter {
.unwrap();
for feature in common_features {
if let Some(f_info) = self.feature_mapper.get(feature) {
let idx = f_info.index_within_tensor as usize;
if idx < cols {
// Set value in each row
for r in bpr_start..bpr_end {
let flat_index = r * cols + idx;
v[flat_index] = 1;
match self.feature_mapper.get(feature) {
Some(f_info) => {
let idx = f_info.index_within_tensor as usize;
if idx < cols {
// Set value in each row
for r in bpr_start..bpr_end {
let flat_index: usize = r * cols + idx;
v[flat_index] = 1;
}
}
}
None => (),
}
}
}
@ -428,10 +449,13 @@ impl BatchPredictionRequestToTorchTensorConverter {
let dr: &DataRecord = &bpr.individual_features_list[r - bpr_start];
if dr.binary_features.is_some() {
for feature in dr.binary_features.as_ref().unwrap() {
if let Some(f_info) = self.feature_mapper.get(feature) {
let idx = f_info.index_within_tensor as usize;
let flat_index = r * cols + idx;
v[flat_index] = 1;
match self.feature_mapper.get(&feature) {
Some(f_info) => {
let idx = f_info.index_within_tensor as usize;
let flat_index: usize = r * cols + idx;
v[flat_index] = 1;
}
None => (),
}
}
}
@ -448,10 +472,10 @@ impl BatchPredictionRequestToTorchTensorConverter {
#[allow(dead_code)]
fn get_discrete(&self, bprs: &[BatchPredictionRequest], batch_ends: &[usize]) -> InputTensor {
// These need to be part of model schema
let rows = batch_ends[batch_ends.len() - 1];
let cols = 320;
let full_size = rows * cols;
let default_val = 0;
let rows: usize = batch_ends[batch_ends.len() - 1];
let cols: usize = 320;
let full_size: usize = rows * cols;
let default_val: i64 = 0;
let mut v = vec![default_val; full_size];
@ -475,15 +499,18 @@ impl BatchPredictionRequestToTorchTensorConverter {
.unwrap();
for feature in common_features {
if let Some(f_info) = self.feature_mapper.get(feature.0) {
let idx = f_info.index_within_tensor as usize;
if idx < cols {
// Set value in each row
for r in bpr_start..bpr_end {
let flat_index = r * cols + idx;
v[flat_index] = *feature.1;
match self.feature_mapper.get(feature.0) {
Some(f_info) => {
let idx = f_info.index_within_tensor as usize;
if idx < cols {
// Set value in each row
for r in bpr_start..bpr_end {
let flat_index: usize = r * cols + idx;
v[flat_index] = *feature.1;
}
}
}
None => (),
}
if self.discrete_features_to_report.contains(feature.0) {
self.discrete_feature_metrics
@ -495,15 +522,18 @@ impl BatchPredictionRequestToTorchTensorConverter {
// Process the batch of datarecords
for r in bpr_start..bpr_end {
let dr: &DataRecord = &bpr.individual_features_list[r];
let dr: &DataRecord = &bpr.individual_features_list[usize::try_from(r).unwrap()];
if dr.discrete_features.is_some() {
for feature in dr.discrete_features.as_ref().unwrap() {
if let Some(f_info) = self.feature_mapper.get(feature.0) {
let idx = f_info.index_within_tensor as usize;
let flat_index = r * cols + idx;
if flat_index < v.len() && idx < cols {
v[flat_index] = *feature.1;
match self.feature_mapper.get(&feature.0) {
Some(f_info) => {
let idx = f_info.index_within_tensor as usize;
let flat_index: usize = r * cols + idx;
if flat_index < v.len() && idx < cols {
v[flat_index] = *feature.1;
}
}
None => (),
}
if self.discrete_features_to_report.contains(feature.0) {
self.discrete_feature_metrics
@ -569,7 +599,7 @@ impl Converter for BatchPredictionRequestToTorchTensorConverter {
.map(|bpr| bpr.individual_features_list.len())
.scan(0usize, |acc, e| {
//running total
*acc += e;
*acc = *acc + e;
Some(*acc)
})
.collect::<Vec<_>>();