mirror of
https://github.com/twitter/the-algorithm.git
synced 2025-06-18 18:48:15 -05:00
Twitter Recommendation Algorithm
Please note we have force-pushed a new initial commit in order to remove some publicly-available Twitter user information. Note that this process may be required in the future.
This commit is contained in:
240
science/search/ingester/config/pipeline-ingester.realtime.xml
Normal file
240
science/search/ingester/config/pipeline-ingester.realtime.xml
Normal file
@ -0,0 +1,240 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
|
||||
|
||||
<!-- Ingesters process tweet create events from TweetyPie and write them to a queue for Earlybird
|
||||
to index. -->
|
||||
<pipeline>
|
||||
<property
|
||||
propName="validator"
|
||||
className="org.apache.commons.pipeline.validation.SimplePipelineValidator"/>
|
||||
<listener
|
||||
className="org.apache.commons.pipeline.listener.ObjectProcessedEventCounter"/>
|
||||
<driverFactory
|
||||
className="org.apache.commons.pipeline.driver.DedicatedThreadStageDriverFactory"
|
||||
id="kafka">
|
||||
|
||||
<property
|
||||
propName="queueFactory"
|
||||
className="org.apache.commons.pipeline.util.BlockingQueueFactory$ArrayBlockingQueueFactory"
|
||||
capacity="1000"
|
||||
fair="false"/>
|
||||
</driverFactory>
|
||||
|
||||
<!-- Read tweets from the thrift kafka queue. The reader loops forever. -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.kafka.KafkaRawRecordConsumerStage"
|
||||
kafkaClusterPath=""
|
||||
kafkaClientId=""
|
||||
kafkaTopicName=""
|
||||
kafkaConsumerGroupId=""
|
||||
maxPollRecords="1"
|
||||
pollTimeoutMs="1000"
|
||||
partitioned="false"
|
||||
deciderKey=""
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- Deserialize the bytes into TweetData -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.TweetEventDeserializerStage"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- Filter to only have the safetytype for this cluster -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.FilterEventsBySafetyTypeStage"
|
||||
tweetCreateLatencyLogThresholdMillis="5000"
|
||||
safetyType="PUBLIC"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- Parse to TwitterMessage -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.ThriftTweetParserStage"
|
||||
tweetCreateEventBranchNames="kafka_retweet_and_reply"
|
||||
tweetDeleteEventBranchNames="kafka_update_events_delete"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<branch>
|
||||
<pipeline key="kafka_update_events_delete">
|
||||
<property
|
||||
propName="validator"
|
||||
className="org.apache.commons.pipeline.validation.SimplePipelineValidator"/>
|
||||
<listener
|
||||
className="org.apache.commons.pipeline.listener.ObjectProcessedEventCounter"/>
|
||||
<driverFactory
|
||||
className="org.apache.commons.pipeline.driver.DedicatedThreadStageDriverFactory"
|
||||
id="kafka_update_events_delete">
|
||||
|
||||
<!-- we are willing to queue more deletes than other stages,
|
||||
to make sure we don't slow down the incoming tweets -->
|
||||
<property
|
||||
propName="queueFactory"
|
||||
className="org.apache.commons.pipeline.util.BlockingQueueFactory$ArrayBlockingQueueFactory"
|
||||
capacity="1000"
|
||||
fair="false"/>
|
||||
</driverFactory>
|
||||
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.kafka.DeleteUpdateEventsKafkaProducerStage"
|
||||
kafkaClusterPath=""
|
||||
kafkaClientId=""
|
||||
kafkaTopicName=""
|
||||
driverFactoryId="kafka_update_events_delete"/>
|
||||
</pipeline>
|
||||
</branch>
|
||||
|
||||
<!-- Processes retweets and replies to tweets -->
|
||||
<branch>
|
||||
<pipeline key="kafka_retweet_and_reply">
|
||||
<property
|
||||
propName="validator"
|
||||
className="org.apache.commons.pipeline.validation.SimplePipelineValidator"/>
|
||||
<listener
|
||||
className="org.apache.commons.pipeline.listener.ObjectProcessedEventCounter"/>
|
||||
<driverFactory
|
||||
className="org.apache.commons.pipeline.driver.DedicatedThreadStageDriverFactory"
|
||||
id="kafka_retweet_and_reply">
|
||||
|
||||
<property
|
||||
propName="queueFactory"
|
||||
className="org.apache.commons.pipeline.util.BlockingQueueFactory$ArrayBlockingQueueFactory"
|
||||
capacity="1000"
|
||||
fair="false"/>
|
||||
</driverFactory>
|
||||
|
||||
<!-- An incoming reply to this stage can either be a tweet directed at someone using @mention, or
|
||||
a tweet that is a direct reply to another tweet. This stage filters retweets and tweets that are
|
||||
direct replies to other tweets into the retweet_and_reply pipeline -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.FilterRetweetsAndRepliesStage"
|
||||
driverFactoryId="kafka_retweet_and_reply"/>
|
||||
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.ConvertToThriftVersionedEventsStage"
|
||||
driverFactoryId="kafka_retweet_and_reply"/>
|
||||
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.kafka.RetweetAndReplyUpdateEventsKafkaProducerStage"
|
||||
kafkaClusterPath=""
|
||||
kafkaClientId=""
|
||||
kafkaTopicName=""
|
||||
driverFactoryId="kafka_retweet_and_reply"/>
|
||||
</pipeline>
|
||||
</branch>
|
||||
|
||||
<!-- filters out messages that are not formatted correctly -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.FilterTwitterMessageStage"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- retrieves space ids from space urls if the tweet has space urls -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.RetrieveSpaceIdsStage"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
|
||||
<!-- looks up user reputation scores for each message -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.LookupUserPropertiesBatchedStage"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- extract text features of the message -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.TextFeatureExtractionWorkersStage"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- compute text quality score of the message -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.TextQualityEvaluationWorkerStage"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- Extract lat/lon pairs from the text, and geocode them -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.SingleTweetExtractAndGeocodeLatLonStage"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- adds coded locations -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.PopulateCodedLocationsBatchedStage"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- Parse the TwitterMessages into ThriftStatuses -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.ConvertMessageToThriftStage"
|
||||
thriftVersionedEventsBranchName="kafka_base_tweets"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- Branch for tweets -->
|
||||
<branch>
|
||||
<pipeline key="kafka_base_tweets">
|
||||
<property
|
||||
propName="validator"
|
||||
className="org.apache.commons.pipeline.validation.SimplePipelineValidator"/>
|
||||
<listener
|
||||
className="org.apache.commons.pipeline.listener.ObjectProcessedEventCounter"/>
|
||||
<driverFactory
|
||||
className="org.apache.commons.pipeline.driver.DedicatedThreadStageDriverFactory"
|
||||
id="kafka_base_tweets">
|
||||
|
||||
<property
|
||||
propName="queueFactory"
|
||||
className="org.apache.commons.pipeline.util.BlockingQueueFactory$ArrayBlockingQueueFactory"
|
||||
capacity="1000"
|
||||
fair="false"/>
|
||||
</driverFactory>
|
||||
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.kafka.TweetThriftVersionedEventsKafkaProducerStage"
|
||||
kafkaClusterPath=""
|
||||
kafkaClientId=""
|
||||
kafkaTopicName="search_ingester_indexing_events_realtime_prod"
|
||||
driverFactoryId="kafka_base_tweets"/>
|
||||
</pipeline>
|
||||
</branch>
|
||||
|
||||
<!-- Resolve compressed URL via Pink -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.ResolveCompressedUrlsBatchedStage"
|
||||
pinkClientId="INGESTER"
|
||||
batchedStageBatchSize="10"
|
||||
tweetMaxAgeToResolve="10000"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- Retrieve card information -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.RetrieveCardBatchedStage"
|
||||
tweetypieClientId="ingester.prod"
|
||||
internalBatchSize="50"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- Retrieve named entities -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.RetrieveNamedEntitiesSingleTweetStage"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- retrieves space admins and title for a tweet if the tweet has space urls -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.RetrieveSpaceAdminsAndTitleStage"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- extract text features of the message -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.TextUrlsFeatureExtractionStage"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- Compute the tweet signature -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.ComputeTweetSignatureStage"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- Parse the TwitterMessages into ThriftStatuses -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.ConvertDelayedMessageToThriftStage"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.kafka.TweetThriftVersionedEventsKafkaProducerStage"
|
||||
kafkaClusterPath=""
|
||||
stageName="UpdateEvents"
|
||||
kafkaClientId=""
|
||||
kafkaTopicName=""
|
||||
driverFactoryId="kafka"/>
|
||||
</pipeline>
|
Reference in New Issue
Block a user