Source code for sgnlp.models.rumour_detection_twitter.tokenization

from transformers import TransfoXLTokenizer


class RumourDetectionTwitterTokenizer(TransfoXLTokenizer):
    """
    This Tokenizer class performs word-level tokenization to generate tokens.

    Args:
        vocab_file (:obj:`str`): path to the vocabulary file used to build the vocabulary.

    Example::

        # 1. From local vocab file
        vocab_file = 'vocab.txt'
        tokenizer = RumourDetectionTwitterTokenizer(vocab_file=vocab_file)
        tokenizer.build_vocab()
        token_ids, token_attention_masks = tokenizer.tokenize_threads(
            [
                ["The quick brown fox", "jumped over the lazy dog"],
                [
                    "Are those shy Eurasian footwear",
                    "cowboy chaps",
                    "or jolly earthmoving headgear?",
                ],
            ],
            max_length=32,
            max_posts=4,
            padding="max_length",
            truncation=True,
        )

        # 2. Download pretrained from storage
        # TODO
    """

    def __init__(self, *args, vocab_file, **kwargs):
        super().__init__(*args, **kwargs)
        # Store the vocab path so that `build_vocab` (inherited from
        # TransfoXLTokenizer) can build the vocabulary from this file.
        self.vocab_file = vocab_file
        # self.unk_idx = self.unk_token_id
    def tokenize_threads(self, threads, max_length=None, max_posts=None, **kwargs):
        """
        This function performs tokenization on a batch of Twitter threads and returns the token ids and attention masks for each tweet.

        Args:
            threads (List[List[str]]): A batch of threads containing the raw text from the Tweets to be tokenized.
            max_length (int): Maximum number of tokens in a single Tweet
            max_posts (int): Maximum number of Tweets in a single thread

        Returns:
            List[List[List[int]]]: token ids for each token in each Tweet. Each tweet/thread would have been padded (or truncated) to `max_length`/`max_posts` respectively.
            List[List[List[int]]]: attention mask (0 or 1) for each token in each Tweet. Each tweet/thread would have been padded (or truncated) to `max_length`/`max_posts` respectively.
        """
        assert (
            max_length is not None
        ), "Please specify the maximum sequence length for each tweet. This is required to fit all the token indices in a single tensor."
        assert (
            max_posts is not None
        ), "Please specify the maximum number of tweets per thread. This is required to pad every thread to the same number of posts."

        input_ids = []
        attention_masks = []

        for thread in threads:
            # Tokenize each tweet in the thread. Padding/truncation options
            # (e.g. padding="max_length") are forwarded through **kwargs.
            thread_input_ids = self.__call__(
                thread, max_length=max_length, **kwargs
            ).input_ids
            fully_padded_thread = [self.pad_token_id] * max_length
            # Mark real tokens with 1 and padding tokens with 0.
            token_masks = [
                [0 if token_id == self.pad_token_id else 1 for token_id in tweet]
                for tweet in thread_input_ids
            ]

            # Truncate threads longer than `max_posts`; pad shorter threads
            # with fully padded tweets so every thread has the same shape.
            input_ids += [
                thread_input_ids[:max_posts]
                + [fully_padded_thread] * (max_posts - len(thread))
            ]
            attention_masks += [
                token_masks[:max_posts]
                + [[0] * max_length] * (max_posts - len(thread))
            ]

        return input_ids, attention_masks
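
# ---------------------------------------------------------------------------
# Usage sketch (illustration only, not part of the original module). It
# assumes a plain one-symbol-per-line vocab file and relies on passing
# pad_token="<pad>" so that `pad_token_id` is defined. The file name
# `demo_vocab.txt` and the tiny vocabulary below are hypothetical; the real
# sgnlp vocab setup may differ.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Write a minimal vocabulary: one symbol per line, "<pad>" at index 0.
    with open("demo_vocab.txt", "w", encoding="utf-8") as f:
        f.write("\n".join(["<pad>", "<unk>", "<eos>", "the", "quick", "brown", "fox"]))

    tokenizer = RumourDetectionTwitterTokenizer(
        vocab_file="demo_vocab.txt", pad_token="<pad>"
    )
    tokenizer.build_vocab()

    token_ids, attention_masks = tokenizer.tokenize_threads(
        [["the quick brown fox", "the fox"]],
        max_length=8,
        max_posts=3,
        padding="max_length",
        truncation=True,
    )

    # One thread, padded to 3 posts of 8 tokens each.
    assert len(token_ids[0]) == 3 and len(token_ids[0][0]) == 8
    # Second tweet ("the fox"): 1s for the two real tokens, 0s for padding.
    assert attention_masks[0][1] == [1, 1, 0, 0, 0, 0, 0, 0]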