Source code for sgnlp.models.sentic_gcn.tokenization

import pathlib
import pickle
from typing import Dict, List, Optional, Tuple

import torch

from transformers import PreTrainedTokenizer, BertTokenizer


VOCAB_FILES_NAMES = {"vocab_file": "vocab.pkl"}


class SenticGCNTokenizer(PreTrainedTokenizer):
    """
    The SenticGCN tokenizer class used to generate tokens for the embedding model.

    Args:
        text (:obj:`str`): input text string to tokenize

    Example::
        tokenizer = SenticGCNTokenizer.from_pretrained("senticgcn")
        inputs = tokenizer('Hello World!')
        inputs['input_ids']
    """

    vocab_files_names = VOCAB_FILES_NAMES

    def __init__(
        self,
        vocab_file: str = None,
        train_files: List[str] = None,
        train_vocab: bool = False,
        do_lower_case: bool = True,
        unk_token: str = "<unk>",
        pad_token: str = "<pad>",
        **kwargs,
    ):
        super().__init__(
            do_lower_case=do_lower_case,
            unk_token=unk_token,
            pad_token=pad_token,
            **kwargs,
        )
        self.do_lower_case = do_lower_case
        if train_vocab:
            self.vocab = self.create_vocab(train_files)
        else:
            with open(vocab_file, "rb") as fin:
                self.vocab = pickle.load(fin)
        self.ids_to_tokens = {v: k for k, v in self.vocab.items()}

    @property
    def vocab_size(self):
        return len(self.vocab)

    def get_vocab(self):
        return dict(self.vocab)

    def _convert_token_to_id(self, token: str) -> int:
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    def _convert_id_to_token(self, index: int) -> str:
        return self.ids_to_tokens.get(index, self.unk_token)

    @staticmethod
    def __read_text_file(file_names: List[str]) -> str:
        """
        Helper method to read contents of a list of text files.

        Args:
            file_names (List[str]): list of text files to read.

        Returns:
            str: return a concatenated string of text files contents.
        """
        text = ""
        for fname in file_names:
            with open(fname, "r", encoding="utf-8", newline="\n", errors="ignore") as fin:
                lines = fin.readlines()
                for i in range(0, len(lines), 3):
                    text_left, _, text_right = [s.lower().strip() for s in lines[i].partition("$T$")]
                    aspect = lines[i + 1].lower().strip()
                    text += f"{text_left} {aspect} {text_right} "  # Leave a space at the end
        return text

    def create_vocab(self, train_files: List[str]) -> Dict[str, int]:
        text = SenticGCNTokenizer.__read_text_file(train_files)
        if self.do_lower_case:
            text = text.lower()
        vocab = {}
        vocab[self.pad_token] = 0
        vocab[self.unk_token] = 1
        offset = len(vocab.keys())
        words = text.split()
        for word in words:
            if word not in vocab:
                vocab[word] = offset
                offset += 1
        return vocab

    def _tokenize(self, text, **kwargs):
        if self.do_lower_case:
            text = text.lower()
        words = text.split()
        return words

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        save_dir = pathlib.Path(save_directory)
        save_dir.mkdir(exist_ok=True)
        vocab_file_path = save_dir.joinpath("vocab.pkl")
        with open(vocab_file_path, "wb") as fout:
            pickle.dump(self.vocab, fout)
        return (str(vocab_file_path),)
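
# Usage sketch (the file names and output directory below are illustrative only,
# not part of this module). Training files are expected in the three-lines-per-sample
# format parsed by create_vocab: the sentence with the aspect replaced by "$T$",
# then the aspect term, then a third line that create_vocab skips, e.g.
#
#   the $T$ was great but the service was slow
#   battery life
#   1
#
# tokenizer = SenticGCNTokenizer(train_files=["train.raw"], train_vocab=True)
# tokenizer.save_vocabulary("senticgcn_tokenizer")
# reloaded = SenticGCNTokenizer(vocab_file="senticgcn_tokenizer/vocab.pkl")
# reloaded("the battery life was great")["input_ids"]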


class SenticGCNBertTokenizer(BertTokenizer):
    """
    The SenticGCN Bert tokenizer class used to generate tokens for the embedding model,
    derived from the BertTokenizer class.

    Args:
        text (:obj:`str`): input text string to tokenize

    Example::
        tokenizer = SenticGCNBertTokenizer.from_pretrained('bert-base-uncased')
        inputs = tokenizer('Hello World!')
        inputs['input_ids']
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def __call__(
        self,
        text,
        max_length: int = 85,
        add_special_tokens: bool = False,
        padding: bool = True,
        truncation: bool = True,
        return_token_type_ids: bool = False,
        return_attention_mask: bool = False,
        return_tensors: str = None,
        **kwargs,
    ):
        encoding = super().__call__(
            text,
            max_length=max_length,
            add_special_tokens=add_special_tokens,
            padding=padding,
            truncation=truncation,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_tensors=return_tensors,
            **kwargs,
        )
        # Workaround for padding empty input text
        for key in encoding.keys():
            if len(encoding[key]) == 0 and padding == "max_length":
                encoding[key] = [0] * max_length
                if return_tensors == "pt":
                    encoding[key] = torch.tensor(encoding[key])
        return encoding
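
# Usage sketch (the input sentences below are illustrative only). With
# padding="max_length", even an empty input string comes back as a fixed-length,
# all-zero input_ids list, which is what the workaround in __call__ above guarantees.
#
# tokenizer = SenticGCNBertTokenizer.from_pretrained("bert-base-uncased")
# encoding = tokenizer("the battery life is excellent", padding="max_length")
# len(encoding["input_ids"])  # 85
# empty = tokenizer("", padding="max_length")
# len(empty["input_ids"])     # also 85, all zeros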