import torch
import numpy as np
from transformers import BertModel, BertTokenizer
from ..utils import get_default_device
class Bert:
    MASK = "[MASK]"
    CLS = "[CLS]"
    SEP = "[SEP]"

    def __init__(self, model_name, device=None):
        super().__init__()
        self.model_name = model_name
        self.tokenizer = BertTokenizer.from_pretrained(self.model_name)
        self.device = device if device else get_default_device()
        self.model = BertModel.from_pretrained(self.model_name).to(device=self.device)
        # Maximum sequence length and hidden size, read off the position
        # embedding matrix (e.g. 512 and 768 for bert-base models).
        self.max_len = self.model.embeddings.position_embeddings.weight.size(0)
        self.dim = self.model.embeddings.position_embeddings.weight.size(1)
    def tokenize(self, text, masked_idxs=None):
        tokenized_text = self.tokenizer.tokenize(text)
        if masked_idxs is not None:
            for idx in masked_idxs:
                tokenized_text[idx] = self.MASK
        # prepend [CLS] and append [SEP]
        # see https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/run_classifier.py#L195 # NOQA
        tokenized = [self.CLS] + tokenized_text + [self.SEP]
        return tokenized
    def tokenize_to_ids(self, text, masked_idxs=None, pad=True):
        tokens = self.tokenize(text, masked_idxs)
        return self.convert_tokens_to_ids(tokens, pad=pad)

    def convert_tokens_to_ids(self, tokens, pad=True):
        token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        ids = torch.tensor([token_ids])
        assert ids.size(1) < self.max_len
        if pad:
            # Pad the sequence to the full model length and build the
            # corresponding attention mask (1 for real tokens, 0 for padding).
            padded_ids = torch.zeros(1, self.max_len).to(ids)
            padded_ids[0, : ids.size(1)] = ids
            mask = torch.zeros(1, self.max_len).to(ids)
            mask[0, : ids.size(1)] = 1
            return padded_ids, mask
        else:
            return ids
    def flatten(self, list_of_lists):
        for sublist in list_of_lists:
            for item in sublist:
                yield item
    def subword_tokenize(self, tokens):
        """Segment each token into subwords while keeping track of
        token boundaries.

        Parameters
        ----------
        tokens: A sequence of strings, representing input tokens.

        Returns
        -------
        A tuple consisting of:
        - A list of subwords, flanked by the special symbols required
          by Bert (CLS and SEP).
        - An array of indices into the list of subwords, indicating
          that the corresponding subword is the start of a new
          token. For example, [1, 3, 4, 7] means that subwords
          1, 3, 4 and 7 are token starts, while all other subwords
          (0, 2, 5, 6, 8, ...) are inside or at the end of tokens.
          This array allows selecting the Bert hidden states that
          represent tokens, which is necessary for sequence
          labeling. A worked example follows the method body.
        """
        subwords = list(map(self.tokenizer.tokenize, tokens))
        subword_lengths = list(map(len, subwords))
        # Truncate to 509 subwords so that, together with [CLS] and [SEP],
        # the sequence stays within the model's maximum length.
        subwords = [self.CLS] + list(self.flatten(subwords))[:509] + [self.SEP]
        # Shift start indices by 1 to account for the leading [CLS], and clamp
        # indices of truncated tokens to the last kept position.
        token_start_idxs = 1 + np.cumsum([0] + subword_lengths[:-1])
        token_start_idxs[token_start_idxs > 509] = 509
        return subwords, token_start_idxs
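    # Worked example (illustrative; the actual splits depend on the tokenizer
    # vocabulary): tokens = ["the", "embeddings"] might be segmented into
    # subwords ["[CLS]", "the", "em", "##bed", "##ding", "##s", "[SEP]"],
    # giving subword_lengths = [1, 4] and
    # token_start_idxs = 1 + cumsum([0, 1]) = [1, 2]: positions 1 ("the") and
    # 2 ("em") hold the first subword of each input token, offset by one
    # because of the leading [CLS].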
    def subword_tokenize_to_ids(self, tokens):
        """Segment each token into subwords while keeping track of
        token boundaries and convert subwords into IDs.

        Parameters
        ----------
        tokens: A sequence of strings, representing input tokens.

        Returns
        -------
        A tuple consisting of:
        - A list of subword IDs, including IDs of the special
          symbols (CLS and SEP) required by Bert.
        - A mask indicating padding tokens.
        - A 0/1 array over subword positions marking which subwords
          start a new token. See the docstring of subword_tokenize.
        """
        subwords, token_start_idxs = self.subword_tokenize(tokens)
        subword_ids, mask = self.convert_tokens_to_ids(subwords)
        token_starts = torch.zeros(1, self.max_len).to(subword_ids)
        token_starts[0, token_start_idxs] = 1
        return subword_ids.numpy(), mask.numpy(), token_starts.numpy()
    def segment_ids(self, segment1_len, segment2_len):
        ids = [0] * segment1_len + [1] * segment2_len
        return torch.tensor([ids])
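

# Minimal usage sketch (an illustrative assumption, not part of the original
# module): it shows how the outputs of subword_tokenize_to_ids can be fed to
# the model and how the token_starts indicator selects one hidden state per
# input token for sequence labeling. The model name and example tokens are
# arbitrary; run via `python -m ...` so the relative import above resolves.
if __name__ == "__main__":
    bert = Bert("bert-base-uncased")
    tokens = ["The", "quick", "brown", "fox"]
    ids, mask, token_starts = bert.subword_tokenize_to_ids(tokens)
    with torch.no_grad():
        outputs = bert.model(
            torch.from_numpy(ids).to(bert.device),
            attention_mask=torch.from_numpy(mask).to(bert.device),
        )
    # Assumes the first output is the sequence of last-layer hidden states,
    # which holds for both tuple and ModelOutput return types.
    hidden = outputs[0]
    # Keep only the hidden state of the first subword of each original token.
    token_mask = torch.from_numpy(token_starts[0]).bool().to(bert.device)
    token_reprs = hidden[0][token_mask]
    assert token_reprs.size(0) == len(tokens)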