Source code for sgnlp.models.emotion_entailment.utils

from typing import List, Dict
import argparse
import json
import pathlib

import pandas as pd
import torch
from tqdm import tqdm
from torch.utils.data import TensorDataset

from .tokenization import (
    RecconEmotionEntailmentTokenizer,
)
from .data_class import RecconEmotionEntailmentArguments


def parse_args_and_load_config(
    config_path: str = "config/emotion_entailment_config.json",
) -> RecconEmotionEntailmentArguments:
    """Get config from a config file using argparse

    Args:
        config_path (str, optional): Path to the config file, resolved relative to this
            module's directory. Defaults to "config/emotion_entailment_config.json".

    Returns:
        RecconEmotionEntailmentArguments: RecconEmotionEntailmentArguments instance
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", type=str, default=config_path)
    args = parser.parse_args()
    with open(pathlib.Path(__file__).parent / args.config, "r") as f:
        cfg = json.load(f)
    emotion_entailment_args = RecconEmotionEntailmentArguments(**cfg)
    return emotion_entailment_args

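# Illustrative usage (a sketch, not part of the library). The config path is resolved
# relative to this module's directory, so the default file must ship with the package;
# the alternative path below is an assumption for illustration only.
#
#   cfg = parse_args_and_load_config()
#   # or, from the command line of a script that calls this function:
#   #   python my_train_script.py --config my_emotion_entailment_config.json
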
class RecconEmotionEntailmentData(torch.utils.data.Dataset):
    """Class to create a torch Dataset instance, which is the required data type
    for the Transformers Trainer

    Args:
        dataset (TensorDataset): TensorDataset object
        is_training (bool, optional): Set True if training, set False if evaluating. Defaults to True.
    """

    def __init__(self, dataset: TensorDataset, is_training: bool = True) -> None:
        """Constructor method"""
        self.dataset = dataset
        self.is_training = is_training

    def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
        """Get a dictionary of the selected instance for each batch

        Args:
            idx (int): idx to select instances for each batch

        Returns:
            Dict[str, torch.Tensor]: dictionary containing input_ids, attention_mask,
                token_type_ids and, when is_training is True, labels of the selected instance
        """
        item = {}
        in_idx, in_mask, seg_ids, label = self.dataset[idx]
        item["input_ids"] = in_idx
        item["attention_mask"] = in_mask
        item["token_type_ids"] = seg_ids
        if self.is_training:
            item["labels"] = label
        return item

    def __len__(self) -> int:
        """Returns length of dataset

        Returns:
            int: length of the dataset attribute
        """
        return len(self.dataset)

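# Illustrative usage (a sketch; `tensor_dataset` is assumed to come from load_examples
# or convert_df_to_dataset defined later in this module):
#
#   train_data = RecconEmotionEntailmentData(tensor_dataset, is_training=True)
#   item = train_data[0]  # dict with input_ids, attention_mask, token_type_ids, labels
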
class InputExample(object):
    """Convert pandas dataframe data instance to InputExample instance for easier manipulation

    Args:
        guid (int): Unique id of the example (e.g. the dataframe row index)
        text_a (str): Input text of the example
        text_b (str, optional): Not used for emotion entailment. Defaults to None.
        label (int, optional): Contains label of data. Defaults to None.
    """

    def __init__(self, guid: int, text_a: str, text_b: str = None, label: int = None):
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label

class InputFeatures(object):
    """Convert InputExample instance to InputFeatures instance for easier manipulation

    Args:
        input_ids (List[int]): Contains list of input_ids created by tokenizer
        input_mask (List[int]): Contains list of input_mask created by tokenizer. Int needs to be either 1 or 0.
        segment_ids (List[int]): Contains list of segment_ids created by tokenizer. Int needs to be either 1 or 0.
        label_id (int): Contains label of data.
    """

    def __init__(
        self,
        input_ids: List[int],
        input_mask: List[int],
        segment_ids: List[int],
        label_id: int,
    ):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id

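# Illustrative usage (a sketch): InputExample instances are normally built by
# convert_df_to_examples below, and InputFeatures instances by
# convert_example_to_feature, but both can be constructed directly.
# The text and label below are made up.
#
#   example = InputExample(guid=0, text_a="some concatenated emotion/target/evidence text", label=1)
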
def convert_df_to_dataset(
    df: pd.DataFrame, max_seq_length: int, tokenizer: RecconEmotionEntailmentTokenizer
) -> TensorDataset:
    """Convert pandas dataframe to TensorDataset

    Args:
        df (pd.DataFrame): DataFrame containing 'text' and 'labels' columns
        max_seq_length (int): max sequence length
        tokenizer (RecconEmotionEntailmentTokenizer): RecconEmotionEntailmentTokenizer from sgnlp

    Returns:
        TensorDataset: A TensorDataset object containing all examples
    """
    examples = convert_df_to_examples(df)
    dataset = load_examples(examples, max_seq_length, tokenizer)
    return dataset

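# Illustrative usage (a sketch; the dataframe contents, max_seq_length and tokenizer
# checkpoint are assumptions):
#
#   tokenizer = RecconEmotionEntailmentTokenizer.from_pretrained("roberta-base")
#   df = pd.DataFrame({"text": ["some input text"], "labels": [1]})
#   dataset = convert_df_to_dataset(df, max_seq_length=512, tokenizer=tokenizer)
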
def convert_df_to_examples(df: pd.DataFrame) -> List[InputExample]:
    """Convert dataframe to examples which can be fed into dataloader

    Args:
        df (pd.DataFrame): df which contains 'text' and 'labels' columns

    Returns:
        List[InputExample]: list of InputExample
    """
    if "text" in df.columns and "labels" in df.columns:
        examples = [
            InputExample(i, text, None, label)
            for i, (text, label) in enumerate(
                zip(df["text"].astype(str), df["labels"])
            )
        ]
    return examples

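# Illustrative usage (a sketch; the dataframe below is made up). Note that both the
# 'text' and 'labels' columns must be present:
#
#   df = pd.DataFrame({"text": ["utterance one", "utterance two"], "labels": [0, 1]})
#   examples = convert_df_to_examples(df)  # -> list of two InputExample objects
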
def convert_example_to_feature(
    example_row: List,
    pad_token: int = 0,
    sequence_a_segment_id: int = 0,
    cls_token_segment_id: int = 1,
    pad_token_segment_id: int = 0,
    mask_padding_with_zero: bool = True,
    sep_token_extra: bool = False,
) -> InputFeatures:
    """Method to generate InputFeatures object from individual example row

    Args:
        example_row (List): List of inputs from convert_examples_to_features function
        pad_token (int, optional): Option for pad_token. Defaults to 0.
        sequence_a_segment_id (int, optional): Option for sequence_a_segment_id. Defaults to 0.
        cls_token_segment_id (int, optional): Option for cls_token_segment_id. Defaults to 1.
        pad_token_segment_id (int, optional): Option for pad_token_segment_id. Defaults to 0.
        mask_padding_with_zero (bool, optional): Option for mask_padding_with_zero. Defaults to True.
        sep_token_extra (bool, optional): Option for sep_token_extra. Defaults to False.

    Returns:
        InputFeatures: Generated InputFeatures object
    """
    (
        example,
        max_seq_length,
        tokenizer,
        cls_token_at_end,
        cls_token,
        sep_token,
        cls_token_segment_id,
        pad_on_left,
        pad_token_segment_id,
        sep_token_extra,
        pad_token,
        add_prefix_space,
        pad_to_max_length,
    ) = example_row

    if add_prefix_space and not example.text_a.startswith(" "):
        tokens_a = tokenizer.tokenize(" " + example.text_a)
    else:
        tokens_a = tokenizer.tokenize(example.text_a)

    # Reserve room for the special tokens before truncating
    special_tokens_count = 3 if sep_token_extra else 2
    if len(tokens_a) > max_seq_length - special_tokens_count:
        tokens_a = tokens_a[: (max_seq_length - special_tokens_count)]

    tokens = tokens_a + [sep_token]
    segment_ids = [sequence_a_segment_id] * len(tokens)

    if cls_token_at_end:
        tokens = tokens + [cls_token]
        segment_ids = segment_ids + [cls_token_segment_id]
    else:
        tokens = [cls_token] + tokens
        segment_ids = [cls_token_segment_id] + segment_ids

    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

    if pad_to_max_length:
        padding_length = max_seq_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            input_mask = (
                [0 if mask_padding_with_zero else 1] * padding_length
            ) + input_mask
            segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
        else:
            input_ids = input_ids + ([pad_token] * padding_length)
            input_mask = input_mask + (
                [0 if mask_padding_with_zero else 1] * padding_length
            )
            segment_ids = segment_ids + ([pad_token_segment_id] * padding_length)

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

    return InputFeatures(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        label_id=example.label,
    )

def convert_examples_to_features(
    examples: List[InputExample],
    max_seq_length: int,
    tokenizer: RecconEmotionEntailmentTokenizer,
    cls_token_at_end: bool = False,
    sep_token_extra: bool = False,
    pad_on_left: bool = False,
    cls_token: str = "[CLS]",
    sep_token: str = "[SEP]",
    pad_token: int = 0,
    cls_token_segment_id: int = 1,
    pad_token_segment_id: int = 0,
    silent: bool = False,
    add_prefix_space: bool = False,
    pad_to_max_length: bool = True,
) -> List[InputFeatures]:
    """Loop method to process all examples

    Args:
        examples (List[InputExample]): List of InputExample
        max_seq_length (int): Max sequence length
        tokenizer (RecconEmotionEntailmentTokenizer): RecconEmotionEntailmentTokenizer from sgnlp
        cls_token_at_end (bool, optional): Option to use cls_token_at_end. Defaults to False.
        sep_token_extra (bool, optional): Option to use sep_token_extra. Defaults to False.
        pad_on_left (bool, optional): Option to use pad_on_left. Defaults to False.
        cls_token (str, optional): cls_token to use. Defaults to "[CLS]".
        sep_token (str, optional): sep_token to use. Defaults to "[SEP]".
        pad_token (int, optional): pad_token to use. Defaults to 0.
        cls_token_segment_id (int, optional): Option for cls_token_segment_id. Defaults to 1.
        pad_token_segment_id (int, optional): Option for pad_token_segment_id. Defaults to 0.
        silent (bool, optional): Option for silent. Defaults to False.
        add_prefix_space (bool, optional): Option to add_prefix_space. Defaults to False.
        pad_to_max_length (bool, optional): Option to pad_to_max_length. Defaults to True.

    Returns:
        List[InputFeatures]: list of InputFeatures
    """
    examples = [
        (
            example,
            max_seq_length,
            tokenizer,
            cls_token_at_end,
            cls_token,
            sep_token,
            cls_token_segment_id,
            pad_on_left,
            pad_token_segment_id,
            sep_token_extra,
            pad_token,
            add_prefix_space,
            pad_to_max_length,
        )
        for example in examples
    ]

    return [
        convert_example_to_feature(example)
        for example in tqdm(examples, disable=silent, position=0, leave=True)
    ]

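# Illustrative usage (a sketch; within this module the function is normally driven by
# load_examples below with RoBERTa-style settings, so the keyword values here are only
# an example and `examples`/`tokenizer` are assumed from the earlier helpers):
#
#   features = convert_examples_to_features(
#       examples,
#       max_seq_length=512,
#       tokenizer=tokenizer,
#       sep_token_extra=True,
#       add_prefix_space=True,
#   )
#   # features[0].input_ids, features[0].input_mask, features[0].segment_ids
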
def load_examples(
    examples: List[InputExample],
    max_seq_length: int,
    tokenizer: RecconEmotionEntailmentTokenizer,
) -> TensorDataset:
    """Convert a list of InputExample into a TensorDataset

    Args:
        examples (List[InputExample]): list of InputExample
        max_seq_length (int): max sequence length
        tokenizer (RecconEmotionEntailmentTokenizer): RecconEmotionEntailmentTokenizer from sgnlp

    Returns:
        TensorDataset: A TensorDataset object containing all examples
    """
    features = convert_examples_to_features(
        examples,
        max_seq_length,
        tokenizer,
        cls_token_at_end=False,
        cls_token=tokenizer.cls_token,
        cls_token_segment_id=0,
        sep_token=tokenizer.sep_token,
        sep_token_extra=True,
        pad_on_left=False,
        pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
        pad_token_segment_id=0,
        add_prefix_space=True,
        pad_to_max_length=bool(len(examples) > 1),
    )

    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)

    dataset = TensorDataset(
        all_input_ids, all_input_mask, all_segment_ids, all_label_ids
    )
    return dataset

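# Illustrative usage (a sketch; `examples` and `tokenizer` are assumed to come from
# convert_df_to_examples and a pretrained tokenizer, and max_seq_length is an assumption):
#
#   dataset = load_examples(examples, max_seq_length=512, tokenizer=tokenizer)
#   train_data = RecconEmotionEntailmentData(dataset, is_training=True)
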
def get_all_evidence_utterance_from_conversation(
    emotion: str, conversation_history: List[str]
) -> Dict[str, List[str]]:
    """Iterate through a conversation history to let each utterance be the evidence
    utterance. The last utterance is treated as the target utterance. Output dictionary
    is in a format which can be used with RecconEmotionEntailmentPreprocessor

    Args:
        emotion (str): Emotion of the target utterance
        conversation_history (List[str]): List of utterances in a conversation. The last
            utterance is used as the target utterance.

    Returns:
        Dict[str, List[str]]: Dictionary in a format that can be used with
            RecconEmotionEntailmentPreprocessor. The dictionary looks like this:
            {'emotion': ['happiness'], 'target_utterance': ['......'],
            'evidence_utterance': ['......'], 'conversation_history': ['......']}
    """
    conversation_history_text = " ".join(conversation_history)
    target_utterance = conversation_history[-1]

    output = {
        "emotion": [],
        "target_utterance": [],
        "evidence_utterance": [],
        "conversation_history": [],
    }

    for evidence_utterance in conversation_history:
        output["emotion"].append(emotion)
        output["target_utterance"].append(target_utterance)
        output["evidence_utterance"].append(evidence_utterance)
        output["conversation_history"].append(conversation_history_text)

    return output
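
# Illustrative usage (a sketch; the conversation and emotion below are made up):
#
#   conversation = [
#       "I got the job!",
#       "That's wonderful news.",
#       "I can't stop smiling.",
#   ]
#   batch = get_all_evidence_utterance_from_conversation(
#       emotion="happiness", conversation_history=conversation
#   )
#   # batch["evidence_utterance"] has one entry per utterance; every other field is
#   # repeated so the dict can be passed to RecconEmotionEntailmentPreprocessor.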