Source code for supar.models.const.crf.model

# -*- coding: utf-8 -*-

import torch
import torch.nn as nn
from supar.config import Config
from supar.model import Model
from supar.modules import MLP, Biaffine
from supar.structs import ConstituencyCRF


class CRFConstituencyModel(Model):
    r"""
    The implementation of CRF Constituency Parser :cite:`zhang-etal-2020-fast`,
    also called FANCY (abbr. of Fast and Accurate Neural Crf constituencY) Parser.

    Args:
        n_words (int):
            The size of the word vocabulary.
        n_labels (int):
            The number of labels in the treebank.
        n_tags (int):
            The number of POS tags, required if POS tag embeddings are used. Default: ``None``.
        n_chars (int):
            The number of characters, required if character-level representations are used. Default: ``None``.
        encoder (str):
            Encoder to use.
            ``'lstm'``: BiLSTM encoder.
            ``'bert'``: BERT-like pretrained language model (for finetuning), e.g., ``'bert-base-cased'``.
            Default: ``'lstm'``.
        feat (List[str]):
            Additional features to use, required if ``encoder='lstm'``.
            ``'tag'``: POS tag embeddings.
            ``'char'``: Character-level representations extracted by CharLSTM.
            ``'bert'``: BERT representations, other pretrained language models like RoBERTa are also feasible.
            Default: [``'char'``].
        n_embed (int):
            The size of word embeddings. Default: 100.
        n_pretrained (int):
            The size of pretrained word embeddings. Default: 100.
        n_feat_embed (int):
            The size of feature representations. Default: 100.
        n_char_embed (int):
            The size of character embeddings serving as inputs of CharLSTM, required if using CharLSTM. Default: 50.
        n_char_hidden (int):
            The size of hidden states of CharLSTM, required if using CharLSTM. Default: 100.
        char_pad_index (int):
            The index of the padding token in the character vocabulary, required if using CharLSTM. Default: 0.
        elmo (str):
            Name of the pretrained ELMo registered in `ELMoEmbedding.OPTION`. Default: ``'original_5b'``.
        elmo_bos_eos (Tuple[bool]):
            A tuple of two boolean values indicating whether to keep start/end boundaries of elmo outputs.
            Default: ``(True, True)``.
        bert (str):
            Specifies which kind of language model to use, e.g., ``'bert-base-cased'``.
            This is required if ``encoder='bert'`` or using BERT features. The full list can be found in `transformers`_.
            Default: ``None``.
        n_bert_layers (int):
            Specifies how many last layers to use, required if ``encoder='bert'`` or using BERT features.
            The final outputs would be weighted sum of the hidden states of these layers.
            Default: 4.
        mix_dropout (float):
            The dropout ratio of BERT layers, required if ``encoder='bert'`` or using BERT features. Default: .0.
        bert_pooling (str):
            Pooling way to get token embeddings.
            ``first``: take the first subtoken. ``last``: take the last subtoken. ``mean``: take a mean over all.
            Default: ``mean``.
        bert_pad_index (int):
            The index of the padding token in BERT vocabulary, required if ``encoder='bert'`` or using BERT features.
            Default: 0.
        finetune (bool):
            If ``False``, freezes all parameters, required if using pretrained layers. Default: ``False``.
        n_plm_embed (int):
            The size of PLM embeddings. If 0, uses the size of the pretrained embedding model. Default: 0.
        embed_dropout (float):
            The dropout ratio of input embeddings. Default: .33.
        n_encoder_hidden (int):
            The size of encoder hidden states. Default: 800.
        n_encoder_layers (int):
            The number of encoder layers. Default: 3.
        encoder_dropout (float):
            The dropout ratio of encoder layer. Default: .33.
        n_span_mlp (int):
            Span MLP size. Default: 500.
        n_label_mlp (int):
            Label MLP size. Default: 100.
        mlp_dropout (float):
            The dropout ratio of MLP layers. Default: .33.
        pad_index (int):
            The index of the padding token in the word vocabulary. Default: 0.
        unk_index (int):
            The index of the unknown token in the word vocabulary. Default: 1.
        kwargs (Dict):
            Additional keyword arguments; they are captured by ``locals()`` and handed to ``Config``
            together with the named arguments.

    .. _transformers:
        https://github.com/huggingface/transformers
    """

    def __init__(self,
                 n_words,
                 n_labels,
                 n_tags=None,
                 n_chars=None,
                 encoder='lstm',
                 feat=['char'],
                 n_embed=100,
                 n_pretrained=100,
                 n_feat_embed=100,
                 n_char_embed=50,
                 n_char_hidden=100,
                 char_pad_index=0,
                 elmo='original_5b',
                 elmo_bos_eos=(True, True),
                 bert=None,
                 n_bert_layers=4,
                 mix_dropout=.0,
                 bert_pooling='mean',
                 bert_pad_index=0,
                 finetune=False,
                 n_plm_embed=0,
                 embed_dropout=.33,
                 n_encoder_hidden=800,
                 n_encoder_layers=3,
                 encoder_dropout=.33,
                 n_span_mlp=500,
                 n_label_mlp=100,
                 mlp_dropout=.33,
                 pad_index=0,
                 unk_index=1,
                 **kwargs):
        # The signature keeps the historical list default for compatibility, but the
        # list object itself is shared across calls. Copy it so that the config built
        # from ``locals()`` below never aliases the default (or the caller's list);
        # otherwise an in-place edit of the stored ``feat`` would leak into every
        # model created afterwards.
        if isinstance(feat, list):
            feat = list(feat)

        # ``locals()`` must be taken before any other local is introduced: every name
        # visible at this point ends up in the config passed to the base class.
        super().__init__(**Config().update(locals()))

        # Separate projections for the left and right boundary of a span, for both
        # the span scorer and the label scorer.
        self.span_mlp_l = MLP(n_in=self.args.n_encoder_hidden, n_out=n_span_mlp, dropout=mlp_dropout)
        self.span_mlp_r = MLP(n_in=self.args.n_encoder_hidden, n_out=n_span_mlp, dropout=mlp_dropout)
        self.label_mlp_l = MLP(n_in=self.args.n_encoder_hidden, n_out=n_label_mlp, dropout=mlp_dropout)
        self.label_mlp_r = MLP(n_in=self.args.n_encoder_hidden, n_out=n_label_mlp, dropout=mlp_dropout)

        self.span_attn = Biaffine(n_in=n_span_mlp, bias_x=True, bias_y=False)
        self.label_attn = Biaffine(n_in=n_label_mlp, n_out=n_labels, bias_x=True, bias_y=True)
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, words, feats=None):
        r"""
        Args:
            words (~torch.LongTensor): ``[batch_size, seq_len]``.
                Word indices.
            feats (List[~torch.LongTensor]):
                A list of feat indices.
                The size is either ``[batch_size, seq_len, fix_len]`` if ``feat`` is ``'char'`` or ``'bert'``,
                or ``[batch_size, seq_len]`` otherwise.
                Default: ``None``.

        Returns:
            ~torch.Tensor, ~torch.Tensor:
                The first tensor of shape ``[batch_size, seq_len, seq_len]`` holds scores of all possible constituents.
                The second of shape ``[batch_size, seq_len, seq_len, n_labels]`` holds
                scores of all possible labels on each constituent.
        """

        x = self.encode(words, feats)

        # Split the hidden dimension into two halves (presumably the forward and
        # backward encoder states -- confirm against ``encode``) and pair the first
        # half at position i with the second half at position i + 1. The result has
        # one position fewer than the encoder output along the sequence axis.
        x_f, x_b = x.chunk(2, -1)
        x = torch.cat((x_f[:, :-1], x_b[:, 1:]), -1)

        span_l = self.span_mlp_l(x)
        span_r = self.span_mlp_r(x)
        label_l = self.label_mlp_l(x)
        label_r = self.label_mlp_r(x)

        # [batch_size, seq_len, seq_len]
        s_span = self.span_attn(span_l, span_r)
        # [batch_size, seq_len, seq_len, n_labels]
        s_label = self.label_attn(label_l, label_r).permute(0, 2, 3, 1)

        return s_span, s_label

    def loss(self, s_span, s_label, charts, mask, mbr=True):
        r"""
        Args:
            s_span (~torch.Tensor): ``[batch_size, seq_len, seq_len]``.
                Scores of all constituents.
            s_label (~torch.Tensor): ``[batch_size, seq_len, seq_len, n_labels]``.
                Scores of all constituent labels.
            charts (~torch.LongTensor): ``[batch_size, seq_len, seq_len]``.
                The tensor of gold-standard labels. Positions without labels are filled with -1.
            mask (~torch.BoolTensor): ``[batch_size, seq_len, seq_len]``.
                The mask for covering the unpadded tokens in each chart.
            mbr (bool):
                If ``True``, returns marginals for MBR decoding. Default: ``True``.

        Returns:
            ~torch.Tensor, ~torch.Tensor:
                The training loss and original constituent scores
                of shape ``[batch_size, seq_len, seq_len]`` if ``mbr=False``, or marginals otherwise.
        """

        # Gold constituents are the non-negative chart cells inside the unpadded area.
        span_mask = charts.ge(0) & mask
        # The first chart row of the mask gives the per-sentence lengths.
        lens = mask[:, 0].sum(-1)

        span_dist = ConstituencyCRF(s_span, lens)
        # Tree-level negative log-likelihood, normalised by the total length in the batch.
        span_loss = -span_dist.log_prob(charts).sum() / lens.sum()
        span_probs = span_dist.marginals if mbr else s_span

        # Labels are only trained on the gold constituents.
        label_loss = self.criterion(s_label[span_mask], charts[span_mask])
        loss = span_loss + label_loss

        return loss, span_probs

    def decode(self, s_span, s_label, mask):
        r"""
        Args:
            s_span (~torch.Tensor): ``[batch_size, seq_len, seq_len]``.
                Scores of all constituents.
            s_label (~torch.Tensor): ``[batch_size, seq_len, seq_len, n_labels]``.
                Scores of all constituent labels.
            mask (~torch.BoolTensor): ``[batch_size, seq_len, seq_len]``.
                The mask for covering the unpadded tokens in each chart.

        Returns:
            List[List[Tuple]]:
                Sequences of factorized labeled trees.
        """

        span_preds = ConstituencyCRF(s_span, mask[:, 0].sum(-1)).argmax
        # Best label for every chart cell; only the cells of the predicted spans are read below.
        label_preds = s_label.argmax(-1).tolist()
        return [[(i, j, labels[i][j]) for i, j in spans] for spans, labels in zip(span_preds, label_preds)]