Source code for supar.utils.data

# -*- coding: utf-8 -*-

from __future__ import annotations

import os
import queue
import tempfile
import threading
from contextlib import contextmanager
from typing import Dict, Iterable, List, Union

import pathos.multiprocessing as mp
import torch
import torch.distributed as dist
from torch.distributions.utils import lazy_property

from supar.utils.common import INF
from supar.utils.fn import binarize, debinarize, kmeans
from supar.utils.logging import get_logger, progress_bar
from supar.utils.parallel import gather, is_dist, is_master
from supar.utils.transform import Batch, Transform

logger = get_logger(__name__)


class Dataset(torch.utils.data.Dataset):
    r"""
    Dataset that is compatible with :class:`torch.utils.data.Dataset`, serving as a wrapper for manipulating all data fields
    with the operating behaviours defined in :class:`~supar.utils.transform.Transform`.
    The data fields of all the instantiated sentences can be accessed as an attribute of the dataset.

    Args:
        transform (Transform):
            An instance of :class:`~supar.utils.transform.Transform` or its derivations.
            The instance holds a series of loading and processing behaviours with regard to the specific data format.
        data (Union[str, Iterable]):
            A filename or a list of instances that will be passed into :meth:`transform.load`.
        cache (bool):
            If ``True``, tries to use the previously cached binarized data for fast loading.
            In this way, sentences are loaded on-the-fly according to the meta data.
            If ``False``, all sentences will be directly loaded into the memory.
            Default: ``False``.
        binarize (bool):
            If ``True``, binarizes the dataset when building it. Only works if ``cache=True``. Default: ``False``.
        bin (str):
            Path to binarized files, required if ``cache=True``. Default: ``None``.
        max_len (int):
            Sentences exceeding the length will be discarded. Default: ``None``.
        kwargs (Dict):
            Together with `data`, kwargs will be passed into :meth:`transform.load` to control the loading behaviour.

    Attributes:
        transform (Transform):
            An instance of :class:`~supar.utils.transform.Transform`.
        sentences (List[Sentence]):
            A list of sentences loaded from the data.
            Each sentence includes fields obeying the data format defined in ``transform``.
            If ``cache=True``, each is a pointer to the sentence stored in the cache file.
    """

    def __init__(
        self,
        transform: Transform,
        data: Union[str, Iterable],
        cache: bool = False,
        binarize: bool = False,
        bin: str = None,
        max_len: int = None,
        **kwargs
    ) -> Dataset:
        super(Dataset, self).__init__()

        self.transform = transform
        self.data = data
        self.cache = cache
        self.binarize = binarize
        self.bin = bin
        self.max_len = max_len or INF
        self.kwargs = kwargs

        if cache:
            if not isinstance(data, str) or not os.path.exists(data):
                raise FileNotFoundError("Please specify a valid file path for caching!")
            if self.bin is None:
                self.fbin = data + '.pt'
            else:
                os.makedirs(self.bin, exist_ok=True)
                self.fbin = os.path.join(self.bin, os.path.split(data)[1]) + '.pt'
            if not self.binarize and os.path.exists(self.fbin):
                try:
                    self.sentences = debinarize(self.fbin, meta=True)['sentences']
                except Exception:
                    raise RuntimeError(f"Error found while debinarizing {self.fbin}, which may have been corrupted. "
                                       "Try re-binarizing it first!")
        else:
            self.sentences = list(transform.load(data, **kwargs))

    def __repr__(self):
        s = f"{self.__class__.__name__}("
        s += f"n_sentences={len(self.sentences)}"
        if hasattr(self, 'loader'):
            s += f", n_batches={len(self.loader)}"
        if hasattr(self, 'buckets'):
            s += f", n_buckets={len(self.buckets)}"
        if self.cache:
            s += f", cache={self.cache}"
        if self.binarize:
            s += f", binarize={self.binarize}"
        if self.max_len < INF:
            s += f", max_len={self.max_len}"
        s += ")"
        return s

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, index):
        return debinarize(self.fbin, self.sentences[index]) if self.cache else self.sentences[index]

    def __getattr__(self, name):
        if name not in {f.name for f in self.transform.flattened_fields}:
            raise AttributeError(f"Property {name} unavailable!")
        if self.cache:
            if os.path.exists(self.fbin) and not self.binarize:
                sentences = self
            else:
                sentences = self.transform.load(self.data, **self.kwargs)
            return (getattr(sentence, name) for sentence in sentences)
        return [getattr(sentence, name) for sentence in self.sentences]

    def __getstate__(self):
        return self.__dict__

    def __setstate__(self, state):
        self.__dict__.update(state)

    @lazy_property
    def sizes(self):
        if not self.cache:
            return [s.size for s in self.sentences]
        return debinarize(self.fbin, 'sizes')

    def build(
        self,
        batch_size: int,
        n_buckets: int = 1,
        shuffle: bool = False,
        distributed: bool = False,
        even: bool = True,
        n_workers: int = 0,
        seed: int = 1,
        pin_memory: bool = True,
        chunk_size: int = 10000
    ) -> Dataset:
        # if not forced and the binarized file already exists, directly load the meta file
        if self.cache and os.path.exists(self.fbin) and not self.binarize:
            self.sentences = debinarize(self.fbin, meta=True)['sentences']
        else:
            with tempfile.TemporaryDirectory() as ftemp:
                ftemp = gather(ftemp)[0] if is_dist() else ftemp
                fbin = self.fbin if self.cache else os.path.join(ftemp, 'data.pt')

                @contextmanager
                def cache(sentences):
                    fs = os.path.join(ftemp, 'sentences')
                    fb = os.path.join(ftemp, os.path.basename(fbin))
                    global global_transform
                    global_transform = self.transform
                    sentences = binarize({'sentences': progress_bar(sentences)}, fs)[1]['sentences']
                    try:
                        yield ((sentences[s:s+chunk_size], fs, f"{fb}.{i}", self.max_len)
                               for i, s in enumerate(range(0, len(sentences), chunk_size)))
                    finally:
                        del global_transform

                def numericalize(sentences, fs, fb, max_len):
                    sentences = global_transform((debinarize(fs, sentence) for sentence in sentences))
                    sentences = [i for i in sentences if len(i) < max_len]
                    return binarize({'sentences': sentences, 'sizes': [sentence.size for sentence in sentences]}, fb)[0]

                logger.info(f"Caching the data to {fbin}")
                # numericalize the fields of each sentence
                if is_master():
                    with cache(self.transform.load(self.data, **self.kwargs)) as chunks, mp.Pool(32) as pool:
                        results = [pool.apply_async(numericalize, chunk) for chunk in chunks]
                        self.sentences = binarize((r.get() for r in results), fbin, merge=True)[1]['sentences']
                if is_dist():
                    dist.barrier()
                    self.sentences = debinarize(fbin, meta=True)['sentences']
                if not self.cache:
                    self.sentences = [debinarize(fbin, i) for i in progress_bar(self.sentences)]
        if is_dist():
            dist.barrier()
        # NOTE: the final bucket count is roughly equal to n_buckets
        self.buckets = dict(zip(*kmeans(self.sizes, n_buckets)))
        self.loader = DataLoader(transform=self.transform,
                                 dataset=self,
                                 batch_sampler=Sampler(self.buckets, batch_size, shuffle, distributed, even, seed),
                                 num_workers=n_workers,
                                 collate_fn=collate_fn,
                                 pin_memory=pin_memory)
        return self


class Sampler(torch.utils.data.Sampler):
    r"""
    Sampler that supports bucketization and token-level batchification.

    Args:
        buckets (Dict):
            A dict that maps each centroid to indices of clustered sentences.
            The centroid corresponds to the average length of all sentences in the bucket.
        batch_size (int):
            Token-level batch size. The resulting batch contains roughly the same number of tokens as ``batch_size``.
        shuffle (bool):
            If ``True``, the sampler will shuffle both buckets and samples in each bucket. Default: ``False``.
        distributed (bool):
            If ``True``, the sampler will be used in conjunction with :class:`torch.nn.parallel.DistributedDataParallel`
            that restricts data loading to a subset of the dataset. Default: ``False``.
        even (bool):
            If ``True``, the sampler will add extra indices to make the data evenly divisible across the replicas.
            Default: ``True``.
        seed (int):
            Random seed used to shuffle the samples. Default: ``1``.
    """

    def __init__(
        self,
        buckets: Dict[float, List],
        batch_size: int,
        shuffle: bool = False,
        distributed: bool = False,
        even: bool = True,
        seed: int = 1
    ) -> Sampler:
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.distributed = distributed
        self.even = even
        self.seed = seed

        self.sizes, self.buckets = zip(*[(size, bucket) for size, bucket in buckets.items()])
        # number of batches in each bucket, clipped by range [1, len(bucket)]
        self.n_batches = [min(len(bucket), max(round(size * len(bucket) / batch_size), 1))
                          for size, bucket in zip(self.sizes, self.buckets)]
        self.rank, self.n_replicas, self.n_samples = 0, 1, self.n_total_samples
        if distributed:
            self.rank = dist.get_rank()
            self.n_replicas = dist.get_world_size()
            self.n_samples = self.n_total_samples // self.n_replicas
            if self.n_total_samples % self.n_replicas != 0:
                self.n_samples += 1 if even else int(self.rank < self.n_total_samples % self.n_replicas)
        self.epoch = 1

    def __iter__(self):
        g = torch.Generator()
        g.manual_seed(self.epoch + self.seed)
        self.epoch += 1
        total, batches = 0, []
        # if `shuffle=True`, shuffle both the buckets and samples in each bucket
        # for distributed training, make sure each process generates the same random sequence at each epoch
        range_fn = torch.arange if not self.shuffle else lambda x: torch.randperm(x, generator=g)

        def cycle(length):
            while True:
                for i in range_fn(length).tolist():
                    yield i

        for i in cycle(len(self.buckets)):
            bucket = self.buckets[i]
            split_sizes = [(len(bucket) - j - 1) // self.n_batches[i] + 1 for j in range(self.n_batches[i])]
            # DON'T use `torch.chunk` which may return wrong number of batches
            for batch in range_fn(len(bucket)).split(split_sizes):
                if total % self.n_replicas == self.rank:
                    batches.append([bucket[j] for j in batch.tolist()])
                if len(batches) == self.n_samples:
                    return iter(batches[i] for i in range_fn(self.n_samples).tolist())
                total += 1

    def __len__(self):
        return self.n_samples

    @property
    def n_total_samples(self):
        return sum(self.n_batches)

    def set_epoch(self, epoch: int) -> None:
        self.epoch = epoch
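

# ---------------------------------------------------------------------------
# Worked example (editor's illustration): how the Sampler above turns one
# bucket into token-level batches.  Suppose a bucket holds 100 sentence
# indices whose centroid (average length) is 18 and `batch_size=600` tokens:
#
#   n_batches   = min(100, max(round(18 * 100 / 600), 1)) = 3
#   split_sizes = [(100 - j - 1) // 3 + 1 for j in range(3)] = [34, 33, 33]
#
# so the bucket is cut into batches of 34/33/33 sentences, each carrying
# roughly 18 * 33 ~= 600 tokens.  With `distributed=True`, the batches are
# then dealt out round-robin across replicas via `total % n_replicas == rank`.
# ---------------------------------------------------------------------------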
""" def __init__(self, transform, **kwargs): super().__init__(**kwargs) self.transform = transform def __iter__(self): return PrefetchGenerator(self.transform, super().__iter__()) class PrefetchGenerator(threading.Thread): def __init__(self, transform, loader, prefetch=1): threading.Thread.__init__(self) self.transform = transform self.queue = queue.Queue(prefetch) self.loader = loader self.daemon = True if torch.cuda.is_available(): self.stream = torch.cuda.Stream() self.start() def __iter__(self): return self def __next__(self): if hasattr(self, 'stream'): torch.cuda.current_stream().wait_stream(self.stream) batch = self.queue.get() if batch is None: raise StopIteration return batch def run(self): # `torch.cuda.current_device` is thread local # see https://github.com/pytorch/pytorch/issues/56588 if is_dist() and torch.cuda.is_available(): torch.cuda.set_device(dist.get_rank()) if hasattr(self, 'stream'): with torch.cuda.stream(self.stream): for batch in self.loader: self.queue.put(batch.compose(self.transform)) else: for batch in self.loader: self.queue.put(batch.compose(self.transform)) self.queue.put(None) def collate_fn(x): return Batch(x)