Source code for medacy.data.dataset

"""
A medaCy Dataset facilities the management of data for both model training and model prediction.

A Dataset object provides a wrapper for a unix file directory containing training/prediction
data. If a Dataset, at training time, is fed into a pipeline requiring auxilary files
(Metamap for instance) the Dataset will automatically create those files in the most efficient way possible.

Training
#################
When a directory contains **both** raw text files alongside annotation files, an instantiated Dataset
detects and facilitates access to those files.

Assuming your directory looks like this (where .ann files are in `BRAT <http://brat.nlplab.org/standoff.html>`_ format):
::
    home/medacy/data
    ├── file_one.ann
    ├── file_one.txt
    ├── file_two.ann
    └── file_two.txt

A common data work flow might look as follows.

Running:
::
    >>> from medacy.data import Dataset
    >>> from medacy.pipeline_components.feature_overlayers.metamap.metamap import MetaMap

    >>> dataset = Dataset('/home/datasets/some_dataset')
    >>> for data_file in dataset:
    ...    (data_file.file_name, data_file.raw_path, dataset.ann_path)
    (file_one, file_one.txt, file_one.ann)
    (file_two, file_two.txt, file_two.ann)
    >>> dataset
    ['file_one', 'file_two']
    >>>> dataset.is_metamapped()
    False
    >>> metamap = MetaMap('/home/path/to/metamap/binary')
    >>> with metamap:
    ...     metamap.metamap_dataset(dataset)
    >>> dataset.is_metamapped()
    True

MedaCy **does not** alter the data you load in any way - it only reads from it.

Prediction
##########
When a directory contains **only** raw text files, an instantiated Dataset object interprets this as
a directory of files that need to be predicted. This means that the internal Datafile that aggregates
meta-data for a given prediction file does not have fields for annotation_file_path set.

When a directory contains **only** ann files, an instantiated Dataset object interprets this as
a directory of files that are predictions. Useful methods for analysis include :meth:`medacy.data.dataset.Dataset.compute_confusion_matrix`,
:meth:`medacy.data.dataset.Dataset.compute_ambiguity` and :meth:`medacy.data.dataset.Dataset.compute_counts`.

External Datasets
#################

In the real world, datasets (regardless of domain) are evolving entities. Hence, it is essential to version them.
A medaCy compatible dataset can be created to facilitate this versioning. A medaCy compatible dataset lives a python
packages that can be hooked into medaCy or used for any other purpose - it is simply a loose wrapper for this Dataset
object. Instructions for creating such a dataset can be found `here <https://github.com/NLPatVCU/medaCy/tree/master/examples/guide>`_.
wrap them.
"""

import argparse
import json
import logging
import os
import pprint
from collections import Counter
from pathlib import Path

from medacy.data.annotations import Annotations
from medacy.data.data_file import DataFile


[docs]class Dataset:
    """
    A facilitation class for data management.
    """

    def __init__(self, data_directory, data_limit=None):
        """
        Manages directory of training data along with other medaCy generated files.

        Only text files: considers a directory for managing metamapping.
        Only ann files: considers a directory of predictions.
        Both text and ann files: considers a directory for training.

        :param data_directory: Directory containing data for training or prediction.
        :param data_limit: A limit to the number of files to process. Must be between 1 and number of raw text files in data_directory
        """
        self.data_directory = Path(data_directory)

        metamap_dir = self.data_directory / 'metamapped'
        self.metamapped_files_directory = metamap_dir if metamap_dir.is_dir() else None

        self.data_files = self._create_data_files()
        self.data_limit = data_limit or len(self.data_files)

[docs]    def _create_data_files(self):
        data_files = []
        all_files_in_directory = os.listdir(self.data_directory)
        all_file_base_names = {f.split(".")[0] for f in all_files_in_directory}

        for file_name in all_file_base_names:
            txt_path = None
            ann_path = None
            metamapped_path = None

            potential_txt_path = self.data_directory / (file_name + ".txt")
            if potential_txt_path.exists():
                txt_path = potential_txt_path

            potential_ann_path = self.data_directory / (file_name + ".ann")
            if potential_ann_path.exists():
                ann_path = potential_ann_path

            if self.metamapped_files_directory:
                potential_mm_path = self.metamapped_files_directory / (file_name + ".metamapped")
                if potential_mm_path.exists():
                    metamapped_path = potential_mm_path

            if txt_path or ann_path:
                new_df = DataFile(file_name, txt_path, ann_path, metamapped_path)
                data_files.append(new_df)

        return sorted(data_files, key=lambda x: x.file_name)

    def __iter__(self):
        return iter(self.data_files[0:self.data_limit])

    def __len__(self):
        return len(self.data_files)

[docs]    def is_metamapped(self):
        """
        Verifies if all fil es in the Dataset are metamapped.

        :return: True if all data files are metamapped, False otherwise.
        """
        if self.metamapped_files_directory is None or not self.metamapped_files_directory.exists():
            return False

        for file in self.data_files:
            potential_file_path = self.metamapped_files_directory / f"{file.file_name}.metamapped"
            if not potential_file_path.exists():
                return False

            # Metamapped file could exist, but metamapping it could have failed.
            # If the file is less than 200 bytes, log a warning.
            file_size_in_bytes = os.path.getsize(potential_file_path)
            if file_size_in_bytes < 200:
                logging.warning(f"Metamapped version of {file.file_name} is only {file_size_in_bytes} bytes. "
                                f"Metamapping could have failed: {potential_file_path}")

        return True

    def __str__(self):
        """
        Prints a list-like string of the names of the Datafile objects up to the data limit
        (can't be used if copied and pasted)
        """
        return str([d.file_name for d in self])

[docs]    def compute_counts(self):
        """
        Computes entity counts over all documents in this dataset.

        :return: a Counter of entity counts
        """
        total = Counter()

        for ann in self.generate_annotations():
            total += ann.compute_counts()

        return total

[docs]    def compute_confusion_matrix(self, other, leniency=0):
        """
        Generates a confusion matrix where this Dataset serves as the gold standard annotations and `dataset` serves
        as the predicted annotations. A typical workflow would involve creating a Dataset object with the prediction directory
        outputted by a model and then passing it into this method.

        :param other: a Dataset object containing a predicted version of this dataset.
        :param leniency: a floating point value between [0,1] defining the leniency of the character spans to count as different. A value of zero considers only exact character matches while a positive value considers entities that differ by up to :code:`ceil(leniency * len(span)/2)` on either side.
        :return: two element tuple containing a label array (of entity names) and a matrix where rows are gold labels and columns are predicted labels. matrix[i][j] indicates that entities[i] in this dataset was predicted as entities[j] in 'annotation' matrix[i][j] times
        """
        if not isinstance(other, Dataset):
            raise ValueError("other must be instance of Dataset")

        # verify files are consistent
        diff = {d.file_name for d in self} - {d.file_name for d in other}
        if diff:
            raise ValueError(f"Dataset of predictions is missing the files: {repr(diff)}")

        # sort entities in ascending order by count.
        entities = [key for key, _ in sorted(self.compute_counts().items(), key=lambda x: x[1])]
        confusion_matrix = [[0 * len(entities)] * len(entities)]

        for gold_data_file in self:
            prediction_iter = iter(other)
            prediction_data_file = next(prediction_iter)
            while str(gold_data_file) != str(prediction_data_file):
                prediction_data_file = next(prediction_iter)

            gold_annotation = Annotations(gold_data_file.ann_path)
            pred_annotation = Annotations(prediction_data_file.ann_path)

            # compute matrix on the Annotation file level
            ann_confusion_matrix = gold_annotation.compute_confusion_matrix(pred_annotation, entities, leniency=leniency)
            for i in range(len(confusion_matrix)):
                for j in range(len(confusion_matrix)):
                    confusion_matrix[i][j] += ann_confusion_matrix[i][j]

        return entities, confusion_matrix

[docs]    def compute_ambiguity(self, dataset):
        """
        Finds occurrences of spans from 'dataset' that intersect with a span from this annotation but do not have this spans label.
        label. If 'dataset' comprises a models predictions, this method provides a strong indicators
        of a model's in-ability to dis-ambiguate between entities. For a full analysis, compute a confusion matrix.

        :param dataset: a Dataset object containing a predicted version of this dataset.
        :return: a dictionary containing the ambiguity computations on each gold, predicted file pair
        """
        if not isinstance(dataset, Dataset):
            raise ValueError("dataset must be instance of Dataset")

        # verify files are consistent
        diff = {d.file_name for d in self} - {d.file_name for d in dataset}
        if diff:
            raise ValueError(f"Dataset of predictions is missing the files: {repr(diff)}")

        # Dictionary storing ambiguity over dataset
        ambiguity_dict = {}

        for gold_data_file in self:
            prediction_iter = iter(dataset)
            prediction_data_file = next(prediction_iter)
            while str(gold_data_file) != str(prediction_data_file):
                prediction_data_file = next(prediction_iter)

            gold_annotation = Annotations(gold_data_file.ann_path)
            pred_annotation = Annotations(prediction_data_file.ann_path)

            # compute matrix on the Annotation file level
            ambiguity_dict[str(gold_data_file)] = gold_annotation.compute_ambiguity(pred_annotation)

        return ambiguity_dict

[docs]    def get_labels(self, as_list=False):
        """
        Get all of the entities/labels used in the dataset.
        :param as_list: bool for if to return the results as a list; defaults to False
        :return: A set of strings. Each string is a label used.
        """
        labels = set()

        for ann in self.generate_annotations():
            labels.update(ann.get_labels())

        if as_list:
            return list(labels)
        return labels

[docs]    def generate_annotations(self):
        """Generates Annotation objects for all the files in this Dataset"""
        for file in self:
            if file.ann_path is not None:
                yield Annotations(file.ann_path, source_text_path=file.txt_path)
            else:
                yield Annotations([])

    def __getitem__(self, item):
        """
        Creates and returns the Annotations object with the given file name, else raises FileNotFoundError;
        useful for getting Annotations objects from parallel Datasets
        :param item: the name of the file to be represented (not including the extension or parent directories)
        :return: an Annotations object
        """
        path = os.path.join(self.data_directory, item + '.ann')
        return Annotations(path)


[docs]def main():
    """CLI for retrieving dataset information"""
    parser = argparse.ArgumentParser(description='Calculate data about a given data directory')
    parser.add_argument('directory')
    args = parser.parse_args()

    dataset = Dataset(args.directory)

    entities = json.dumps(dataset.get_labels(as_list=True))
    counts = dataset.compute_counts()

    print(f"Entities: {entities}")
    pprint.pprint(counts)


if __name__ == '__main__':
    main()