"""Pipeline component that overlays unit features (medacy.pipeline_components.units.unit_component)."""

import logging

from spacy.matcher import Matcher
from spacy.tokens import Span
from spacy.tokens import Token

from medacy.pipeline_components.feature_overlayers.base import BaseOverlayer


class UnitOverlayer(BaseOverlayer):
    """
    A pipeline component that tags units. Begins by first tagging all mass, volume, time, and form
    units then aggregates as necessary (units of measurement, measurements, durations).
    """

    name = "unit_annotator"
    dependencies = []

    def __init__(self, nlp):
        """
        :param nlp: the spaCy Language pipeline this component is attached to; token extensions
            and entity labels for each unit category are registered on it.
        """
        self.nlp = nlp

        # One boolean token extension plus one entity label per unit category.
        # force=True throughout so re-instantiating the component does not raise on
        # already-registered extensions (the original omitted it only for
        # 'feature_is_duration_pattern', which made a second construction fail).
        for extension, label in (
            ('feature_is_mass_unit', 'mass_unit'),
            ('feature_is_volume_unit', 'volume_unit'),
            ('feature_is_time_unit', 'time_unit'),
            ('feature_is_route_type', 'route_type'),
            ('feature_is_form_unit', 'form_unit'),
            ('feature_is_frequency_indicator', 'frequency_indicator'),
            ('feature_is_measurement_unit', 'measurement_unit'),
            ('feature_is_measurement', 'measurement'),
            ('feature_is_duration_pattern', 'duration_pattern'),
        ):
            Token.set_extension(extension, default=False, force=True)
            nlp.entity.add_label(label)

        # One Matcher per category (spaCy 2.x API: add(key, on_match, *patterns)).
        self.mass_matcher = Matcher(nlp.vocab)
        self.volume_matcher = Matcher(nlp.vocab)
        self.time_matcher = Matcher(nlp.vocab)
        self.route_matcher = Matcher(nlp.vocab)
        self.form_matcher = Matcher(nlp.vocab)
        self.unit_of_measurement_matcher = Matcher(nlp.vocab)
        self.measurement_matcher = Matcher(nlp.vocab)
        self.frequency_matcher = Matcher(nlp.vocab)
        self.duration_matcher = Matcher(nlp.vocab)

        self.mass_matcher.add(
            'UNIT_OF_MASS', None,
            [{'LOWER': 'mcg'}], [{'LOWER': 'microgram'}], [{'LOWER': 'micrograms'}],
            [{'ORTH': 'mg'}], [{'LOWER': 'milligram'}], [{'LOWER': 'g'}],
            [{'LOWER': 'kg'}], [{'ORTH': 'mEq'}]
        )

        self.volume_matcher.add(
            'UNIT_OF_VOLUME', None,
            [{'LOWER': 'ml'}], [{'ORTH': 'dL'}], [{'LOWER': 'cc'}], [{'ORTH': 'L'}]
        )

        self.time_matcher.add(
            'UNIT_OF_TIME', None,
            [{'LOWER': 'sec'}], [{'LOWER': 'second'}], [{'LOWER': 'seconds'}],
            [{'LOWER': 'min'}], [{'LOWER': 'minute'}], [{'LOWER': 'minutes'}],
            [{'LOWER': 'hr'}], [{'LOWER': 'hour'}],
            [{'LOWER': 'day'}], [{'LOWER': 'days'}],
            [{'LOWER': 'week'}], [{'LOWER': 'weeks'}],
            [{'LOWER': 'month'}], [{'LOWER': 'months'}],
            [{'LOWER': 'year'}], [{'LOWER': 'years'}], [{'LOWER': 'yrs'}]
        )

        self.frequency_matcher.add(
            'FREQUENCY_MATCHER', None,
            [{'LOWER': 'bid'}], [{'LOWER': 'prn'}], [{'LOWER': 'qid'}],
            [{'LOWER': 'tid'}], [{'LOWER': 'qd'}], [{'LOWER': 'daily'}],
            [{'LOWER': 'hs'}],
            [{'LOWER': 'as'}, {'LOWER': 'needed'}],
            [{'LOWER': 'once'}, {'LOWER': 'a'}, {'LOWER': 'day'}],
            [{'LOWER': 'twice'}, {'LOWER': 'a'}, {'LOWER': 'day'}]
        )

        # NOTE: duplicate [{'LEMMA': 'unit'}] entry from the original was removed (no-op).
        self.form_matcher.add(
            'UNIT_OF_FORM', None,
            [{'ORTH': 'dose'}], [{'ORTH': 'doses'}],
            [{'LEMMA': 'pill'}], [{'LEMMA': 'tablet'}], [{'LEMMA': 'unit'}],
            [{'LEMMA': 'u'}], [{'LEMMA': 'patch'}],
            [{'ORTH': 'lotion'}], [{'ORTH': 'powder'}], [{'ORTH': 'amps'}],
            [{'LOWER': 'actuation'}], [{'LEMMA': 'suspension'}], [{'LEMMA': 'syringe'}],
            [{'LEMMA': 'puff'}], [{'LEMMA': 'liquid'}], [{'LEMMA': 'aerosol'}],
            [{'LEMMA': 'cap'}]
        )

        # Bug fix: the original used {'LOWER': 'IV'}, which can never match because
        # the LOWER attribute is always the lowercased token text.
        self.route_matcher.add(
            'TYPE_OF_ROUTE', None,
            [{'LOWER': 'iv'}], [{'ORTH': 'intravenous'}], [{'LOWER': 'po'}],
            [{'ORTH': 'gtt'}], [{'LOWER': 'drip'}], [{'LOWER': 'inhalation'}],
            [{'LOWER': 'by'}, {'LOWER': 'mouth'}],
            [{'LOWER': 'topical'}], [{'LOWER': 'subcutaneous'}], [{'LOWER': 'ophthalmic'}],
            [{'LEMMA': 'injection'}],
            [{'LOWER': 'mucous'}, {'LOWER': 'membrane'}],
            [{'LOWER': 'oral'}], [{'LOWER': 'nebs'}], [{'LOWER': 'transdermal'}],
            [{'LOWER': 'nasal'}]
        )

        # Compound units built from the base categories tagged above (e.g. mg/mL).
        self.unit_of_measurement_matcher.add(
            'UNIT_OF_MEASUREMENT', None,
            [{'ENT_TYPE': 'mass_unit'}, {'ORTH': '/'}, {'ENT_TYPE': 'volume_unit'}],
            [{'ENT_TYPE': 'volume_unit'}, {'ORTH': '/'}, {'ENT_TYPE': 'time_unit'}],
            [{'ENT_TYPE': 'form_unit'}, {'ORTH': '/'}, {'ENT_TYPE': 'volume_unit'}]
        )

        # Number + unit (or percentage) pairs, e.g. "50 mg", "2 x tablet", "10 %".
        self.measurement_matcher.add(
            'MEASUREMENT', None,
            [{'LIKE_NUM': True}, {'ORTH': '%'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'measurement_unit'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'mass_unit'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'volume_unit'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'form_unit'}],
            [{'LIKE_NUM': True}, {'LOWER': 'x'}, {'ENT_TYPE': 'form_unit'}]
        )

        self.duration_matcher.add(
            'DURATION', None,
            [{'POS': 'PREP'}, {'LIKE_NUM': True}, {'ENT_TYPE': 'time_unit'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'time_unit'}],
            [{'LOWER': 'in'}, {'LIKE_NUM': True}, {'ENT_TYPE': 'time_unit'}],
            [{'LOWER': 'prn'}]
        )

    def _tag_matches(self, doc, matcher, label, extension):
        """Run *matcher* over *doc*: set *extension* True on every matched token, merge
        multi-token matches into a single token, and append each match to doc.ents as an
        entity labelled *label*.

        Merging two overlapping spans raises ValueError; the conflicting merge is skipped
        so one bad overlap cannot crash the pipeline. (The original code applied this
        guard to every category except time units -- fixed here.)
        """
        with doc.retokenize() as retokenizer:
            for match_id, start, end in matcher(doc):
                span = Span(doc, start, end, label=self.nlp.vocab.strings[label])
                for token in span:
                    token._.set(extension, True)
                try:
                    if len(span) > 1:
                        retokenizer.merge(span)
                except ValueError:
                    pass  # overlapping span already merged; keep going
                doc.ents = list(doc.ents) + [span]

    def __call__(self, doc):
        """Tag all unit categories on *doc* and return it.

        Order matters: base units (mass/volume/time) must be tagged before the compound
        categories (duration, measurement_unit, measurement) whose patterns reference
        them via ENT_TYPE.
        """
        logging.debug("Called UnitAnnotator Component")
        self._tag_matches(doc, self.mass_matcher, 'mass_unit', 'feature_is_mass_unit')
        self._tag_matches(doc, self.volume_matcher, 'volume_unit', 'feature_is_volume_unit')
        self._tag_matches(doc, self.time_matcher, 'time_unit', 'feature_is_time_unit')
        self._tag_matches(doc, self.duration_matcher, 'duration_pattern', 'feature_is_duration_pattern')
        self._tag_matches(doc, self.frequency_matcher, 'frequency_indicator', 'feature_is_frequency_indicator')
        self._tag_matches(doc, self.form_matcher, 'form_unit', 'feature_is_form_unit')
        self._tag_matches(doc, self.route_matcher, 'route_type', 'feature_is_route_type')
        self._tag_matches(doc, self.unit_of_measurement_matcher, 'measurement_unit', 'feature_is_measurement_unit')
        self._tag_matches(doc, self.measurement_matcher, 'measurement', 'feature_is_measurement')
        return doc