"""Pipeline component that overlays unit features (medacy.pipeline_components.units.unit_component)."""

import logging

from spacy.matcher import Matcher
from spacy.tokens import Span
from spacy.tokens import Token

from medacy.pipeline_components.feature_overlayers.base import BaseOverlayer


class UnitOverlayer(BaseOverlayer):
    """
    A pipeline component that tags units. Begins by first tagging all mass, volume, time, and form
    units then aggregates as necessary (units of measurement, measurements, durations).
    """

    name = "unit_annotator"
    dependencies = []

    def __init__(self, nlp):
        """
        :param nlp: the spaCy Language pipeline this component is attached to; token extensions
            and entity labels for each unit category are registered on it.
        """
        self.nlp = nlp

        # One boolean token extension plus one entity label per unit category.
        # force=True throughout so re-instantiating the component does not raise on
        # already-registered extensions (the original omitted it only for
        # 'feature_is_duration_pattern', which made a second construction fail).
        for extension, label in (
            ('feature_is_mass_unit', 'mass_unit'),
            ('feature_is_volume_unit', 'volume_unit'),
            ('feature_is_time_unit', 'time_unit'),
            ('feature_is_route_type', 'route_type'),
            ('feature_is_form_unit', 'form_unit'),
            ('feature_is_frequency_indicator', 'frequency_indicator'),
            ('feature_is_measurement_unit', 'measurement_unit'),
            ('feature_is_measurement', 'measurement'),
            ('feature_is_duration_pattern', 'duration_pattern'),
        ):
            Token.set_extension(extension, default=False, force=True)
            nlp.entity.add_label(label)

        # One Matcher per category (spaCy 2.x API: add(key, on_match, *patterns)).
        self.mass_matcher = Matcher(nlp.vocab)
        self.volume_matcher = Matcher(nlp.vocab)
        self.time_matcher = Matcher(nlp.vocab)
        self.route_matcher = Matcher(nlp.vocab)
        self.form_matcher = Matcher(nlp.vocab)
        self.unit_of_measurement_matcher = Matcher(nlp.vocab)
        self.measurement_matcher = Matcher(nlp.vocab)
        self.frequency_matcher = Matcher(nlp.vocab)
        self.duration_matcher = Matcher(nlp.vocab)

        self.mass_matcher.add(
            'UNIT_OF_MASS', None,
            [{'LOWER': 'mcg'}], [{'LOWER': 'microgram'}], [{'LOWER': 'micrograms'}],
            [{'ORTH': 'mg'}], [{'LOWER': 'milligram'}], [{'LOWER': 'g'}],
            [{'LOWER': 'kg'}], [{'ORTH': 'mEq'}]
        )

        self.volume_matcher.add(
            'UNIT_OF_VOLUME', None,
            [{'LOWER': 'ml'}], [{'ORTH': 'dL'}], [{'LOWER': 'cc'}], [{'ORTH': 'L'}]
        )

        self.time_matcher.add(
            'UNIT_OF_TIME', None,
            [{'LOWER': 'sec'}], [{'LOWER': 'second'}], [{'LOWER': 'seconds'}],
            [{'LOWER': 'min'}], [{'LOWER': 'minute'}], [{'LOWER': 'minutes'}],
            [{'LOWER': 'hr'}], [{'LOWER': 'hour'}],
            [{'LOWER': 'day'}], [{'LOWER': 'days'}],
            [{'LOWER': 'week'}], [{'LOWER': 'weeks'}],
            [{'LOWER': 'month'}], [{'LOWER': 'months'}],
            [{'LOWER': 'year'}], [{'LOWER': 'years'}], [{'LOWER': 'yrs'}]
        )

        self.frequency_matcher.add(
            'FREQUENCY_MATCHER', None,
            [{'LOWER': 'bid'}], [{'LOWER': 'prn'}], [{'LOWER': 'qid'}],
            [{'LOWER': 'tid'}], [{'LOWER': 'qd'}], [{'LOWER': 'daily'}],
            [{'LOWER': 'hs'}],
            [{'LOWER': 'as'}, {'LOWER': 'needed'}],
            [{'LOWER': 'once'}, {'LOWER': 'a'}, {'LOWER': 'day'}],
            [{'LOWER': 'twice'}, {'LOWER': 'a'}, {'LOWER': 'day'}]
        )

        # NOTE: duplicate [{'LEMMA': 'unit'}] entry from the original was removed (no-op).
        self.form_matcher.add(
            'UNIT_OF_FORM', None,
            [{'ORTH': 'dose'}], [{'ORTH': 'doses'}],
            [{'LEMMA': 'pill'}], [{'LEMMA': 'tablet'}], [{'LEMMA': 'unit'}],
            [{'LEMMA': 'u'}], [{'LEMMA': 'patch'}],
            [{'ORTH': 'lotion'}], [{'ORTH': 'powder'}], [{'ORTH': 'amps'}],
            [{'LOWER': 'actuation'}], [{'LEMMA': 'suspension'}], [{'LEMMA': 'syringe'}],
            [{'LEMMA': 'puff'}], [{'LEMMA': 'liquid'}], [{'LEMMA': 'aerosol'}],
            [{'LEMMA': 'cap'}]
        )

        # Bug fix: the original used {'LOWER': 'IV'}, which can never match because
        # the LOWER attribute is always the lowercased token text.
        self.route_matcher.add(
            'TYPE_OF_ROUTE', None,
            [{'LOWER': 'iv'}], [{'ORTH': 'intravenous'}], [{'LOWER': 'po'}],
            [{'ORTH': 'gtt'}], [{'LOWER': 'drip'}], [{'LOWER': 'inhalation'}],
            [{'LOWER': 'by'}, {'LOWER': 'mouth'}],
            [{'LOWER': 'topical'}], [{'LOWER': 'subcutaneous'}], [{'LOWER': 'ophthalmic'}],
            [{'LEMMA': 'injection'}],
            [{'LOWER': 'mucous'}, {'LOWER': 'membrane'}],
            [{'LOWER': 'oral'}], [{'LOWER': 'nebs'}], [{'LOWER': 'transdermal'}],
            [{'LOWER': 'nasal'}]
        )

        # Compound units built from the base categories tagged above (e.g. mg/mL).
        self.unit_of_measurement_matcher.add(
            'UNIT_OF_MEASUREMENT', None,
            [{'ENT_TYPE': 'mass_unit'}, {'ORTH': '/'}, {'ENT_TYPE': 'volume_unit'}],
            [{'ENT_TYPE': 'volume_unit'}, {'ORTH': '/'}, {'ENT_TYPE': 'time_unit'}],
            [{'ENT_TYPE': 'form_unit'}, {'ORTH': '/'}, {'ENT_TYPE': 'volume_unit'}]
        )

        # Number + unit (or percentage) pairs, e.g. "50 mg", "2 x tablet", "10 %".
        self.measurement_matcher.add(
            'MEASUREMENT', None,
            [{'LIKE_NUM': True}, {'ORTH': '%'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'measurement_unit'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'mass_unit'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'volume_unit'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'form_unit'}],
            [{'LIKE_NUM': True}, {'LOWER': 'x'}, {'ENT_TYPE': 'form_unit'}]
        )

        self.duration_matcher.add(
            'DURATION', None,
            [{'POS': 'PREP'}, {'LIKE_NUM': True}, {'ENT_TYPE': 'time_unit'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'time_unit'}],
            [{'LOWER': 'in'}, {'LIKE_NUM': True}, {'ENT_TYPE': 'time_unit'}],
            [{'LOWER': 'prn'}]
        )

    def _tag_matches(self, doc, matcher, label, extension):
        """Run *matcher* over *doc*: set *extension* True on every matched token, merge
        multi-token matches into a single token, and append each match to doc.ents as an
        entity labelled *label*.

        Merging two overlapping spans raises ValueError; the conflicting merge is skipped
        so one bad overlap cannot crash the pipeline. (The original code applied this
        guard to every category except time units -- fixed here.)
        """
        with doc.retokenize() as retokenizer:
            for match_id, start, end in matcher(doc):
                span = Span(doc, start, end, label=self.nlp.vocab.strings[label])
                for token in span:
                    token._.set(extension, True)
                try:
                    if len(span) > 1:
                        retokenizer.merge(span)
                except ValueError:
                    pass  # overlapping span already merged; keep going
                doc.ents = list(doc.ents) + [span]

    def __call__(self, doc):
        """Tag all unit categories on *doc* and return it.

        Order matters: base units (mass/volume/time) must be tagged before the compound
        categories (duration, measurement_unit, measurement) whose patterns reference
        them via ENT_TYPE.
        """
        logging.debug("Called UnitAnnotator Component")
        self._tag_matches(doc, self.mass_matcher, 'mass_unit', 'feature_is_mass_unit')
        self._tag_matches(doc, self.volume_matcher, 'volume_unit', 'feature_is_volume_unit')
        self._tag_matches(doc, self.time_matcher, 'time_unit', 'feature_is_time_unit')
        self._tag_matches(doc, self.duration_matcher, 'duration_pattern', 'feature_is_duration_pattern')
        self._tag_matches(doc, self.frequency_matcher, 'frequency_indicator', 'feature_is_frequency_indicator')
        self._tag_matches(doc, self.form_matcher, 'form_unit', 'feature_is_form_unit')
        self._tag_matches(doc, self.route_matcher, 'route_type', 'feature_is_route_type')
        self._tag_matches(doc, self.unit_of_measurement_matcher, 'measurement_unit', 'feature_is_measurement_unit')
        self._tag_matches(doc, self.measurement_matcher, 'measurement', 'feature_is_measurement')
        return doc