Source code for medacy.pipeline_components.units.frequency_unit_component

from spacy.matcher import Matcher
from spacy.tokens import Span
from spacy.tokens import Token

from medacy.pipeline_components.feature_overlayers.base import BaseOverlayer


class FrequencyUnitOverlayer(BaseOverlayer):
    """
    A pipeline component that tags Frequency units

    The component matches a fixed, case-insensitive list of frequency
    expressions (``bid``, ``prn``, ``qid``, ``tid``, ``qd``, ``daily``, ``hs``,
    ``as needed``, ``once a day``, ``twice a day``). For every match it:

    * sets ``token._.feature_is_frequency_indicator = True`` on each matched token,
    * merges multi-token matches into a single token, and
    * appends the matched span to ``doc.ents`` with the label ``frequency_indicator``.
    """

    name = "frequency_unit_annotator"
    dependencies = []

    def __init__(self, spacy_pipeline):
        """
        Register the token extension, the entity label and the match patterns.

        :param spacy_pipeline: the spaCy pipeline this component works with; its
            vocab backs the matcher and its ``entity`` component receives the
            ``frequency_indicator`` label.
        """
        self.nlp = spacy_pipeline

        # Token extensions are registered globally on the Token class; registering
        # the same name a second time raises, so only register it when it is not
        # there yet. This lets the component be instantiated more than once per
        # process.
        if not Token.has_extension('feature_is_frequency_indicator'):
            Token.set_extension('feature_is_frequency_indicator', default=False)

        self.nlp.entity.add_label('frequency_indicator')

        self.frequency_matcher = Matcher(self.nlp.vocab)
        # Each pattern is a list of per-token specs; 'LOWER' compares against the
        # lower-cased token text, which makes the matching case-insensitive.
        self.frequency_matcher.add(
            'FREQUENCY_MATCHER', None,
            [{'LOWER': 'bid'}],
            [{'LOWER': 'prn'}],
            [{'LOWER': 'qid'}],
            [{'LOWER': 'tid'}],
            [{'LOWER': 'qd'}],
            [{'LOWER': 'daily'}],
            [{'LOWER': 'hs'}],
            [{'LOWER': 'as'}, {'LOWER': 'needed'}],
            [{'LOWER': 'once'}, {'LOWER': 'a'}, {'LOWER': 'day'}],
            [{'LOWER': 'twice'}, {'LOWER': 'a'}, {'LOWER': 'day'}]
        )

    def __call__(self, doc):
        """
        Tag every frequency expression found in ``doc``.

        :param doc: the spaCy Doc to annotate; it is modified in place.
        :return: the same ``doc`` object, after annotation.
        """
        # The label is the same for every match, so look it up once.
        label = self.nlp.vocab.strings['frequency_indicator']

        with doc.retokenize() as retokenizer:
            # match and tag frequency indicators
            for _match_id, start, end in self.frequency_matcher(doc):
                span = Span(doc, start, end, label=label)
                for token in span:
                    token._.feature_is_frequency_indicator = True
                # Only multi-token matches need to be collapsed into one token.
                if len(span) > 1:
                    retokenizer.merge(span)
                # NOTE(review): assigning doc.ents presumably rejects a span that
                # overlaps an already-set entity -- confirm for the spaCy version
                # in use before relying on this with an upstream NER component.
                doc.ents = list(doc.ents) + [span]
        return doc