Skip to content

Annotator

Bases: ABC

An abstract base class for annotators.

Annotators are responsible for processing medical notes and extracting relevant concepts from them.

Attributes:

Name Type Description
cat CAT

The MedCAT instance used for concept extraction.

config AnnotatorConfig

The configuration for the annotator.

Source code in miade/annotators.py
class Annotator(ABC):
    """
    An abstract base class for annotators.

    Annotators are responsible for processing medical notes and extracting relevant concepts from them.

    Attributes:
        cat (CAT): The MedCAT instance used for concept extraction.
        config (AnnotatorConfig): The configuration for the annotator.
        lookup_data_path (str): The directory the lookup data is loaded from.
        use_package_data (bool): True when the preconfigured (packaged) lookup data is used.
        paragraph_regex: The paragraph regex mappings built from `regex_para_chunk.csv`.
        structured_prob_lists (dict): Maps problem-type paragraph headings to a Relevance value.
        structured_med_lists (dict): Maps medication/allergy paragraph headings to a SubstanceCategory value.
        irrelevant_paragraphs (list): Paragraph types listed as irrelevant.
    """

    def __init__(self, cat: CAT, config: Optional[AnnotatorConfig] = None):
        """
        Initialises the annotator and loads the paragraph regex lookup.

        Args:
            cat (CAT): The MedCAT instance used for concept extraction.
            config (Optional[AnnotatorConfig]): The annotator configuration. A default
                `AnnotatorConfig()` is created when None.

        Raises:
            RuntimeError: If a `lookup_data_path` is configured but does not exist.
        """
        self.cat = cat
        self.config = config if config is not None else AnnotatorConfig()

        if self.config.negation_detection == "negex":
            self._add_negex_pipeline()

        # The lookup path must be resolved before the regex lookup is read from it
        self._set_lookup_data_path()
        self._load_paragraph_regex()

        # TODO make paragraph processing params configurable
        self.structured_prob_lists = {
            ParagraphType.prob: Relevance.PRESENT,
            ParagraphType.imp: Relevance.PRESENT,
            ParagraphType.pmh: Relevance.HISTORIC,
        }
        self.structured_med_lists = {
            ParagraphType.med: SubstanceCategory.TAKING,
            ParagraphType.allergy: SubstanceCategory.ADVERSE_REACTION,
        }
        self.irrelevant_paragraphs = [ParagraphType.ddx, ParagraphType.exam, ParagraphType.plan]

    def _add_negex_pipeline(self) -> None:
        """
        Adds the negex pipeline to the MedCAT instance.

        A "sentencizer" pipe is added and enabled on the underlying spaCy pipeline before the
        "negex" pipe is added.
        """
        self.cat.pipe.spacy_nlp.add_pipe("sentencizer")
        self.cat.pipe.spacy_nlp.enable_pipe("sentencizer")
        self.cat.pipe.spacy_nlp.add_pipe("negex")

    def _set_lookup_data_path(self) -> None:
        """
        Sets the lookup data path based on the configuration.

        If the `lookup_data_path` is not specified in the configuration, the default path "./data/" is used
        and `use_package_data` is set to True. Otherwise, the specified `lookup_data_path` is used and
        `use_package_data` is set to False.

        Raises:
            RuntimeError: If the specified `lookup_data_path` does not exist.
        """
        if self.config.lookup_data_path is None:
            self.lookup_data_path = "./data/"
            self.use_package_data = True
            log.info("Loading preconfigured lookup data")
        else:
            self.lookup_data_path = self.config.lookup_data_path
            self.use_package_data = False
            log.info(f"Loading lookup data from {self.lookup_data_path}")
            if not os.path.isdir(self.lookup_data_path):
                raise RuntimeError(f"No lookup data configured: {self.lookup_data_path} does not exist!")

    def _load_paragraph_regex(self) -> None:
        """
        Loads the paragraph regex mappings from a CSV file and initializes the paragraph_regex attribute.

        The file `regex_para_chunk.csv` is read from the lookup data path specified in config.
        If unspecified, the default packaged regex lookup for paragraph headings is loaded.

        Returns:
            None
        """
        # Join rather than concatenate so a configured path without a trailing separator still works
        regex_path = os.path.join(self.lookup_data_path, "regex_para_chunk.csv")
        data = load_lookup_data(regex_path, is_package_data=self.use_package_data, as_dict=True)
        self.paragraph_regex = load_regex_paragraph_mappings(data)

    @property
    @abstractmethod
    def concept_types(self):
        """
        Abstract property that should return a list of concept types supported by the annotator.
        """
        pass

    @property
    @abstractmethod
    def pipeline(self):
        """
        Abstract property that should return a list of pipeline steps for the annotator.
        """
        pass

    @abstractmethod
    def postprocess(self):
        """
        Abstract method that should implement the logic for post-processing extracted concepts.
        """
        pass

    @abstractmethod
    def run_pipeline(self):
        """
        Abstract method that runs the annotation pipeline on a given note and returns the extracted concepts.

        Note that `__call__` invokes this with `(note, record_concepts)` and, when a dosage extractor
        is supplied, with `(note, record_concepts, dosage_extractor)`.
        """
        pass

    def get_concepts(self, note: Note) -> List[Concept]:
        """
        Extracts concepts from a note using the MedCAT instance.

        Entities for which `Concept.from_entity` raises a ValueError are skipped with a warning.

        Args:
            note (Note): The input note to extract concepts from.

        Returns:
            The extracted concepts from the note.
        """
        concepts: List[Concept] = []
        for entity in self.cat.get_entities(note)["entities"].values():
            try:
                concepts.append(Concept.from_entity(entity))
                log.debug(f"Detected concept ({concepts[-1].id} | {concepts[-1].name})")
            except ValueError as e:
                log.warning(f"Concept skipped: {e}")

        return concepts

    def preprocess(self, note: Note, refine: bool = True) -> Note:
        """
        Preprocesses a note by cleaning its text and splitting it into paragraphs.

        Args:
            note (Note): The input note to preprocess.
            refine (bool): Whether to refine the paragraph detection algorithm and allow merging of
                continuous prose paragraphs, and merging of paragraphs with empty bodies with the
                next prose paragraphs. Default True.

        Returns:
            The preprocessed note (the same object that was passed in).
        """
        note.process(self.paragraph_regex, refine=refine)

        return note

    @staticmethod
    def filter_concepts_in_numbered_list(concepts: List[Concept], note: Note) -> List[Concept]:
        """
        Filters and returns a list of concepts in a numbered list in a note using a two-pointer algorithm.

        This filters out concepts that may not be relevant given a note that has structured list headings
        and numbered lists within that. i.e. only return the first line of a numbered list. e.g.
            1. CCF -
            - had echo on 15/6
            - on diuretics
        will only return the concept CCF as it is the first item in a numbered list

        The input list is not modified. When the note has no numbered lists the input list itself
        is returned unchanged; otherwise a new list ordered by concept start offset is returned.

        Args:
            concepts (List[Concept]): The list of concepts to filter.
            note (Note): The note containing numbered lists.

        Returns:
           The filtered list of concepts.
        """
        # Check there is a numbered list
        if len(note.numbered_lists) == 0:
            return concepts

        # Get the global list ranges of all numbered lists in a note
        global_list_ranges = [
            (numbered_list.list_start, numbered_list.list_end) for numbered_list in note.numbered_lists
        ]
        # Furthest end offset of any list, independent of the order the lists are stored in
        last_list_end = max(end for _, end in global_list_ranges)

        # Flatten the list items from all numbered lists into a single list and sort them
        list_items = sorted(
            (item for numbered_list in note.numbered_lists for item in numbered_list.items),
            key=lambda x: x.start,
        )

        # Work on a sorted copy so the caller's list is left untouched
        sorted_concepts = sorted(concepts, key=lambda x: x.start)

        filtered_concepts = []
        concept_idx, item_idx = 0, 0

        # Iterate through concepts and list items simultaneously
        while concept_idx < len(sorted_concepts) and item_idx < len(list_items):
            concept = sorted_concepts[concept_idx]
            item = list_items[item_idx]

            # Check if the concept is within the global range of any list
            if any(start <= concept.start < end for start, end in global_list_ranges):
                if concept.start >= item.start and concept.end <= item.end:
                    # Concept is fully within the current list item
                    filtered_concepts.append(concept)
                    concept_idx += 1  # Move to the next concept
                elif concept.end <= item.start:
                    # If the concept ends before the item starts, drop it and move to the next concept
                    concept_idx += 1
                else:
                    # Otherwise, move to the next list item
                    item_idx += 1
            else:
                # If concept is not within a numbered list range, keep it
                filtered_concepts.append(concept)
                concept_idx += 1

        # Once the list items are exhausted, only concepts located after every list are kept
        filtered_concepts.extend(
            concept for concept in sorted_concepts[concept_idx:] if concept.start >= last_list_end
        )

        return filtered_concepts

    @staticmethod
    def deduplicate(concepts: List[Concept], record_concepts: Optional[List[Concept]] = None) -> List[Concept]:
        """
        Removes duplicate concepts from the extracted concepts list by strict ID matching.

        Concepts with a null id are matched by name instead (VTM deduplication), both against the
        record and against every concept already kept. The original order of the kept concepts
        is preserved.

        Args:
            concepts (List[Concept]): The list of extracted concepts.
            record_concepts (Optional[List[Concept]]): The list of concepts from existing EHR records.

        Returns:
            The deduplicated list of concepts.
        """
        if record_concepts is not None:
            record_ids = {record_concept.id for record_concept in record_concepts}
            record_names = {record_concept.name for record_concept in record_concepts}
        else:
            record_ids = set()
            record_names = set()

        filtered_concepts: List[Concept] = []
        # Ids and names are tracked separately: keying names by id would make every null-id
        # concept overwrite the previous one and let later duplicates through
        seen_ids = set()
        seen_names = set()

        # Filter concepts that are in record or exist in concept list
        for concept in concepts:
            if concept.id is not None:
                if concept.id in record_ids or concept.id in seen_ids:
                    log.debug(f"Removed concept ({concept.id} | {concept.name}): concept id exists in record")
                    continue
                seen_ids.add(concept.id)
            # check name match for null ids - VTM deduplication
            elif concept.name in record_names or concept.name in seen_names:
                log.debug(f"Removed concept ({concept.id} | {concept.name}): concept name exists in record")
                continue

            filtered_concepts.append(concept)
            seen_names.add(concept.name)

        return filtered_concepts

    @staticmethod
    def add_dosages_to_concepts(
        dosage_extractor: DosageExtractor, concepts: List[Concept], note: Note
    ) -> List[Concept]:
        """
        Gets dosages for medication concepts

        For each concept, a dosage string is obtained with `get_dosage_string` from the concept, the
        concept that follows it in the list (if any) and the note text. Strings of more than two
        whitespace-separated tokens are passed to the dosage extractor. Concepts are modified in place.

        Args:
            dosage_extractor (DosageExtractor): The dosage extractor object
            concepts (List[Concept]): List of concepts extracted
            note (Note): The input note

        Returns:
            List of concepts with dosages for medication concepts
        """

        for ind, concept in enumerate(concepts):
            next_med_concept = concepts[ind + 1] if len(concepts) > ind + 1 else None
            dosage_string = get_dosage_string(concept, next_med_concept, note.text)
            if len(dosage_string.split()) > 2:
                concept.dosage = dosage_extractor(dosage_string)
                # The category is reset to None when no dosage could be extracted
                concept.category = Category.MEDICATION if concept.dosage is not None else None
                if concept.dosage is not None:
                    log.debug(
                        f"Extracted dosage for medication concept "
                        f"({concept.id} | {concept.name}): {concept.dosage.text} {concept.dosage.dose}"
                    )

        return concepts

    @staticmethod
    def add_numbering_to_name(concepts: List[Concept]) -> List[Concept]:
        """
        Adds numbering to the names of problem concepts to control output ordering.

        The concepts are renamed in place.

        Args:
            concepts (List[Concept]): The list of concepts to add numbering to.

        Returns:
            The list of concepts with numbering added to their names.
        """
        # Prepend numbering to problem concepts e.g. 01 asthma, 02 stroke...
        for i, concept in enumerate(concepts):
            concept.name = f"{(i+1):02} {concept.name}"

        return concepts

    def __call__(
        self,
        note: Note,
        record_concepts: Optional[List[Concept]] = None,
        dosage_extractor: Optional[DosageExtractor] = None,
    ) -> List[Concept]:
        """
        Runs the annotation pipeline on a given note and returns the extracted concepts.

        Args:
            note (Note): The input note to process.
            record_concepts (Optional[List[Concept]]): The list of concepts from existing EHR records.
            dosage_extractor (Optional[DosageExtractor]): The dosage extractor to use for extracting dosage information.

        Returns:
            The extracted concepts from the note.
        """
        # The dosage extractor is only forwarded when one was supplied
        if dosage_extractor is not None:
            concepts = self.run_pipeline(note, record_concepts, dosage_extractor)
        else:
            concepts = self.run_pipeline(note, record_concepts)

        if self.config.add_numbering:
            concepts = self.add_numbering_to_name(concepts)

        return concepts

concept_types abstractmethod property

Abstract property that should return a list of concept types supported by the annotator.

pipeline abstractmethod property

Abstract property that should return a list of pipeline steps for the annotator.

__call__(note, record_concepts=None, dosage_extractor=None)

Runs the annotation pipeline on a given note and returns the extracted concepts.

Parameters:

Name Type Description Default
note Note

The input note to process.

required
record_concepts Optional[List[Concept]]

The list of concepts from existing EHR records.

None
dosage_extractor Optional[DosageExtractor]

The dosage extractor to use for extracting dosage information.

None

Returns:

Type Description
List[Concept]

The extracted concepts from the note.

Source code in miade/annotators.py
def __call__(
    self,
    note: Note,
    record_concepts: Optional[List[Concept]] = None,
    dosage_extractor: Optional[DosageExtractor] = None,
) -> List[Concept]:
    """
    Annotate a note: run the pipeline and optionally number the resulting concept names.

    Args:
        note (Note): The input note to process.
        record_concepts (Optional[List[Concept]]): The list of concepts from existing EHR records.
        dosage_extractor (Optional[DosageExtractor]): The dosage extractor to use for extracting dosage
            information. It is only passed on to `run_pipeline` when it is not None.

    Returns:
        The extracted concepts from the note.
    """
    # Build the positional arguments once; the extractor is appended only when supplied
    pipeline_args = [note, record_concepts]
    if dosage_extractor is not None:
        pipeline_args.append(dosage_extractor)

    concepts = self.run_pipeline(*pipeline_args)

    if not self.config.add_numbering:
        return concepts

    return self.add_numbering_to_name(concepts)

add_dosages_to_concepts(dosage_extractor, concepts, note) staticmethod

Gets dosages for medication concepts

Parameters:

Name Type Description Default
dosage_extractor DosageExtractor

The dosage extractor object

required
concepts List[Concept]

List of concepts extracted

required
note Note

The input note

required

Returns:

Type Description
List[Concept]

List of concepts with dosages for medication concepts

Source code in miade/annotators.py
@staticmethod
def add_dosages_to_concepts(
    dosage_extractor: DosageExtractor, concepts: List[Concept], note: Note
) -> List[Concept]:
    """
    Attach dosage information to the concepts of a note, modifying them in place.

    Each concept is paired with the concept that follows it in the list (None for the last one)
    to obtain a dosage string via `get_dosage_string`. Only dosage strings with more than two
    whitespace-separated tokens are passed to the dosage extractor.

    Args:
        dosage_extractor (DosageExtractor): The dosage extractor object
        concepts (List[Concept]): List of concepts extracted
        note (Note): The input note

    Returns:
        List of concepts with dosages for medication concepts
    """
    for ind, concept in enumerate(concepts):
        has_successor = ind + 1 < len(concepts)
        next_med_concept = concepts[ind + 1] if has_successor else None

        dosage_string = get_dosage_string(concept, next_med_concept, note.text)
        if len(dosage_string.split()) <= 2:
            # Too short to hold a dosage; leave the concept untouched
            continue

        concept.dosage = dosage_extractor(dosage_string)
        if concept.dosage is None:
            # Nothing extracted: the category is cleared
            concept.category = None
            continue

        concept.category = Category.MEDICATION
        log.debug(
            f"Extracted dosage for medication concept "
            f"({concept.id} | {concept.name}): {concept.dosage.text} {concept.dosage.dose}"
        )

    return concepts

add_numbering_to_name(concepts) staticmethod

Adds numbering to the names of problem concepts to control output ordering.

Parameters:

Name Type Description Default
concepts List[Concept]

The list of concepts to add numbering to.

required

Returns:

Type Description
List[Concept]

The list of concepts with numbering added to their names.

Source code in miade/annotators.py
@staticmethod
def add_numbering_to_name(concepts: List[Concept]) -> List[Concept]:
    """
    Prefix each concept name with its 1-based position, zero-padded to two digits.

    The numbering (e.g. "01 asthma", "02 stroke") is used to control output ordering.
    Concepts are renamed in place.

    Args:
        concepts (List[Concept]): The list of concepts to add numbering to.

    Returns:
        The same list of concepts, with numbering added to their names.
    """
    for position, concept in enumerate(concepts, start=1):
        numbered_name = f"{position:02} {concept.name}"
        concept.name = numbered_name

    return concepts

deduplicate(concepts, record_concepts=None) staticmethod

Removes duplicate concepts from the extracted concepts list by strict ID matching.

Parameters:

Name Type Description Default
concepts List[Concept]

The list of extracted concepts.

required
record_concepts Optional[List[Concept]]

The list of concepts from existing EHR records.

None

Returns:

Type Description
List[Concept]

The deduplicated list of concepts.

Source code in miade/annotators.py
@staticmethod
def deduplicate(concepts: List[Concept], record_concepts: Optional[List[Concept]] = None) -> List[Concept]:
    """
    Removes duplicate concepts from the extracted concepts list by strict ID matching.

    Concepts with a null id are matched by name instead (VTM deduplication), both against the
    record and against every concept already kept. The original order of the kept concepts
    is preserved.

    Args:
        concepts (List[Concept]): The list of extracted concepts.
        record_concepts (Optional[List[Concept]]): The list of concepts from existing EHR records.

    Returns:
        The deduplicated list of concepts.
    """
    if record_concepts is not None:
        record_ids = {record_concept.id for record_concept in record_concepts}
        record_names = {record_concept.name for record_concept in record_concepts}
    else:
        record_ids = set()
        record_names = set()

    filtered_concepts: List[Concept] = []
    # Ids and names are tracked separately: keying names by id would make every null-id
    # concept overwrite the previous one and let later duplicates through
    seen_ids = set()
    seen_names = set()

    # Filter concepts that are in record or exist in concept list
    for concept in concepts:
        if concept.id is not None:
            if concept.id in record_ids or concept.id in seen_ids:
                log.debug(f"Removed concept ({concept.id} | {concept.name}): concept id exists in record")
                continue
            seen_ids.add(concept.id)
        # check name match for null ids - VTM deduplication
        elif concept.name in record_names or concept.name in seen_names:
            log.debug(f"Removed concept ({concept.id} | {concept.name}): concept name exists in record")
            continue

        filtered_concepts.append(concept)
        seen_names.add(concept.name)

    return filtered_concepts

filter_concepts_in_numbered_list(concepts, note) staticmethod

Filters and returns a list of concepts in a numbered list in a note using a two-pointer algorithm.

This filters out concepts that may not be relevant in a note that has structured list headings with numbered lists inside them, i.e. only concepts on the first line of each numbered item are returned. For example, given the item "1. CCF -" followed by the sub-lines "- had echo on 15/6" and "- on diuretics", only the concept CCF is returned, as it is on the first line of the numbered item.

Parameters:

Name Type Description Default
concepts List[Concept]

The list of concepts to filter.

required
note Note

The note containing numbered lists.

required

Returns:

Type Description
List[Concept]

The filtered list of concepts.

Source code in miade/annotators.py
@staticmethod
def filter_concepts_in_numbered_list(concepts: List[Concept], note: Note) -> List[Concept]:
    """
    Filters and returns a list of concepts in a numbered list in a note using a two-pointer algorithm.

    This filters out concepts that may not be relevant given a note that has structured list headings
    and numbered lists within that. i.e. only return the first line of a numbered list. e.g.
        1. CCF -
        - had echo on 15/6
        - on diuretics
    will only return the concept CCF as it is the first item in a numbered list

    The input list is not modified. When the note has no numbered lists the input list itself
    is returned unchanged; otherwise a new list ordered by concept start offset is returned.

    Args:
        concepts (List[Concept]): The list of concepts to filter.
        note (Note): The note containing numbered lists.

    Returns:
       The filtered list of concepts.
    """
    # Check there is a numbered list
    if len(note.numbered_lists) == 0:
        return concepts

    # Get the global list ranges of all numbered lists in a note
    global_list_ranges = [
        (numbered_list.list_start, numbered_list.list_end) for numbered_list in note.numbered_lists
    ]
    # Furthest end offset of any list, independent of the order the lists are stored in
    last_list_end = max(end for _, end in global_list_ranges)

    # Flatten the list items from all numbered lists into a single list and sort them
    list_items = sorted(
        (item for numbered_list in note.numbered_lists for item in numbered_list.items),
        key=lambda x: x.start,
    )

    # Work on a sorted copy so the caller's list is left untouched
    sorted_concepts = sorted(concepts, key=lambda x: x.start)

    filtered_concepts = []
    concept_idx, item_idx = 0, 0

    # Iterate through concepts and list items simultaneously
    while concept_idx < len(sorted_concepts) and item_idx < len(list_items):
        concept = sorted_concepts[concept_idx]
        item = list_items[item_idx]

        # Check if the concept is within the global range of any list
        if any(start <= concept.start < end for start, end in global_list_ranges):
            if concept.start >= item.start and concept.end <= item.end:
                # Concept is fully within the current list item
                filtered_concepts.append(concept)
                concept_idx += 1  # Move to the next concept
            elif concept.end <= item.start:
                # If the concept ends before the item starts, drop it and move to the next concept
                concept_idx += 1
            else:
                # Otherwise, move to the next list item
                item_idx += 1
        else:
            # If concept is not within a numbered list range, keep it
            filtered_concepts.append(concept)
            concept_idx += 1

    # Once the list items are exhausted, only concepts located after every list are kept
    filtered_concepts.extend(
        concept for concept in sorted_concepts[concept_idx:] if concept.start >= last_list_end
    )

    return filtered_concepts

get_concepts(note)

Extracts concepts from a note using the MedCAT instance.

Parameters:

Name Type Description Default
note Note

The input note to extract concepts from.

required

Returns:

Type Description
List[Concept]

The extracted concepts from the note.

Source code in miade/annotators.py
def get_concepts(self, note: Note) -> List[Concept]:
    """
    Run the MedCAT instance over a note and convert the detected entities into concepts.

    Entities for which `Concept.from_entity` raises a ValueError are skipped with a warning.

    Args:
        note (Note): The input note to extract concepts from.

    Returns:
        The extracted concepts from the note.
    """
    concepts: List[Concept] = []
    detected_entities = self.cat.get_entities(note)["entities"]

    for entity in detected_entities.values():
        try:
            parsed = Concept.from_entity(entity)
        except ValueError as e:
            log.warning(f"Concept skipped: {e}")
            continue

        concepts.append(parsed)
        log.debug(f"Detected concept ({concepts[-1].id} | {concepts[-1].name})")

    return concepts

postprocess() abstractmethod

Abstract method that should implement the logic for post-processing extracted concepts.

Source code in miade/annotators.py
@abstractmethod
def postprocess(self):
    """
    Post-process the extracted concepts.

    Abstract: concrete annotators must provide the implementation; this base version does nothing.
    """

preprocess(note, refine=True)

Preprocesses a note by cleaning its text and splitting it into paragraphs.

Parameters:

Name Type Description Default
note Note

The input note to preprocess.

required
refine bool

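Whether to refine the paragraph detection algorithm and allow merging of continuous prose paragraphs, and merging of paragraphs with empty bodies with the next prose paragraphs. Default True.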

True

Returns:

Type Description
Note

The preprocessed note.

Source code in miade/annotators.py
def preprocess(self, note: Note, refine: bool = True) -> Note:
    """
    Process a note in place with the annotator's paragraph regex and hand it back.

    Args:
        note (Note): The input note to preprocess.
        refine (bool): Whether to refine the paragraph detection algorithm and allow merging of
            continuous prose paragraphs, and merging of paragraphs with empty bodies with the
            next prose paragraphs. Default True.

    Returns:
        The preprocessed note (the same object that was passed in).
    """
    heading_patterns = self.paragraph_regex
    note.process(heading_patterns, refine=refine)
    return note

run_pipeline() abstractmethod

Abstract method that runs the annotation pipeline on a given note and returns the extracted concepts.

Source code in miade/annotators.py
@abstractmethod
def run_pipeline(self):
    """
    Run the annotation pipeline on a given note and return the extracted concepts.

    Abstract: concrete annotators must provide the implementation; this base version does nothing.
    """