Source code for ehrcorral.ehrcorral

# -*- coding: utf-8 -*-
"""Contains core classes and functions for defining populations and acting upon
them.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import sys
from collections import namedtuple, defaultdict

import numpy as np
from pylev import damerau_levenshtein

try:
    from collections import Counter
except ImportError:
    from backport_collections import Counter
from .compressions import first_letter, dmetaphone
from .measures import record_similarity

# Make unicode compatible with Python 2 and 3
try:
    unicode = unicode
except NameError:
    # Using Python 3
    unicode = str
    basestring = (str, bytes)

PROFILE_FIELDS = (
    'forename',
    'mid_forename',
    'birth_surname',
    'current_surname',
    'suffix',
    'address1',
    'address2',
    'city',
    'state_province',
    'postal_code',
    'country',
    'sex',
    'gender',
    'national_id1',
    'id2',
    'mrn',
    'birth_year',
    'birth_month',
    'birth_day',
    'blood_type',
)
# Use a class and make these class variable so you can document these fields
# in Sphinx. Make sure that when looping through these variables you get them
# in the correct order that you write them. You might need to use a
# namedtuple class. Ideally, every field is its own variable in the class so
# you can add documentation for that individual variable.


META_FIELDS = (
    'person',  # Unique to this individual, which can be changed if match found
    'accession',  # Unique number in entire herd to identify this record
    'forename_freq_ref',  # Often phonemic compression, but not necessarily
    'mid_forename_freq_ref',  # Same as above
    'birth_surname_freq_ref',  # same as above
    'current_surname_freq_ref',  # Same as above
)


[docs]def compress(names, method):
    """Compresses surnames using different phonemic algorithms.

    Args:
        names (list): A list of names, typically surnames
        method (func): A function that performs phonemic compression

    Returns:
        A list of the compressions.
    """
    if not isinstance(names, list):
        ValueError("Expected a list of names, got a {0}.".format(type(names)))
    compressions = []
    raw_compressions = map(method, names)
    # Double metaphone returns a list of tuples, so need to unpack it
    for item in raw_compressions:
        if isinstance(item, (list, tuple)):
            compressions.extend([unicode(sub) for sub in item if sub != ''])
        elif item != '':
            compressions.append(unicode(item))
    return compressions if compressions else ['']


[docs]class Profile(namedtuple('Profile', PROFILE_FIELDS)):
    """A selection of patient-identifying information from a single electronic
    health record.

    All fields should be populated with an int or string and will be coerced
    to the proper type for that field automatically.

    .. py:attribute:: forename

        Also known as first name.

    .. py:attribute:: mid_forename

        Also known as middle name.

    .. py:attribute:: birth_surname

        Last name at birth, often same as mother's maiden name.

    .. py:attribute:: current_surname

        Current last name. Can differ from birth surname often in the case of
        marriage for females.

    .. py:attribute:: suffix

        Sr., Junior, II, etc.

    .. py:attribute:: address1

        Street address, such as "100 Main Street".

    .. py:attribute:: address2

        Apartment or unit information, such as "Apt. 201".

    .. py:attribute:: state_province

        State or province.

    .. py:attribute:: postal_code

    .. py:attribute:: country

        Consistent formatting should be used. Do not use USA in one Record
        and United States of America in another.

    .. py:attribute:: sex

        Physiological sex (M or F)

    .. py:attribute:: gender

        The gender the patient identifies with (M or F), e.g. in the case of
        transexualism.

    .. py:attribute:: national_id1

        For example, social security number. This should be the same type of
        number for all patients. Do not use USA social security in one
        Record and with Mexico passport number in another.

    .. py:attribute:: id2

        Can be used as an additional identifying ID number, such as driver's
        license number. Again, define the type of ID number this is for the
        entire sub-population.

    .. py:attribute:: mrn

        Medical record number.

    .. py:attribute:: birth_year

        In the format YYYY.

    .. py:attribute:: birth_month

        In the format MM.

    .. py:attribute:: birth_day

        In the format DD.

    .. py:attribute:: blood_type

        One of A, B, AB, or O with an optional +/- denoting RhD status.
    """
    __slots__ = ()  # Prevent per-instance dictionaries to reduce memory


class Meta(namedtuple('Meta', META_FIELDS)):
    __slots__ = ()


[docs]class Record(object):
    """A Record contains identifying information about a patient, as well as
    generated phonemic and meta information.
    """
    def __init__(self):
        self.profile = None
        self._meta = None
        self._blocks = None

    def __unicode__(self):
        if self.profile is None:
            return ''
        else:
            return str(self.profile._asdict())

    def __str__(self):
        return self.__unicode__()

[docs]    def save_name_freq_refs(self,
                            record_number,
                            forename_freq_method,
                            surname_freq_method):
        """Compress the forenames and surnames and save the compressions to
        the Record.

        Args:
            record_number (int): An integer to be assigned as initial person
                and accession number.
            forename_freq_method (func): A function that performs some sort of
                compression on a single name.
            surname_freq_method (func): A function that performs some sort of
            compression on a single name.
        """
        profile = self.profile
        compressions = {
            "forename":
                compress([profile.forename], forename_freq_method)[0],
            "mid_forename":
                compress([profile.mid_forename], forename_freq_method)[0],
            "current_surname":
                compress([profile.current_surname], surname_freq_method)[0],
            "birth_surname":
                compress([profile.birth_surname], surname_freq_method)[0]
        }
        meta = [
            record_number,  # Person number, can be changed if match found
            record_number,  # Accession number, unique to this record
            compressions['forename'],  # forename ref for dict
            compressions['mid_forename'],  # mid forename ref for dict
            compressions['birth_surname'],  # birth surname ref for dict
            compressions['current_surname']  # current surname ref for dict
        ]
        self._meta = Meta._make(meta)

[docs]    def gen_blocks(self, compression):
        """Generate and set the blocking codes for a given record.

        Blocking codes are comprised of the phonemic compressions of the
        profile surnames combined with the first letter of each forename.
        Generated blocking codes are stored in self._blocks, and only contain
        the unique set of blocking codes.

        Args:
            compression (func): A function that performs phonemic
                compression.
        """
        blocks = []
        profile = self.profile
        surnames = [profile.current_surname, profile.birth_surname]
        surnames = [surname for surname in surnames if surname != '']
        bases = compress(surnames, compression)
        # Bases are now [PJTR, PHTR] - base phonemic compressions of surnames
        forenames = [profile.forename, profile.mid_forename]
        forenames = [forename for forename in forenames if forename != '']
        # Append 1st letter of each forename to each surname compression
        for base in bases:
            for forename in forenames:
                block = base + forename[0]
                blocks.append(block.upper())
        self._blocks = tuple(set(blocks))


[docs]class Herd(object):
    """A collection of :py:class:`.Record` with methods for interacting with
    and linking records in the herd.

    Attributes:
        similarity_matrix (numpy.ndarray, None): A numpy array containing the
            similarities between :py:class:`.Record` instances, ordered by
            accession number on both axes. Each entry is between 0 and 1 with 1
            being perfect similarity.
    """
    def __init__(self):
        self._population = None
        self._block_dict = defaultdict(list)
        self._surname_freq_dict = Counter()
        self._forename_freq_dict = Counter()
        self.similarity_matrix = None

    def __unicode__(self):
        population = self._population
        if population is None:
            return str(())
        elif len(population) >= 4:
            return "({0},\n {1}\n ...,\n {2},\n {3})".format(
                population[0],
                population[1],
                population[-2],
                population[-1]
            )
        else:
            return str(population)

    def __str__(self):
        return self.__unicode__()

    @property
    def size(self):
        """Returns the size of the Herd's population."""
        population = self._population
        if population is None:
            return 0
        else:
            return len(population)

[docs]    def populate(self, records):
        """Sets the Herd's sub-population.

        Args:
            records (list, tuple): A list or tuple containing multiple
                :py:class:`.Record`
        """
        if self._population is not None:
            raise AttributeError("The herd is already populated.")
        if not isinstance(records, (tuple, list)):
            raise ValueError("Expected a tuple or list.")
        if isinstance(records, list):
            records = tuple(records)
        self._population = records

[docs]    def corral(self,
               forename_freq_method=first_letter,
               surname_freq_method=dmetaphone,
               blocking_compression=dmetaphone):
        """Perform record matching on all Records in the Herd.

        Args:
            forename_freq_method (func): A function that performs some sort of
                compression. Compression of forename can be different than
                compression of surname. The compression information is used to
                determine weights for certain matching scenarios. For example,
                if forename is compressed to be just the first initial, matching
                a name that begins with the letter 'F' will result in a weight
                equal to the fraction of names that begin with the letter 'F' in
                the entire Herd. The less common names that begin with 'F' are,
                the more significant a match between two same or similar
                forenames that begin with 'F' will be. Defaults to the first
                initial of the forename.
            surname_freq_method (func): A function that performs some sort of
                compression. Defaults to double metaphone.
            blocking_compression (func): Compression method to use when
                blocking. Blocks are created by compressing the surname and then
                appending the first initial of the forename. Defaults to double
                metaphone and then uses the primary compression from that
                compression. By default the first initial of the forenames are
                appended to the surname compressions to generate block codes.
        """
        pop_length = len(self._population)
        self.similarity_matrix = np.zeros((pop_length, pop_length),
                                           dtype=np.float32)
        for i, record in enumerate(self._population):
            try:
                record.gen_blocks(blocking_compression)  # Explode the record
                # Keep count of each fore/surname compression for weighting
            except TypeError:
                exc_type, trace = sys.exc_info()[:2]
                raise TypeError("{0}\nYou must populate the Herd "
                                "first.".format(trace))
            finally:
                # Clear per https://docs.python.org/2/library/sys.html#sys.exc_info
                sys.exc_info()
            record.save_name_freq_refs(i, forename_freq_method,
                                       surname_freq_method)
            self.append_names_freq_counters(record)
            # Keep track of the Record's blocking codes in the Herd
            self.append_block_dict(record)
        for record in self._population:
            self.append_similarity_matrix_row(record)

[docs]    def append_block_dict(self, record):
        """Appends the herd's block dictionary with the given Record's
        blocking codes.

        The dictionary keys are block codes. The value of each key is a list
        of references to Records that have that block.

        Args:
            record (:py:class:`.Record`): An object of class
                :py:class:`.Record`
        """
        for block in record._blocks:
            self._block_dict[block].append(record)

[docs]    def append_names_freq_counters(self, record):
        """Adds the forename and surname for the given Record to the forename
        and surname counters.

        Args:
            record (:py:class:`.Record`): An object of class
                :py:class:`.Record`
        """
        meta = record._meta
        forenames = [
            meta.forename_freq_ref,
            meta.mid_forename_freq_ref,
        ]
        forenames = [forename for forename in forenames if forename != '']
        surnames = [
            meta.birth_surname_freq_ref,
            meta.current_surname_freq_ref
        ]
        surnames = [surname for surname in surnames if surname != '']
        self._forename_freq_dict.update(forenames)
        self._surname_freq_dict.update(surnames)

    def append_similarity_matrix_row(self, comparison_record):
        row = comparison_record._meta.accession
        for block in comparison_record._blocks:
            for record in self._block_dict[block]:
                col = record._meta.accession
                self.similarity_matrix[row][col] = \
                    record_similarity(self,
                                      comparison_record,
                                      record,
                                      damerau_levenshtein,
                                      damerau_levenshtein)


[docs]def gen_record(data):
    """Generate a :py:class:`.Record` which can be used to populate a
    :py:class:`Herd`.

    In addition to extracting the profile information for

    Args:
        data (dict): A dictionary containing at least one of fields in
            :py:data:`PROFILE_FIELDS`.

    Returns:
        A object of class :py:class:`.Record`.
    """
    fields = [data.get(field, '') for field in PROFILE_FIELDS]
    profile = Profile._make(fields)
    if len(profile.forename) < 1 or len(profile.current_surname) < 1:
        raise ValueError("A forename and current surname must be supplied.")
    record = Record()
    record.profile = profile
    return record