Source code for ehrcorral.ehrcorral

# -*- coding: utf-8 -*-
"""Contains core classes and functions for defining populations and acting upon
them.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import sys
from collections import namedtuple, defaultdict

import numpy as np
from pylev import damerau_levenshtein

try:
    from collections import Counter
except ImportError:
    from backport_collections import Counter
from .compressions import first_letter, dmetaphone
from .measures import record_similarity

# Make the name ``unicode`` usable on both Python 2 and Python 3.
# On Python 2 the builtin exists and is simply rebound at module level.
try:
    unicode = unicode
except NameError:
    # Python 3: the builtin ``unicode`` does not exist, so alias it to ``str``
    # and provide a ``basestring`` stand-in for isinstance() checks.
    unicode = str
    basestring = (str, bytes)

# Ordered names of the patient-identifying fields. This tuple defines the
# fields (and their order) of the Profile namedtuple and is the list of keys
# that gen_record() looks up in its input dictionary.
PROFILE_FIELDS = (
    'forename',
    'mid_forename',
    'birth_surname',
    'current_surname',
    'suffix',
    'address1',
    'address2',
    'city',
    'state_province',
    'postal_code',
    'country',
    'sex',
    'gender',
    'national_id1',
    'id2',
    'mrn',
    'birth_year',
    'birth_month',
    'birth_day',
    'blood_type',
)
# TODO: Use a class and make these class variables so that each field can be
# documented in Sphinx. Make sure that looping through the variables yields
# them in the same order in which they are written; a namedtuple class may be
# needed. Ideally, every field is its own variable in the class so that
# documentation can be added for that individual variable.


# Ordered names of the bookkeeping fields stored alongside a profile. This
# tuple defines the fields (and their order) of the Meta namedtuple.
META_FIELDS = (
    'person',  # Unique to this individual, which can be changed if match found
    'accession',  # Unique number in entire herd to identify this record
    'forename_freq_ref',  # Often phonemic compression, but not necessarily
    'mid_forename_freq_ref',  # Same as above
    'birth_surname_freq_ref',  # Same as above
    'current_surname_freq_ref',  # Same as above
)


def compress(names, method):
    """Compresses surnames using different phonemic algorithms.

    Args:
        names (list, tuple): A list (or tuple) of names, typically surnames.
        method (func): A function that performs phonemic compression on a
            single name. It may return a single value or a list/tuple of
            values (e.g. double metaphone returns a pair of codes).

    Returns:
        A flat list of the non-empty compressions as unicode strings, in the
        order they were produced. If no non-empty compression was produced the
        list ``['']`` is returned, so the result is never empty.

    Raises:
        ValueError: If ``names`` is not a list or tuple. This guards against
            passing a bare string, which would otherwise be compressed one
            character at a time.
    """
    if not isinstance(names, (list, tuple)):
        # The exception used to be constructed without being raised, which
        # made this check a silent no-op.
        raise ValueError(
            "Expected a list of names, got a {0}.".format(type(names)))
    compressions = []
    for item in map(method, names):
        # Double metaphone returns a tuple of codes, so it must be unpacked
        if isinstance(item, (list, tuple)):
            compressions.extend(unicode(sub) for sub in item if sub != '')
        elif item != '':
            compressions.append(unicode(item))
    return compressions if compressions else ['']
class Profile(namedtuple('Profile', PROFILE_FIELDS)):
    """A selection of patient-identifying information from a single electronic
    health record.

    All fields should be populated with an int or string.

    NOTE(review): earlier documentation stated that values are coerced to the
    proper type automatically, but no coercion is performed by this class or
    by :py:func:`gen_record` -- confirm where (if anywhere) it happens.

    .. py:attribute:: forename

        Also known as first name.

    .. py:attribute:: mid_forename

        Also known as middle name.

    .. py:attribute:: birth_surname

        Last name at birth, often same as mother's maiden name.

    .. py:attribute:: current_surname

        Current last name. Can differ from birth surname often in the case of
        marriage for females.

    .. py:attribute:: suffix

        Sr., Junior, II, etc.

    .. py:attribute:: address1

        Street address, such as "100 Main Street".

    .. py:attribute:: address2

        Apartment or unit information, such as "Apt. 201".

    .. py:attribute:: city

        City of the address.

    .. py:attribute:: state_province

        State or province.

    .. py:attribute:: postal_code

    .. py:attribute:: country

        Consistent formatting should be used. Do not use USA in one Record and
        United States of America in another.

    .. py:attribute:: sex

        Physiological sex (M or F).

    .. py:attribute:: gender

        The gender the patient identifies with (M or F), which may differ
        from the physiological sex.

    .. py:attribute:: national_id1

        For example, social security number. This should be the same type of
        number for all patients. Do not use USA social security in one Record
        and a Mexico passport number in another.

    .. py:attribute:: id2

        Can be used as an additional identifying ID number, such as driver's
        license number. Again, define the type of ID number this is for the
        entire sub-population.

    .. py:attribute:: mrn

        Medical record number.

    .. py:attribute:: birth_year

        In the format YYYY.

    .. py:attribute:: birth_month

        In the format MM.

    .. py:attribute:: birth_day

        In the format DD.

    .. py:attribute:: blood_type

        One of A, B, AB, or O with an optional +/- denoting RhD status.
    """
    __slots__ = ()  # Prevent per-instance dictionaries to reduce memory
class Meta(namedtuple('Meta', META_FIELDS)):
    """Bookkeeping information attached to a single record.

    The fields are those listed in ``META_FIELDS``: the person number, the
    accession number, and one frequency reference for each of the forename,
    middle forename, birth surname and current surname.
    """
    __slots__ = ()  # No per-instance dictionary, same as Profile
class Record(object):
    """Identifying information about one patient plus derived data.

    A Record holds the patient's profile, the meta information produced by
    :py:meth:`save_name_freq_refs`, and the blocking codes produced by
    :py:meth:`gen_blocks`. All three start out as ``None``.
    """

    def __init__(self):
        self.profile = None
        self._meta = None
        self._blocks = None

    def __unicode__(self):
        # An unset profile renders as the empty string; otherwise show the
        # profile as a field -> value mapping.
        return '' if self.profile is None else str(self.profile._asdict())

    def __str__(self):
        return self.__unicode__()

    def save_name_freq_refs(self,
                            record_number,
                            forename_freq_method,
                            surname_freq_method):
        """Compress the profile's names and store the results as meta data.

        Args:
            record_number (int): Number used as both the initial person
                number and the accession number of this Record.
            forename_freq_method (func): Compression applied to the forename
                and middle forename; takes a single name.
            surname_freq_method (func): Compression applied to the current
                and birth surnames; takes a single name.
        """
        person = self.profile

        def first_code(name, method):
            # Only the first compression of a name is kept as its reference
            return compress([name], method)[0]

        forename_ref = first_code(person.forename, forename_freq_method)
        mid_forename_ref = first_code(person.mid_forename,
                                      forename_freq_method)
        current_surname_ref = first_code(person.current_surname,
                                         surname_freq_method)
        birth_surname_ref = first_code(person.birth_surname,
                                       surname_freq_method)
        self._meta = Meta(
            person=record_number,  # Can be changed if a match is found
            accession=record_number,  # Unique to this record
            forename_freq_ref=forename_ref,
            mid_forename_freq_ref=mid_forename_ref,
            birth_surname_freq_ref=birth_surname_ref,
            current_surname_freq_ref=current_surname_ref,
        )

    def gen_blocks(self, compression):
        """Generate and store the blocking codes of this record.

        A blocking code is a phonemic compression of one of the profile's
        surnames followed by the first letter of one of its forenames, in
        upper case. The unique codes are stored as a tuple in
        ``self._blocks``.

        Args:
            compression (func): A function that performs phonemic
                compression.
        """
        person = self.profile
        surnames = [name
                    for name in (person.current_surname, person.birth_surname)
                    if name != '']
        # e.g. [PJTR, PHTR] - base phonemic compressions of the surnames
        bases = compress(surnames, compression)
        initials = [name[0]
                    for name in (person.forename, person.mid_forename)
                    if name != '']
        # Pair every surname compression with every forename initial
        codes = [(base + initial).upper()
                 for base in bases
                 for initial in initials]
        self._blocks = tuple(set(codes))
class Herd(object):
    """A collection of :py:class:`.Record` with methods for interacting with
    and linking records in the herd.

    Attributes:
        similarity_matrix (numpy.ndarray, None): A numpy array containing the
            similarities between :py:class:`.Record` instances, ordered by
            accession number on both axes. Each entry is between 0 and 1 with
            1 being perfect similarity. ``None`` until :py:meth:`corral` has
            been called.
    """

    def __init__(self):
        self._population = None
        # Maps a blocking code to the list of Records that carry that code
        self._block_dict = defaultdict(list)
        self._surname_freq_dict = Counter()
        self._forename_freq_dict = Counter()
        self.similarity_matrix = None

    def __unicode__(self):
        population = self._population
        if population is None:
            return str(())
        elif len(population) >= 4:
            # Abbreviate long populations to the first two and last two
            return "({0},\n {1}\n ...,\n {2},\n {3})".format(
                population[0],
                population[1],
                population[-2],
                population[-1]
            )
        else:
            return str(population)

    def __str__(self):
        return self.__unicode__()

    @property
    def size(self):
        """Returns the size of the Herd's population."""
        population = self._population
        if population is None:
            return 0
        return len(population)

    def populate(self, records):
        """Sets the Herd's sub-population.

        Args:
            records (list, tuple): A list or tuple containing multiple
                :py:class:`.Record`

        Raises:
            AttributeError: If the Herd has already been populated.
            ValueError: If ``records`` is neither a tuple nor a list.
        """
        if self._population is not None:
            raise AttributeError("The herd is already populated.")
        if not isinstance(records, (tuple, list)):
            raise ValueError("Expected a tuple or list.")
        # The population is always stored as a tuple
        self._population = tuple(records)

    def corral(self,
               forename_freq_method=first_letter,
               surname_freq_method=dmetaphone,
               blocking_compression=dmetaphone):
        """Perform record matching on all Records in the Herd.

        Args:
            forename_freq_method (func): A function that performs some sort of
                compression. Compression of forename can be different than
                compression of surname. The compression information is used to
                determine weights for certain matching scenarios. For example,
                if forename is compressed to be just the first initial,
                matching a name that begins with the letter 'F' will result in
                a weight equal to the fraction of names that begin with the
                letter 'F' in the entire Herd. The less common names that
                begin with 'F' are, the more significant a match between two
                same or similar forenames that begin with 'F' will be.
                Defaults to the first initial of the forename.
            surname_freq_method (func): A function that performs some sort of
                compression. Defaults to double metaphone.
            blocking_compression (func): Compression method to use when
                blocking. Blocks are created by compressing the surname and
                then appending the first initial of the forename. Defaults to
                double metaphone.

        Raises:
            TypeError: If the Herd has not been populated yet.
        """
        population = self._population
        if population is None:
            # Checked explicitly and up front: previously len(None) failed
            # before the intended message could ever be produced.
            raise TypeError("You must populate the Herd first.")
        pop_length = len(population)
        self.similarity_matrix = np.zeros((pop_length, pop_length),
                                          dtype=np.float32)
        for i, record in enumerate(population):
            record.gen_blocks(blocking_compression)  # Explode the record
            # Keep count of each fore/surname compression for weighting
            record.save_name_freq_refs(i, forename_freq_method,
                                       surname_freq_method)
            self.append_names_freq_counters(record)
            # Keep track of the Record's blocking codes in the Herd
            self.append_block_dict(record)
        # Every record must be blocked before any similarity row is filled
        for record in population:
            self.append_similarity_matrix_row(record)

    def append_block_dict(self, record):
        """Appends the herd's block dictionary with the given Record's
        blocking codes.

        The dictionary keys are block codes. The value of each key is a list
        of references to Records that have that block.

        Args:
            record (:py:class:`.Record`): An object of class
                :py:class:`.Record`
        """
        for block in record._blocks:
            self._block_dict[block].append(record)

    def append_names_freq_counters(self, record):
        """Adds the forename and surname for the given Record to the forename
        and surname counters.

        Empty frequency references are not counted.

        Args:
            record (:py:class:`.Record`): An object of class
                :py:class:`.Record`
        """
        meta = record._meta
        forenames = [
            meta.forename_freq_ref,
            meta.mid_forename_freq_ref,
        ]
        surnames = [
            meta.birth_surname_freq_ref,
            meta.current_surname_freq_ref,
        ]
        self._forename_freq_dict.update(
            forename for forename in forenames if forename != '')
        self._surname_freq_dict.update(
            surname for surname in surnames if surname != '')

    def append_similarity_matrix_row(self, comparison_record):
        """Fills in the similarity matrix row of the given Record.

        The Record is compared against every Record that shares at least one
        blocking code with it; all other entries of the row are left
        untouched.

        Args:
            comparison_record (:py:class:`.Record`): The Record whose row, as
                given by its accession number, is filled in.
        """
        row = comparison_record._meta.accession
        for block in comparison_record._blocks:
            for record in self._block_dict[block]:
                col = record._meta.accession
                self.similarity_matrix[row, col] = record_similarity(
                    self,
                    comparison_record,
                    record,
                    damerau_levenshtein,
                    damerau_levenshtein
                )
def gen_record(data):
    """Generate a :py:class:`.Record` which can be used to populate a
    :py:class:`Herd`.

    Every field named in :py:data:`PROFILE_FIELDS` is read from ``data``;
    fields missing from ``data`` are set to the empty string.

    Args:
        data (dict): A dictionary containing at least one of fields in
            :py:data:`PROFILE_FIELDS`.

    Returns:
        A object of class :py:class:`.Record`.

    Raises:
        ValueError: If the forename or the current surname is empty.
    """
    profile = Profile._make(data.get(name, '') for name in PROFILE_FIELDS)
    has_required_names = (len(profile.forename) >= 1 and
                          len(profile.current_surname) >= 1)
    if not has_required_names:
        raise ValueError("A forename and current surname must be supplied.")
    new_record = Record()
    new_record.profile = profile
    return new_record