Source code for ehrcorral.measures

# -*- coding: utf-8 -*-
"""Contains functions for measures of similarity between records.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import json
import pkgutil
import string

from pylev import damerau_levenshtein


[docs]def record_similarity(herd,
                      first_record,
                      second_record,
                      forename_method=damerau_levenshtein,
                      surname_method=damerau_levenshtein):
    """Determine weights for the likelihood of two records being the same.

    Args:
        herd (Herd): An object of :py:class:`.Herd` which contains the two
            records being compared.
        first_record (Record): An object of :py:class:`.Record` to be
            compared to the other one.
        second_record (Record): An object of :py:class:`.Record` to be
            compared to the other one.
        forename_method (func): A function that performs some sort of
            comparison between strings.
        surname_method (func): A function that performs some sort of
            comparison between strings.

    Returns:
        A tuple of the sum of name weights and the sum of non-name weights.
    """
    forename_similarity, fore_max = \
        get_forename_similarity(herd,
                                [first_record, second_record],
                                forename_method,
                                "fore")
    mid_forename_similarity, mid_fore_max = \
        get_forename_similarity(herd,
                                [first_record, second_record],
                                forename_method,
                                "mid_fore")
    birth_surname_similarity, bir_sur_max = \
        get_surname_similarity(herd,
                               [first_record, second_record],
                               surname_method,
                               "birth")
    current_surname_similarity, cur_sur_max = \
        get_surname_similarity(herd,
                               [first_record, second_record],
                               surname_method,
                               "current")
    # no place of birth field for similarity
    address_similarity = get_address_similarity([first_record, second_record],
                                                damerau_levenshtein)
    post_code_similarity = get_post_code_similarity([first_record,
                                                     second_record],
                                                    damerau_levenshtein)
    sex_similarity = get_sex_similarity([first_record, second_record])
    dob_similarity = get_dob_similarity([first_record, second_record])
    id_similarity = get_id_similarity([first_record, second_record],
                                      damerau_levenshtein)
    # did not include GP (doctor), place of birth, hospital and hospital number
    name_sum = forename_similarity + mid_forename_similarity + \
        birth_surname_similarity + current_surname_similarity
    # since we are not using a few of the ox-link weights, the non-name
    # numbers will be different
    non_name_sum = address_similarity + post_code_similarity + sex_similarity +\
        dob_similarity + id_similarity
    # sum of max weights for all fields
    max_similarity = fore_max + mid_fore_max + bir_sur_max + cur_sur_max + 33.0
    return (name_sum + non_name_sum) / max_similarity


[docs]def get_forename_similarity(herd, records, method, name_type):
    """Determine weights for the likelihood of two forenames being the same.

    Args:
        herd (Herd): An object of :py:class:`.Herd` which contains the two
            records being compared.
        records (List[Record]): A list of two objects of :py:class:`.Record`
            to be compared to one another.
        method (func): A function to be used to compare the forenames.
        name_type (unicode): A unicode string to indicate which forename is
            being compared.

    Returns:
        The forename weight for the similarity of the forenames.
    """
    name_types = ["fore", "mid_fore"]
    first_forename, first_freq = \
        extract_forename_similarity_info(herd, records[0], name_type)
    # Get both names and frequencies from second record to compare to first
    second_forefreq = [
        extract_forename_similarity_info(herd, records[1], name)
        for name in name_types
        ]
    second_forename = [item[0] for item in second_forefreq]
    second_freq = [item[1] for item in second_forefreq]
    # if there is no forename for our first record, we either dismiss the
    # similarity if one of the second record forenames is empty, or return a
    # zero match.
    if first_forename == '':
        if second_forename[0] == '' or second_forename[1] == '':
            return 0, 0
        else:
            return 0, 6
    # Get difference between first record name and both second record names,
    # then find the one that has the minimum difference and keep that one
    diffs = [method(first_forename, name) for name in second_forename]
    difference = min(diffs)
    min_index = diffs.index(difference)
    second_forename = second_forename[min_index]
    second_freq = second_freq[min_index]
    max_length = max(len(first_forename), len(second_forename))
    prop_diff = float(difference) / max_length
    prop_freq = max(first_freq, second_freq, 1.0 / 1000)
    # scale instead of using cutoff
    cutoff = 5.0 / 26  # arbitrary, could be improved
    F = 3 if prop_freq > cutoff else 12
    # map prop_diff from (0, 1) to (-2, 2), then flip sign since lower diff
    # implies that the two name are more similar.
    weight = -(4 * prop_diff - 2)
    return weight * F, 2 * F


[docs]def extract_forename_similarity_info(herd, record, name_type):
    """Extract desired forename and associated frequency weight.

    Args:
        herd (Herd): An object of :py:class:`.Herd` which contains the
            frequency dictionary used for the frequency weight.
        record (Record): An object of :py:class:`.Record` from which to
            extract the forename.
        name_type (unicode): A unicode string to indicate which forename is
            being extracted.

    Returns:
        The forename and associated frequency weight for requested name.
    """
    profile = record.profile
    # Add try/except
    if name_type == "fore":
        forename = profile.forename.lower()
        weight = herd._forename_freq_dict[record._meta.forename_freq_ref] / \
            float(sum(herd._forename_freq_dict.values()))
    elif name_type == "mid_fore":
        forename = profile.mid_forename.lower()
        weight = herd._forename_freq_dict[record._meta.mid_forename_freq_ref]\
            / float(sum(herd._forename_freq_dict.values()))
    return forename, weight


[docs]def get_surname_similarity(herd, records, method, name_type):
    """Determine weights for the likelihood of two surnames being the same.

    Args:
        herd (Herd): An object of :py:class:`.Herd` which contains the two
            records being compared.
        records (List[Record]): A list of two objects of :py:class:`.Record`
            to be compared to one another.
        method (func): A function to be used to compare the surnames.
        name_type (unicode): A unicode string to indicate which surname is
            being compared.

    Returns:
        The surname weight for the similarity of the surnames.
    """
    name_types = ["birth", "current"]
    first_surname, first_freq = \
        extract_surname_similarity_info(herd, records[0], name_type)
    # Get both names and frequencies from second record to compare to first
    second_forefreq = [
        extract_surname_similarity_info(herd, records[1], name)
        for name in name_types
        ]
    second_surname = [item[0] for item in second_forefreq]
    second_freq = [item[1] for item in second_forefreq]
    # if there is no surname for our first record, we either dismiss the
    # similarity if one of the second record surnames is empty, or return a
    # zero match.
    if first_surname == '':
        if second_surname[0] == '' or second_surname[1] == '':
            return 0, 0
        else:
            return 0, 12
    # Get difference between first record name and both second record names,
    # then find the one that has the minimum difference and keep that one
    diffs = [method(first_surname, name) for name in second_surname]
    difference = min(diffs)
    min_index = diffs.index(difference)
    second_surname = second_surname[min_index]
    second_freq = second_freq[min_index]
    max_length = max(len(first_surname), len(second_surname))
    prop_diff = float(difference) / max_length
    prop_freq = max(first_freq, second_freq, 1.0 / 1000)
    cutoff = 1.0 / 500  # arbitrary, could be improved
    S = 6 if prop_freq > cutoff else 17
    # map prop_diff from (0, 1) to (-2, 2), then flip sign since lower diff
    # implies that the two name are more similar.
    weight = -(4 * prop_diff - 2)
    return weight * S, 2 * S


[docs]def extract_surname_similarity_info(herd, record, name_type):
    """Extract desired surname and associated frequency weight.

    Args:
        herd (Herd): An object of :py:class:`.Herd` which contains the
            frequency dictionary used for the frequency weight.
        record (Record): An object of :py:class:`.Record` from which to
            extract the surname.
        name_type (unicode): A unicode string to indicate which surname is
            being extracted.

    Returns:
        The forename and associated frequency weight for requested name.
    """
    profile = record.profile
    # Add try/except
    if name_type == "birth":
        surname = profile.birth_surname.lower()
        weight = herd._surname_freq_dict[record._meta.birth_surname_freq_ref]\
            / float(sum(herd._surname_freq_dict.values()))
    elif name_type == "current":
        surname = profile.current_surname.lower()
        weight = herd._surname_freq_dict[record._meta.current_surname_freq_ref]\
            / float(sum(herd._surname_freq_dict.values()))
    return surname, weight


[docs]def get_address_similarity(records, method=damerau_levenshtein):
    """Determine weights for the likelihood of two addresses being the same.

    Args:
        records (List[Record]): A list of two objects of :py:class:`.Record`
            to be compared to one another.
        method (func): A function to be used to compare the addresses.

    Returns:
        The address weight for the similarity of the addresses.
    """
    # ox-link only takes first 8 characters
    first_profile = records[0].profile
    second_profile = records[1].profile
    first_address = first_profile.address1.lower() +\
        ' ' +\
        first_profile.address2.lower()
    second_address = second_profile.address1.lower() +\
        ' ' +\
        second_profile.address2.lower()
    first_address = clean_address(first_address)
    second_address = clean_address(second_address)
    difference = method(first_address[:12], second_address[:12])
    if difference == 0:
        return 7
    elif difference <= 2:
        return 2
    else:
        return 0
    # ox-link method
    # return 7 if diff1 == 0 else 0


[docs]def clean_address(address):
    """Clean unicode string that contains an address of all punctuation and
    standardize all street suffixes and unit designators.

    Args:
        address (unicode): A unicode string that contains an address to be
            cleaned and standardized.

    Returns:
        The cleaned unicode address string.
    """
    new_address = ' ' + address + ' '
    generic_abbrevs = get_json('generic_abbrevs.json')
    generics = get_json('generics.json')
    unit_abbrevs = get_json('unit_abbrevs.json')
    designators = get_json('designators.json')
    for char in string.punctuation:
        new_address = new_address.replace(char, ' ')
    for i, generic in enumerate(generics):
        for g in generic:
            old = ' ' + g + ' '
            new = ' ' + generic_abbrevs[i] + ' '
            new_address = new_address.replace(old, new)
    for i, designator in enumerate(designators):
        old = ' ' + designator + ' '
        new = ' ' + unit_abbrevs[i] + ' '
        new_address = new_address.replace(old, new)
    return ' '.join(new_address.split())


[docs]def get_post_code_similarity(records, method=damerau_levenshtein):
    """Determine weights for the likelihood of two postal codes being the same.

    Args:
        records (List[Record]): A list of two objects of :py:class:`.Record`
            to be compared to one another.
        method (func): A function to be used to compare the postal codes.

    Returns:
        The postal code weight for the similarity of the postal codes.
    """
    first_profile = records[0].profile
    second_profile = records[1].profile
    first_post_code = str(first_profile.postal_code)  # must be a string
    second_post_code = str(second_profile.postal_code)  # must be a string
    difference = method(first_post_code, second_post_code)
    if difference == 0:
        return 4
    elif difference == 1:  # for transposition, ox-link does not do this
        return 1
    else:
        return 0
    # ox-link method
    # return 4 if difference == 0 else 0


[docs]def get_sex_similarity(records):
    """Determine weights for the likelihood of two sexes being the same.

    Args:
        records (List[Record]): A list of two objects of :py:class:`.Record`
            to be compared to one another.

    Returns:
        The sex weight for the similarity of the sexes.
    """
    # consider how better to account for sexes besides male and female
    first_profile = records[0].profile
    second_profile = records[1].profile
    # just take first letter so that male = m
    # TODO: Consider robust way to consider non-binary sexes
    first_sex = str(first_profile.sex.lower())  # should be a string
    second_sex = str(second_profile.sex.lower())  # should be a string
    return 1 if first_sex == second_sex else -10


[docs]def get_dob_similarity(records, method=damerau_levenshtein):
    """Determine weights for the likelihood of two dates of birth being the
    same.

    Args:
        records (List[Record]): A list of two objects of :py:class:`.Record`
            to be compared to one another.
        method (func): A function to be used to compare the dates of birth.

    Returns:
        The date of birth weight for the similarity of the dates of birth.
    """
    first_profile = records[0].profile
    second_profile = records[1].profile
    first_dob = str(first_profile.birth_year), \
        str(first_profile.birth_month), \
        str(first_profile.birth_day)
    second_dob = str(second_profile.birth_year), \
        str(second_profile.birth_month), \
        str(second_profile.birth_day)
    # just return 0 if either dob is empty
    if first_dob[0] == first_dob[1] == first_dob[2] == '' or \
        second_dob[0] == second_dob[1] == second_dob[2]:
        return 0
    # TODO: penalize for year diffs like 1983 to 1975
    year_diff = method(first_dob[0], second_dob[0])
    month_diff = method(first_dob[1], second_dob[1])
    day_diff = method(first_dob[2], second_dob[2])
    # could add more complexity here based off of ox-link
    year_prop = 0.5  # slightly arbitrary choice because year means more
    month_prop = 0.25
    day_prop = 0.25
    prop_diff = year_prop * (year_diff / 4.0) + \
        month_prop * (month_diff / 2.0) + \
        day_prop * (day_diff / 2.0)
    # map prop_diff from (0, 1) to (-23, 14), then flip sign since lower diff
    # implies that the two name are more similar.
    return -(37 * prop_diff - 14)


[docs]def get_id_similarity(records, method=damerau_levenshtein):
    """Determine weights for the likelihood of two national IDs being the same.

    Args:
        records (List[Record]): A list of two objects of :py:class:`.Record`
            to be compared to one another.
        method (func): A function to be used to compare the national IDs.

    Returns:
        The national ID weight for the similarity of the two national IDs.
    """
    first_profile = records[0].profile
    second_profile = records[1].profile
    first_id = str(first_profile.national_id1.lower())  # must be a string
    second_id = str(second_profile.national_id1.lower())  # must be a string
    difference = method(first_id, second_id)
    if difference == 0:
        return 7
    elif difference == 1:  # for transposition, ox-link does not do this
        return 2
    else:
        return 0
    # ox-link method
    # return 7 if difference == 0 else 0


def get_json(file_name):
    data = pkgutil.get_data('ehrcorral', file_name)
    return json.loads(data.decode())