diff --git a/sources/curriculum/2_data_exploration_and_analysis/record-linkage/RecordLinkage.ipynb b/sources/curriculum/2_data_exploration_and_analysis/record-linkage/RecordLinkage.ipynb index da5b6a18..b4ac45f3 100644 --- a/sources/curriculum/2_data_exploration_and_analysis/record-linkage/RecordLinkage.ipynb +++ b/sources/curriculum/2_data_exploration_and_analysis/record-linkage/RecordLinkage.ipynb @@ -12,7 +12,7 @@ "\n", "## Introduction\n", "\n", - "When you combine information about from multiple sources, you have to determine whether two individuals in two separate datasets are the same. You also might have multiple individuals with the same name in one dataset and need to decide whether to treat them as the same person or not. This has important implications for your analysis. Record linkage also goes by the terms data matching, merge/purge, duplication detection, de-duping, reference matching, co-reference/anaphora in various fields. \n", + "When you combine information from multiple sources, you have to determine whether two individuals in two separate datasets are the same. You also might have multiple individuals with the same name in one dataset and need to decide whether to treat them as the same person or not. This has important implications for your analysis. Record linkage also goes by the terms data matching, merge/purge, duplication detection, de-duping, reference matching, co-reference/anaphora in various fields. 
\n", "\n", "There are several approaches to record linkage that include **exact matching** (for example, joining records based on social security number), **rule-based linking** (applying a hierarchical set of rules that reflect domain knowledge; for example, if two people have the same first and last name and the same birthday they are considered the same); and **probabilistic linking**, or estimating the likelihood that two entities are the same and then deciding on a threshold above which two individuals will be considered to be the same. \n", "\n", @@ -43,7 +43,12 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T21:32:20.723952Z", + "start_time": "2019-06-30T21:32:19.173665Z" + } + }, "outputs": [], "source": [ "%pylab inline\n", @@ -74,7 +79,12 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T21:32:21.906272Z", + "start_time": "2019-06-30T21:32:21.242534Z" + } + }, "outputs": [], "source": [ "# Read in NSF awards data\n", @@ -84,7 +94,12 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T21:32:22.079207Z", + "start_time": "2019-06-30T21:32:21.990349Z" + } + }, "outputs": [], "source": [ "# Take a first look at the data\n", @@ -101,7 +116,12 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T21:32:25.168908Z", + "start_time": "2019-06-30T21:32:23.677475Z" + } + }, "outputs": [], "source": [ "# Read in UC data\n", @@ -111,7 +131,12 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T21:32:25.221344Z", + "start_time": "2019-06-30T21:32:25.173120Z" + } + }, "outputs": [], "source": [ "# Look at what the UC data contains\n", @@ -143,7 +168,12 @@ { "cell_type": "code", 
"execution_count": null, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T21:32:28.641015Z", + "start_time": "2019-06-30T21:32:28.628761Z" + } + }, "outputs": [], "source": [ "# Get all unique entries in the 'year' column\n", @@ -160,7 +190,12 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T21:32:29.743462Z", + "start_time": "2019-06-30T21:32:29.690222Z" + } + }, "outputs": [], "source": [ "# Get all unique entries in the 'campus' column\n", @@ -177,7 +212,12 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T21:32:58.356538Z", + "start_time": "2019-06-30T21:32:57.678888Z" + } + }, "outputs": [], "source": [ "# Look at number of entries by campus in the dataset\n", @@ -187,7 +227,12 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:28:59.522668Z", + "start_time": "2019-06-30T20:28:59.452969Z" + } + }, "outputs": [], "source": [ "# Get all unique titles\n", @@ -197,7 +242,12 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:29:00.564528Z", + "start_time": "2019-06-30T20:29:00.485584Z" + } + }, "outputs": [], "source": [ "# Find out how many unique titles are present in the data\n", @@ -214,7 +264,12 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:29:03.109872Z", + "start_time": "2019-06-30T20:29:03.099584Z" + } + }, "outputs": [], "source": [ "# Get number of rows and columns of UC dataset\n", @@ -233,7 +288,12 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:29:05.748041Z", + "start_time": "2019-06-30T20:29:05.626151Z" + } + }, "outputs": [], 
"source": [ "# Use a mask to keep only entries that do NOT have stars instead of a name\n", @@ -253,7 +313,12 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:29:08.142006Z", + "start_time": "2019-06-30T20:29:08.100261Z" + } + }, "outputs": [], "source": [ "# Save df_ucpay with only named entries\n", @@ -263,7 +328,12 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:29:08.923390Z", + "start_time": "2019-06-30T20:29:08.825038Z" + } + }, "outputs": [], "source": [ "# Look at the first 15 entries in the updated dataset with redacted names removed\n", @@ -282,7 +352,12 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:29:12.209908Z", + "start_time": "2019-06-30T20:29:12.183091Z" + } + }, "outputs": [], "source": [ "# List of columns to keep \n", @@ -295,7 +370,12 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:29:12.859866Z", + "start_time": "2019-06-30T20:29:12.818769Z" + } + }, "outputs": [], "source": [ "# Look at the updated dataframe\n", @@ -312,7 +392,12 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:29:15.702750Z", + "start_time": "2019-06-30T20:29:15.659591Z" + } + }, "outputs": [], "source": [ "state_mask = df_nsf_awards['StateCode'] == 'CA'\n", @@ -322,7 +407,12 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:29:16.304034Z", + "start_time": "2019-06-30T20:29:16.236810Z" + } + }, "outputs": [], "source": [ "df_nsf_awards.head()" @@ -356,7 +446,12 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": 
"2019-06-30T20:29:19.359643Z", + "start_time": "2019-06-30T20:29:19.349544Z" + } + }, "outputs": [], "source": [ "# Get all of the values in the \"name\" column in the df_ucpay dataframe \n", @@ -374,7 +469,12 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:29:20.633341Z", + "start_time": "2019-06-30T20:29:20.624167Z" + } + }, "outputs": [], "source": [ "# Get all of the values in the \"FirstName\" column in the df_nsf_awards dataframe\n", @@ -385,7 +485,12 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:29:21.450620Z", + "start_time": "2019-06-30T20:29:21.436951Z" + } + }, "outputs": [], "source": [ "# Get all of the values in the \"LastName\" column in the df_nsf_awards dataframe\n", @@ -405,7 +510,12 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:29:24.330673Z", + "start_time": "2019-06-30T20:29:24.314937Z" + } + }, "outputs": [], "source": [ "# Take the first name from the UC dataset\n", @@ -430,7 +540,12 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:29:27.485413Z", + "start_time": "2019-06-30T20:29:27.428516Z" + } + }, "outputs": [], "source": [ "def split_names(name):\n", @@ -481,7 +596,12 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:29:31.060418Z", + "start_time": "2019-06-30T20:29:29.917927Z" + } + }, "outputs": [], "source": [ "# Apply our function to all the names in the UC dataset\n", @@ -498,7 +618,12 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:29:32.443955Z", + "start_time": "2019-06-30T20:29:32.159706Z" + } + }, "outputs": [], "source": [ 
"ls_first, ls_middle, ls_last = zip(*ls_cleaned_names)" @@ -507,7 +632,12 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:29:33.114011Z", + "start_time": "2019-06-30T20:29:32.727934Z" + } + }, "outputs": [], "source": [ "# Put colums in the UC dataset for first, middle, and last name\n", @@ -520,6 +650,10 @@ "cell_type": "code", "execution_count": null, "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:29:34.070913Z", + "start_time": "2019-06-30T20:29:34.021451Z" + }, "scrolled": true }, "outputs": [], @@ -539,7 +673,12 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:29:36.893652Z", + "start_time": "2019-06-30T20:29:36.791626Z" + } + }, "outputs": [], "source": [ "df_nsf_awards.dropna(subset=['FirstName','LastName'], inplace=True)" @@ -555,7 +694,12 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:29:37.959183Z", + "start_time": "2019-06-30T20:29:37.930545Z" + } + }, "outputs": [], "source": [ "df_nsf_awards['first'] = [unicode(name.lower()) for name in df_nsf_awards['FirstName'].values]\n", @@ -575,6 +719,10 @@ "cell_type": "code", "execution_count": null, "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:29:41.392713Z", + "start_time": "2019-06-30T20:29:41.320103Z" + }, "scrolled": false }, "outputs": [], @@ -598,7 +746,12 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:31:55.297567Z", + "start_time": "2019-06-30T20:31:55.230080Z" + } + }, "outputs": [], "source": [ "class StringComparators():\n", @@ -629,36 +782,56 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:31:56.414063Z", + "start_time": "2019-06-30T20:31:56.402161Z" 
+ } + }, "outputs": [], "source": [ "# Get all of the unique names from NSF and UC \n", - "nsf_firstnames = set( df_nsf_awards['first'].values ) \n", + "nsf_firstnames = set(df_nsf_awards['first'].values) \n", "\n", "# grab the uc_names\n", - "uc_firstnames = df_ucpay['first'].values " + "uc_firstnames = df_ucpay['first'].values" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:31:56.831857Z", + "start_time": "2019-06-30T20:31:56.819260Z" + } + }, "outputs": [], "source": [ "# Comparison of records\n", - "testname = unicode(uc_firstnames[0])" + "testname = unicode(uc_firstnames[0])\n", + "testname" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:36:01.976639Z", + "start_time": "2019-06-30T20:36:01.934623Z" + } + }, "outputs": [], "source": [ - "# we should document this better and uc_names an argument\n", - "def get_matching_first_name(testname, NUM_NAMES=10):\n", + "def get_matching_first_name(testname, uc_firstnames, NUM_NAMES=10):\n", " \"\"\"\n", - " get top 10 first names that match\n", + " Get the top NUM_NAMES first names from UC that match the testname\n", + " \n", + " :param testname: string to test\n", + " :param uc_firstnames: list of names from UC\n", + " :param NUM_NAMES: number of most similar matches to return\n", + " :return: list with the NUM_NAMES most similar matches ordered by similarity\n", " \"\"\"\n", " dict_name_pair = {}\n", " for name in uc_firstnames:\n", @@ -678,21 +851,31 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:32:31.795435Z", + "start_time": "2019-06-30T20:32:31.349966Z" + } + }, "outputs": [], "source": [ - "print(testname, get_matching_first_name(testname))" + "print(testname, get_matching_first_name(testname, uc_firstnames))" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + 
"metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:32:43.569250Z", + "start_time": "2019-06-30T20:32:32.044145Z" + } + }, "outputs": [], "source": [ - "for nm in uc_firstnames[:25]:\n", + "for i, nm in enumerate(uc_firstnames[:25]):\n", " testname = unicode(nm)\n", - " print(testname, get_matching_first_name(testname))" + " print(i+1, testname, get_matching_first_name(testname, uc_firstnames))" ] }, { @@ -715,7 +898,12 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:32:48.377777Z", + "start_time": "2019-06-30T20:32:48.345407Z" + } + }, "outputs": [], "source": [ "dict_nsf_awards = df_nsf_awards[:10].to_dict(orient='index')" @@ -724,7 +912,26 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:32:49.052801Z", + "start_time": "2019-06-30T20:32:49.009830Z" + } + }, + "outputs": [], + "source": [ + "dict_nsf_awards" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:36:17.939991Z", + "start_time": "2019-06-30T20:36:17.894355Z" + } + }, "outputs": [], "source": [ "def create_rule_mask(nsf_first_name, \n", @@ -767,7 +974,12 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:36:19.185824Z", + "start_time": "2019-06-30T20:36:19.094814Z" + } + }, "outputs": [], "source": [ "def match_records(dict_nsf_awards, df_ucpay, f_create_rule_mask):\n", @@ -802,12 +1014,13 @@ " df_matches = df_ucpay[jaro_mask]\n", " if len(df_matches) == 0:\n", " print('No Match: {} {}'.format(nsf_first_name,nsf_last_name))\n", - " for row in df_matches.iterrows():\n", - " dict_test_row['ID'] = row[1]['ID']\n", - " dict_test_row['campus'] = row[1]['campus']\n", - " dict_test_row['name'] = row[1]['name']\n", - " dict_test_row['title'] = row[1]['title']\n", - " 
df_linked_data = df_linked_data.append(dict_test_row, ignore_index=True)\n", + " else:\n", + " for row in df_matches.iterrows():\n", + " dict_test_row['ID'] = row[1]['ID']\n", + " dict_test_row['campus'] = row[1]['campus']\n", + " dict_test_row['name'] = row[1]['name']\n", + " dict_test_row['title'] = row[1]['title']\n", + " df_linked_data = df_linked_data.append(dict_test_row, ignore_index=True)\n", " \n", " return df_linked_data" ] @@ -815,7 +1028,12 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:36:28.469466Z", + "start_time": "2019-06-30T20:36:20.301998Z" + } + }, "outputs": [], "source": [ "df_linked_data = match_records(dict_nsf_awards, df_ucpay, create_rule_mask )" @@ -824,7 +1042,12 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:36:28.556271Z", + "start_time": "2019-06-30T20:36:28.474144Z" + } + }, "outputs": [], "source": [ "selected_columns = ['AwardId', 'CityName', 'FirstName', 'ID', 'LastName', 'Name', 'campus', 'title', 'first', 'last']\n", @@ -864,13 +1087,20 @@ "source": [ "The final way is matching records *probabilistically*. The **Fellegi-Sunter** model compares selected similiar fields in two records and calculates a similarity score, or a weighted probablity of the two records being the same entity. \n", "\n", - "The algorithm is the following: two fields are first compared using a metric, in this case, the Jaro-Winkler distance (which can be between 0 and 1). The Jaro-Winkler distance is binned into one of three categories: exact match, close match, or no match. Each category has an associated distribution, based on known matches and unmatches. The log probability of being a match and the log probability of being a non-match are calculated for each pair. The final score is the log probablity of being a match minus the log probablity of being a non-match. 
If the final score is greater than a threshold, then the records are considered to match." + "The algorithm is the following: two fields are first compared using a metric, in this case, the Jaro-Winkler distance (which can be between 0 and 1). The Jaro-Winkler distance is binned into one of three categories: exact match, close match, or no match. Each category has an associated distribution, based on known matches and unmatches. The log probability of being a match and the log probability of being a non-match are calculated for each pair. The final score is the log probability of being a match minus the log probability of being a non-match. If the final score is greater than a threshold, then the records are considered to match.\n", + "\n", + "There is already an implementation in Python for the Fellegi-Sunter model in the **recordlinkage** package `pip install recordlinkage`." ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T21:00:59.871185Z", + "start_time": "2019-06-30T21:00:59.526748Z" + } + }, "outputs": [], "source": [ "# We should probably just make these simple functions instead of objects\n", @@ -894,7 +1124,7 @@ " \n", " * exact match is a jaro-winkler score >= 0.92\n", " * close match is a jaro-winkler score > 0.85\n", - " * no match is a jaro-winkler score < 0.85\n", + " * no match is a jaro-winkler score <= 0.85\n", " \n", " Parameters\n", " ----------\n", @@ -946,9 +1176,9 @@ " #grab the m and u weights\n", " \n", " first_name_m_weight = self.m_weights['first_name'][first_name_score]\n", - " first_name_u_weight = self.u_weights['last_name'][first_name_score]\n", + " first_name_u_weight = self.u_weights['first_name'][first_name_score]\n", " \n", - " last_name_m_weight = self.m_weights['first_name'][last_name_score]\n", + " last_name_m_weight = self.m_weights['last_name'][last_name_score]\n", " last_name_u_weight = self.u_weights['last_name'][last_name_score]\n", " \n", " 
log_prob_match = math.log(first_name_m_weight) + math.log(last_name_m_weight)\n", @@ -988,7 +1218,12 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T21:01:00.290652Z", + "start_time": "2019-06-30T21:01:00.283463Z" + } + }, "outputs": [], "source": [ "fs = FellegiSunter()" @@ -997,7 +1232,12 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T21:01:00.790494Z", + "start_time": "2019-06-30T21:01:00.783007Z" + } + }, "outputs": [], "source": [ "print( fs.link_record(('Avishek','Kumar'), ('Avishek','Kumar')) )" @@ -1006,7 +1246,12 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T21:01:01.352939Z", + "start_time": "2019-06-30T21:01:01.342885Z" + } + }, "outputs": [], "source": [ "print( fs.link_record( ('Avishek','Kumar'), ('Anup','Kumar') ) )" @@ -1015,7 +1260,12 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T21:01:04.770898Z", + "start_time": "2019-06-30T21:01:04.756602Z" + } + }, "outputs": [], "source": [ "#let's take this new function for a spin\n", @@ -1027,7 +1277,12 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T21:06:27.846712Z", + "start_time": "2019-06-30T21:06:27.813922Z" + } + }, "outputs": [], "source": [ "def create_jaro_mask(nsf_first_name, nsf_last_name, df_ucpay):\n", @@ -1061,7 +1316,12 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T21:07:02.333067Z", + "start_time": "2019-06-30T21:06:40.122671Z" + } + }, "outputs": [], "source": [ "df_linked_data = match_records(dict_nsf_awards, df_ucpay, create_jaro_mask )" @@ -1071,6 +1331,10 @@ "cell_type": "code", "execution_count": 
null, "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T21:07:17.414495Z", + "start_time": "2019-06-30T21:07:17.351252Z" + }, "scrolled": true }, "outputs": [], @@ -1083,7 +1347,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Here is the matching using probablistic matching. We can change the thresholds do see how results will vary." + "Here is the matching using probabilistic matching. We can change the thresholds to see how results will vary." ] }, { @@ -1112,7 +1376,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.5.2" } }, "nbformat": 4,