1 Introduction

1.1 Libraries and datasets we will use

Load packages:

library(tidyverse)
library(haven)
library(labelled)


Load datasets:

# NLS student level data, containing variables about student demographics
nls_stu <- read_dta('https://github.com/anyone-can-cook/rclass1/raw/master/data/nls72/nls72stu_percontor_vars.dta') %>%
  select(id, schcode, bysex, csex, crace, cbirthm, cbirthd, cbirthyr)

# NLS student level data, containing variables about postsecondary education transcripts (PETS)
nls_stu_pets <- read_dta('https://github.com/ozanj/rclass/raw/master/data/nls72/nls72petsstu_v2.dta') %>%
  select(id, reqtrans, numtrans)

# NLS student-transcript level data
nls_tran <- read_dta('https://github.com/ozanj/rclass/raw/master/data/nls72/nls72petstrn_v2.dta') %>%
  select(id, transnum, findisp, trnsflag, terms, fice, state, cofcon, instype, itype)

# NLS student-transcript-term level data
nls_term <- read_dta('https://github.com/ozanj/rclass/raw/master/data/nls72/nls72petstrm_v2.dta') %>%
  select(id, transnum, termnum, courses, termtype, season, sortdate, gradcode, transfer)

1.2 Lecture overview

It is rare for an analysis dataset to consist of data from only one input dataset. For most projects, each analysis dataset contains data from multiple data sources. Therefore, you must become proficient in combining data from multiple data sources.

Two broad topics:

  1. Joining datasets [big topic]
    • Combine two datasets so that the resulting dataset has additional variables
    • The term “join” comes from the relational databases world; Social science research often uses the term “merge” rather than “join”
  2. Appending datasets [small topic]
    • Stack datasets on top of one another so that resulting dataset has more observations, but (typically) the same number of variables
    • Often, longitudinal datasets are created by appending datasets

Wickham differentiates between mutating joins and filtering joins:

  • Mutating joins “add new variables to one data frame from matching observations in another”
  • Filtering joins “filter observations from one data frame based on whether or not they match an observation in the other table”
    • Doesn’t add new variables
    • Usually, purpose is to check the quality of mutating joins (e.g., which observations didn’t match)

Our main focus today is on mutating joins. But filtering joins are useful for data quality checks of mutating joins.

1.3 NLS72 data

The datasets used for this lecture comes from the National Longitudinal Survey of 1972 (NLS72):

  • Student level data
  • Student-transcript level data
  • Student-transcript-term level data

These datasets are good for teaching “joining” because you get practice joining data sources with different “observational levels” (e.g., join student level and student-transcript level data)


Investigating student level nls_stu dataframe

id schcode bysex csex crace cbirthm cbirthd cbirthyr
18 3000 2 1 4 12 9 53
67 3000 1 1 2 10 14 53
83 2518 2 2 7 3 10 54
174 2911 1 1 7 5 11 54
190 800 2 2 7 1 5 55
# Get a feel for the data
names(nls_stu)
#> [1] "id"       "schcode"  "bysex"    "csex"     "crace"    "cbirthm" 
#> [7] "cbirthd"  "cbirthyr"
glimpse(nls_stu)
#> Observations: 22,652
#> Variables: 8
#> $ id       <dbl> 18, 67, 83, 174, 190, 232, 315, 380, 414, 430, 497, 521…
#> $ schcode  <dbl> 3000, 3000, 2518, 2911, 800, 7507, 3000, 9516, 2518, 27…
#> $ bysex    <dbl+lbl> 2, 1, 2, 1, 2, 2, 1, 1, 1, 1, 99, 1, 1, 2, 1, 1, 99…
#> $ csex     <dbl+lbl> 1, 1, 2, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, …
#> $ crace    <dbl+lbl> 4, 2, 7, 7, 7, 7, 7, 7, 7, 4, 7, 7, 7, 7, 7, 7, 7, …
#> $ cbirthm  <dbl> 12, 10, 3, 5, 1, 7, 3, 10, 9, 4, 6, 2, 5, 1, 10, 5, 9, …
#> $ cbirthd  <dbl> 9, 14, 10, 11, 5, 8, 1, 24, 3, 17, 11, 13, 28, 24, 9, 2…
#> $ cbirthyr <dbl> 53, 53, 54, 54, 55, 54, 54, 53, 53, 54, 54, 54, 54, 54,…
nls_stu %>% var_label()
#> $id
#> [1] "unique school code of high school student attended"
#> 
#> $schcode
#> [1] "SCHOOL CODE"
#> 
#> $bysex
#> [1] "SEX OF STUDENT FROM BASE YEAR INSTRUMENT"
#> 
#> $csex
#> [1] "CSEX - SEX OF RESPONDENT"
#> 
#> $crace
#> [1] "RACE COMPOSITE"
#> 
#> $cbirthm
#> [1] "COMPOSITE BIRTH MONTH"
#> 
#> $cbirthd
#> [1] "COMPOSITE BIRTH DAY"
#> 
#> $cbirthyr
#> [1] "COMPOSITE BIRTH YEAR"
# We can investigate individual variables (e.g., bysex variable)
class(nls_stu$bysex)
#> [1] "haven_labelled"
str(nls_stu$bysex)
#>  'haven_labelled' num [1:22652] 2 1 2 1 2 2 1 1 1 1 ...
#>  - attr(*, "label")= chr "SEX OF STUDENT FROM BASE YEAR INSTRUMENT"
#>  - attr(*, "format.stata")= chr "%23.0g"
#>  - attr(*, "labels")= Named num [1:4] 1 2 98 99
#>   ..- attr(*, "names")= chr [1:4] "1. male" "2. female" "98. {ILLEGITIMATE SKIP}" "99. {LEGITIMATE SKIP}"
nls_stu %>% select(bysex) %>% var_label()
#> $bysex
#> [1] "SEX OF STUDENT FROM BASE YEAR INSTRUMENT"
nls_stu %>% select(bysex) %>% val_labels()
#> $bysex
#>                 1. male               2. female 98. {ILLEGITIMATE SKIP} 
#>                       1                       2                      98 
#>   99. {LEGITIMATE SKIP} 
#>                      99
nls_stu %>% count(bysex)
#> # A tibble: 4 x 2
#>                          bysex     n
#>                      <dbl+lbl> <int>
#> 1  1 [1. male]                  8208
#> 2  2 [2. female]                8340
#> 3 98 [98. {ILLEGITIMATE SKIP}]   135
#> 4 99 [99. {LEGITIMATE SKIP}]    5969
nls_stu %>% count(bysex) %>% as_factor()
#> # A tibble: 4 x 2
#>   bysex                       n
#>   <fct>                   <int>
#> 1 1. male                  8208
#> 2 2. female                8340
#> 3 98. {ILLEGITIMATE SKIP}   135
#> 4 99. {LEGITIMATE SKIP}    5969

Investigating student level nls_stu_pets dataframe

id reqtrans numtrans
18 1 0
67 1 1
83 1 0
315 2 2
414 1 0
names(nls_stu_pets)
#> [1] "id"       "reqtrans" "numtrans"
glimpse(nls_stu_pets)
#> Observations: 14,759
#> Variables: 3
#> $ id       <dbl> 18, 67, 83, 315, 414, 430, 802, 836, 935, 1040, 1057, 1…
#> $ reqtrans <dbl> 1, 1, 1, 2, 1, 1, 2, 2, 3, 2, 1, 3, 1, 2, 1, 3, 2, 3, 1…
#> $ numtrans <dbl> 0, 1, 0, 2, 0, 0, 1, 1, 2, 2, 1, 1, 1, 2, 1, 0, 2, 3, 1…
nls_stu_pets %>% var_label()
#> $id
#> [1] "unique student identification variable"
#> 
#> $reqtrans
#> [1] "NUMBER OF TRANSCRIPTS REQUESTED"
#> 
#> $numtrans
#> [1] "NUMBER OF TRANSCRIPTS RECEIVED"

Investigating student-transcript level nls_tran dataframe

id transnum findisp trnsflag terms fice state cofcon instype itype
18 1 6 0 99 1009 1 6 3 1
67 1 1 1 8 5694 1 2 1 5
83 1 3 0 99 1166 5 4 4 4
315 1 1 1 20 1009 1 6 3 1
315 2 1 1 1 1057 1 6 3 2
names(nls_tran)
#>  [1] "id"       "transnum" "findisp"  "trnsflag" "terms"    "fice"    
#>  [7] "state"    "cofcon"   "instype"  "itype"
glimpse(nls_tran)
#> Observations: 24,253
#> Variables: 10
#> $ id       <dbl> 18, 67, 83, 315, 315, 414, 430, 802, 802, 836, 836, 935…
#> $ transnum <dbl> 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 2, 1, 2, 3, 1, 2, 1, 1, 2…
#> $ findisp  <dbl+lbl> 6, 1, 3, 1, 1, 3, 4, 1, 2, 4, 1, 1, 1, 3, 1, 1, 1, …
#> $ trnsflag <dbl+lbl> 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, …
#> $ terms    <dbl> 99, 8, 99, 20, 1, 99, 99, 2, 99, 99, 6, 11, 6, 99, 1, 1…
#> $ fice     <dbl> 1009, 5694, 1166, 1009, 1057, 1166, 2089, 2865, 3687, 1…
#> $ state    <dbl> 1, 1, 5, 1, 1, 5, 21, 33, 46, 14, 23, 15, 23, 15, 5, 33…
#> $ cofcon   <dbl+lbl> 6, 2, 4, 6, 6, 4, 4, 4, 3, 5, 3, 5, 6, 1, 6, 5, 6, …
#> $ instype  <dbl+lbl> 3, 1, 4, 3, 3, 4, 4, 4, 2, 2, 2, 2, 3, 1, 3, 2, 3, …
#> $ itype    <dbl+lbl> 1, 5, 4, 1, 2, 4, 4, 4, 3, 6, 4, 2, 2, 5, 1, 1, 2, …
nls_tran %>% var_label()
#> $id
#> [1] "unique student identification variable"
#> 
#> $transnum
#> [1] "TRANSCRIPT NUMBER"
#> 
#> $findisp
#> [1] "FINAL DISPOSITION"
#> 
#> $trnsflag
#> [1] "TRANSCRIPT FLAG"
#> 
#> $terms
#> [1] "NUMBER OF TERMS ON THIS TRANSCRIPT"
#> 
#> $fice
#> [1] "POSTSECONDARY INSTITUTION ID CODE"
#> 
#> $state
#> [1] "state"
#> 
#> $cofcon
#> [1] "OFFERING & CONTROL"
#> 
#> $instype
#> [1] "INSTITUTION TYPE"
#> 
#> $itype
#> [1] "INSTITUTION TYPE"
nls_tran %>% val_labels()
#> $id
#> NULL
#> 
#> $transnum
#> NULL
#> 
#> $findisp
#>      1. TRANSCRIPT RECEIVED           2. SCHOOL REFUSED 
#>                           1                           2 
#>   3. STUDENT NEVER ATTENDED 4. SCHOOL LOST OR DESTROYED 
#>                           3                           4 
#>            5. SCHOOL CLOSED  6. NO RESPONSE FROM SCHOOL 
#>                           5                           6 
#> 
#> $trnsflag
#>     0. DUMMY TRANSCRIPT 1. REQUESTED & RECEIVED 
#>                       0                       1 
#> 
#> $terms
#> NULL
#> 
#> $fice
#> NULL
#> 
#> $state
#> NULL
#> 
#> $cofcon
#>          1. proprietary      2. PUBLIC < 2-YEAR 3. PRIVATE NFP < 4-YEAR 
#>                       1                       2                       3 
#>        4. PUBLIC 2-YEAR   5. PRIVATE NFP 4-YEAR        6. PUBLIC 4-YEAR 
#>                       4                       5                       6 
#> 
#> $instype
#>   1. proprietary   2. PRIVATE NFP 3. PUBLIC 4-YEAR 4. PUBLIC 2-YEAR 
#>                1                2                3                4 
#>       5. foreign 
#>                5 
#> 
#> $itype
#>   1. Research & Doctoral         2. Comprehensive          3. Liberal Arts 
#>                        1                        2                        3 
#>                4. 2-year      5. Less than 2-year           6. Specialized 
#>                        4                        5                        6 
#> 8. Special comprehensive 
#>                        8

Investigating student-transcript-term level nls_term dataframe

id transnum termnum courses termtype season sortdate gradcode transfer
67 1 1 3 4 2 8101 1 0
67 1 2 2 4 3 8104 1 0
67 1 3 2 4 1 8109 1 0
67 1 4 2 4 2 8201 1 0
67 1 5 2 4 3 8204 1 0
names(nls_term)
#> [1] "id"       "transnum" "termnum"  "courses"  "termtype" "season"  
#> [7] "sortdate" "gradcode" "transfer"
glimpse(nls_term)
#> Observations: 120,885
#> Variables: 9
#> $ id       <dbl> 67, 67, 67, 67, 67, 67, 67, 67, 315, 315, 315, 315, 315…
#> $ transnum <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
#> $ termnum  <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, …
#> $ courses  <dbl> 3, 2, 2, 2, 2, 2, 2, 2, 4, 3, 5, 4, 4, 4, 4, 5, 1, 4, 5…
#> $ termtype <dbl+lbl> 4, 4, 4, 4, 4, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, …
#> $ season   <dbl+lbl> 2, 3, 1, 2, 3, 4, 1, 2, 1, 2, 3, 2, 3, 1, 2, 3, 1, …
#> $ sortdate <dbl> 8101, 8104, 8109, 8201, 8204, 8207, 8209, 8301, 7209, 7…
#> $ gradcode <dbl+lbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
#> $ transfer <dbl+lbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
nls_term %>% var_label()
#> $id
#> [1] "unique student identification variable"
#> 
#> $transnum
#> [1] "TRANSCRIPT NUMBER"
#> 
#> $termnum
#> [1] "TERM NUMBER"
#> 
#> $courses
#> [1] "NUMBER OF COURSES TERM"
#> 
#> $termtype
#> [1] "TYPE OF TERM"
#> 
#> $season
#> [1] "SEASON OF TERM"
#> 
#> $sortdate
#> [1] "DATE OF TERM"
#> 
#> $gradcode
#> [1] "GRADE SCALE TYPE"
#> 
#> $transfer
#> [1] "TRANSFER COURSES FLAG"
nls_term %>% val_labels()
#> $id
#> NULL
#> 
#> $transnum
#> NULL
#> 
#> $termnum
#> NULL
#> 
#> $courses
#> NULL
#> 
#> $termtype
#> 1. VARIABLE LENGTH OR NONCOURSE TERM                          2. semester 
#>                                    1                                    2 
#>                         3. trimester                           4. quarter 
#>                                    3                                    4 
#>             5. CREDIT BY EXAMINATION                         7. {UNKNOWN} 
#>                                    5                                    7 
#> 
#> $season
#>      1. fall    2. winter    3. spring    4. summer 9. {MISSING} 
#>            1            2            3            4            9 
#> 
#> $sortdate
#> NULL
#> 
#> $gradcode
#>  1. LETTER GRADES 2. NUMERIC GRADES 
#>                 1                 2 
#> 
#> $transfer
#>  0. NOT TRANSFER 1. TRANSFER TERM 
#>                0                1

1.4 Relational databases and tables

Traditionally, social scientists store data in “flat files”

  • Flat files are “rectangular” datasets consisting of columns (usually called variables) and rows (usually called observations)
  • When we want to incorporate variables from two flat files, we “merge” them together

The rest of the world stores data in “relational databases”

  • A relational database consists of multiple tables
  • Each table is a flat file
  • A goal of relational databases is to store data using the minimum possible space; therefore, a rule is to never duplicate information across tables
  • When you need information from multiple tables, you “join” (the database term for “merge”) tables “on the fly” rather than creating some permanent flat file that contains data from multiple tables
  • Each table in a relational database has a different “observational level”
    • For example, NLS72 has a student level table, a student-transcript level table, etc.
    • From the perspective of a database person, it wouldn’t make sense to store student level variables (e.g., birth-date) on the student-transcript level table because student birth-date does not vary by transcript, so this would result in needless duplication of data
  • Structured Query Language (SQL) is the universally-used programming language for relational databases

Real-world examples of relational databases

  • iTunes
    • Behind the scenes, there are separate tables for artist, album, song, genre, etc.
    • The “library” you see as a user is the result of “on the fly” joins of these tables
  • Every social media app (e.g., Twitter, FB, Instagram) you use is a relational database
    • What you see as a user is the result of “on the fly” joins of individual tables running in the background
  • Interactive maps typically have relational databases running in the background
    • Clicking on some part of the map triggers a join of tables and you see the result of some analysis based on that join
    • E.g., our off-campus recruiting map

Should you think of combining dataframes in R as “merging” flat files or “joining” tables of a relational database?

  • Can think of it either way; but I think better to think of it both ways
  • For example, you can think of the NLS72 datasets as:
    • A bunch of flat files that we merge
    • OR a set of tables that comprise a relational database
  • Although we are combining flat files, tidyverse uses the terminology (e.g, “keys”, “join”) of relational databases for doing so

2 Keys

What are keys?

  • Keys are “the variables used to connect each pair of tables” [see here]
  • Even though relational databases often consist of many tables, relations are always defined between a pair of tables
  • When joining tables, focus on joining one table to another; you make this “join” using the key variable(s) that define the relationship between these two tables
  • Even when your analysis requires variables from more than two tables, you proceed by joining one pair of tables at a time


There are two types of keys: primary keys and foreign keys

  • “The main reason for primary and foreign keys is to enforce data consistency” [see here]
  • Although primary and foreign keys do not technically need to be defined in order to perform a join, they are important for maintaining database integrity

2.1 Primary keys

What is a primary key?

  • A primary key uniquely identifies an observation in its own table
  • It is a variable (or combination of variables) in a table that uniquely identifies observations in its own table


Example: id is the primary key in the nls_stu table

The variable id (“unique student identification variable”) uniquely identifies observations in the dataset nls_stu. In other words, no two observations in nls_stu have the same value of id.

Let’s confirm that each value of id is associated with only one observation:

nls_stu %>% group_by(id) %>%  # group by primary key
  summarise(n_per_id=n()) %>%  # create a measure of number of observations per group
  ungroup() %>%  # ungroup, otherwise frequency table [next step] created separately for each group
  count(n_per_id)  # frequency of number of observations per group
#> # A tibble: 1 x 2
#>   n_per_id     n
#>      <int> <int>
#> 1        1 22652

An alternative way to confirm that no values of id have more than one observation:

nls_stu %>% 
  count(id) %>%  # create object that counts the number of obs for each value of id
  filter(n>1)  # keep only rows where count of obs per id is greater than 1
#> # A tibble: 0 x 2
#> # … with 2 variables: id <dbl>, n <int>


Example: id and transnum make up the primary key in the nls_tran table

Sometimes, more than one variable is required to make up the primary key in a table. For example, in the student-transcript level table nls_tran, both id and transnum are needed to uniquely identify the observations in the table:

nls_tran %>%
  group_by(id, transnum) %>%
  summarise(n_per_key=n()) %>% 
  ungroup() %>% 
  count(n_per_key)
#> # A tibble: 1 x 2
#>   n_per_key     n
#>       <int> <int>
#> 1         1 24253

2.1.1 Student exercise

Which variable(s) make up the primary key in the nls_term table?

Solution

The variables id, transnum, and termnum make up the primary key for the nls_term table because these variables together uniquely identify each observation in the table:

nls_term %>% 
  group_by(id, transnum, termnum) %>% 
  summarise(n_per_key=n()) %>% 
  ungroup() %>% 
  count(n_per_key)
#> # A tibble: 1 x 2
#>   n_per_key      n
#>       <int>  <int>
#> 1         1 120885

2.2 Foreign keys

What is a foreign key?

  • A foreign key uniquely identifies an observation in another table
  • Said differently, it is a variable (or combination of variables) in a table that is the primary key in another table
  • A foreign key creates a relationship between two tables


Example: With respect to nls_stu, id is the foreign key in the nls_tran table

Recall that the student id is the primary key in nls_stu because it uniquely identifies each observation in the table. In terms of the relationship between nls_stu and nls_tran, we can say the id variable is the foreign key in nls_tran because it can be used to link the two tables.


Ozan’s way of thinking about foreign key

Personally, I find the concept foreign key a little bit slippery. Here is how I wrap my head around it:

  • First, always remember that “joins” happen between two specific tables, so have two specific tables in mind
  • Second, to understand foreign key concept, I think of a “focal table” [my term] (e.g., nls_tran) and some “other table” (e.g., nls_stu)
  • Third, then, the foreign key is a variable (or combination of variables) that satisfies two conditions:
    1. Exists in the “focal table” (but may or may not be the primary key for the focal table)
    2. Exists in the “other table” AND is the primary key for that “other table”

Example of foreign key

  • With respect to the “focal table” nls_tran and the “other table” nls_stu, the variable id is the foreign key because:
    • id exists in the “focal table” nls_trans (though it does not uniquely identifies observations in nls_trans)
    • id exists in the “other table” nls_stu and id uniquely identifies observations in nls_stu
nls_tran %>% group_by(id) %>% summarise(n_per_key=n()) %>% ungroup %>% count(n_per_key)
#> # A tibble: 7 x 2
#>   n_per_key     n
#>       <int> <int>
#> 1         1  8022
#> 2         2  4558
#> 3         3  1681
#> 4         4   415
#> 5         5    71
#> 6         6     6
#> 7         7     3
nls_stu %>% group_by(id) %>% summarise(n_per_key=n()) %>% ungroup %>% count(n_per_key)
#> # A tibble: 1 x 2
#>   n_per_key     n
#>       <int> <int>
#> 1         1 22652

In practice, you join two tables without explicit thinking about “primary key” vs. “foreign key” and “focal table” vs. “other table”

  • Doesn’t matter which data frame you “start with” (e.g., as the “focal table”)

The only requirements for joining are:

  1. One of the two data frames have a primary key (variable or combination of variables that uniquely identify observations) AND
  2. That variable or combination of variables is available in the other of the two data frames

With this in mind, let’s examine the dataframes nls_stu and nls_tran

# requirement 1: at least one of the two data frames must have a primary key
  # let's check if id is the primary key in nls_stu
nls_stu %>% group_by(id) %>% summarise(n_per_key=n()) %>% ungroup %>% count(n_per_key)
#> # A tibble: 1 x 2
#>   n_per_key     n
#>       <int> <int>
#> 1         1 22652
# requirement 2: that primary key (id) is available in the other of the two data frames
names(nls_tran)
#>  [1] "id"       "transnum" "findisp"  "trnsflag" "terms"    "fice"    
#>  [7] "state"    "cofcon"   "instype"  "itype"
nls_tran %>% arrange(id, transnum) %>% select(id,transnum,findisp,itype) %>% head(20)
#> # A tibble: 20 x 4
#>       id transnum                         findisp                     itype
#>    <dbl>    <dbl>                       <dbl+lbl>                 <dbl+lbl>
#>  1    18        1 6 [6. NO RESPONSE FROM SCHOOL]  1 [1. Research & Doctora…
#>  2    67        1 1 [1. TRANSCRIPT RECEIVED]      5 [5. Less than 2-year]  
#>  3    83        1 3 [3. STUDENT NEVER ATTENDED]   4 [4. 2-year]            
#>  4   315        1 1 [1. TRANSCRIPT RECEIVED]      1 [1. Research & Doctora…
#>  5   315        2 1 [1. TRANSCRIPT RECEIVED]      2 [2. Comprehensive]     
#>  6   414        1 3 [3. STUDENT NEVER ATTENDED]   4 [4. 2-year]            
#>  7   430        1 4 [4. SCHOOL LOST OR DESTROYED] 4 [4. 2-year]            
#>  8   802        1 1 [1. TRANSCRIPT RECEIVED]      4 [4. 2-year]            
#>  9   802        2 2 [2. SCHOOL REFUSED]           3 [3. Liberal Arts]      
#> 10   836        1 4 [4. SCHOOL LOST OR DESTROYED] 6 [6. Specialized]       
#> 11   836        2 1 [1. TRANSCRIPT RECEIVED]      4 [4. 2-year]            
#> 12   935        1 1 [1. TRANSCRIPT RECEIVED]      2 [2. Comprehensive]     
#> 13   935        2 1 [1. TRANSCRIPT RECEIVED]      2 [2. Comprehensive]     
#> 14   935        3 3 [3. STUDENT NEVER ATTENDED]   5 [5. Less than 2-year]  
#> 15  1040        1 1 [1. TRANSCRIPT RECEIVED]      1 [1. Research & Doctora…
#> 16  1040        2 1 [1. TRANSCRIPT RECEIVED]      1 [1. Research & Doctora…
#> 17  1057        1 1 [1. TRANSCRIPT RECEIVED]      2 [2. Comprehensive]     
#> 18  1099        1 4 [4. SCHOOL LOST OR DESTROYED] 4 [4. 2-year]            
#> 19  1099        2 4 [4. SCHOOL LOST OR DESTROYED] 4 [4. 2-year]            
#> 20  1099        3 1 [1. TRANSCRIPT RECEIVED]      5 [5. Less than 2-year]
#nls_tran %>% group_by(id) %>% summarise(n_per_key=n()) %>% ungroup %>% count(n_per_key)

2.2.1 Student exercise

With respect to nls_trans, which variable(s) make up the foreign key in the nls_term table?

Solution

Since id and transnum make up the primary key in the nls_trans table, these variables make up the foreign key in nls_term because they can be used to link the two tables:

# id and transnum make up the primary key in nls_trans because they uniquely identify each obs
nls_tran %>% 
  group_by(id, transnum) %>% 
  summarise(n_per_key=n()) %>% 
  ungroup() %>% 
  count(n_per_key)
#> # A tibble: 1 x 2
#>   n_per_key     n
#>       <int> <int>
#> 1         1 24253
# id and transnum also both exist in nls_term, so they form the foreign key that links the 2 tables
names(nls_term)
#> [1] "id"       "transnum" "termnum"  "courses"  "termtype" "season"  
#> [7] "sortdate" "gradcode" "transfer"

3 Mutating joins

Following Wickham, we’ll explain joins using the hypothetical tables x and y:

Note: Credit for examples and images belong to Wickham


What is a join?

  • A join “is a way of connecting each row in x to zero, one, or more rows in y
  • Observations in table x matched to observations in table y using a “key” variable
  • A key is a variable (or combination of variables) that exist in both tables and uniquely identifies observations in at least one of the two tables


There are four types of joins between tables x and y:

  • inner join: keep all observations that appear in both table x and table y
  • left join: keep all observations in x (regardless of whether these obs appear in y)
  • right join: keep all observations in y (regardless of whether these obs appear in x)
  • full join: keep all observations that appear in x or in y

The last three joins – left, right, full – keep observations that appear in at least one table and are collectively referred to as outer joins.


We will join tables x and y using the join() command from dplyr package. join() is a general command, which has more specific commands for each type of join:

  • inner_join()
  • left_join()
  • right_join()
  • full_join()

3.1 Inner joins

Inner joins keep all observations that appear in both table x and table y (i.e., keep observations 1 and 2):


The inner_join() function:

?inner_join

# SYNTAX AND DEFAULT VALUES
inner_join(
  x,
  y,
  by = NULL,
  copy = FALSE,
  suffix = c(".x", ".y"),
  ...,
  na_matches = c("na", "never")
)
  • Function: Perform inner join between dataframes x and y
  • Arguments:
    • x, y: Dataframes to join
    • by: Variable(s) to join by
      • If joining by single variable: inner_join(x, y, by = "keyvar")
      • If joining by multiple variables: inner_join(x, y, by = c("keyvar1", "keyvar2", ...))
      • If variables to join by have different names: inner_join(x, y, by = c("keyvarx" = "keyvary", ...))


Example: Using inner_join() to join x and y

Table x
key val_x
1 x1
2 x2
3 x3
Table y
key val_y
1 y1
2 y2
4 y3

We want to join x and y by the common variable key:

inner_join(x, y, by = "key")  # Equivalent to: inner_join(x, y, by = c("key" = "key"))
#> # A tibble: 2 x 3
#>     key val_x val_y
#>   <dbl> <chr> <chr>
#> 1     1 x1    y1   
#> 2     2 x2    y2


This is an example of a one-to-one relationship because key uniquely identifies observations in both table x and y. In other words, key can be the primary key for both x and y, as well as foreign key when we think about the relationship between the tables.


Example: One-to-one inner join between nls_stu and nls_stu_pets

Recall that nls_stu and nls_stu_pets are both student level data

  • nls_stu contains info on student demographics and nls_stu_pets contains info on student postsecondary education transcripts (PETS). It makes sense that we’ll want to join the tables by student, namely using the student id variable. This will be a one-to-one join because id uniquely identifies the observations in both nls_stu and nls_stu_pets:
nls_stu %>% group_by(id) %>% summarise(n_per_id=n()) %>% ungroup() %>% count(n_per_id) 
#> # A tibble: 1 x 2
#>   n_per_id     n
#>      <int> <int>
#> 1        1 22652
nls_stu_pets %>% group_by(id) %>% summarise(n_per_id=n()) %>% ungroup() %>% count(n_per_id) 
#> # A tibble: 1 x 2
#>   n_per_id     n
#>      <int> <int>
#> 1        1 14759
# Join the nls_stu and nls_stu_pets by id variable
nls_stu_stu <- nls_stu %>% inner_join(nls_stu_pets, by = "id")

# Joined dataframe now have variables from both nls_stu and nls_stu_pets
names(nls_stu_stu)
#>  [1] "id"       "schcode"  "bysex"    "csex"     "crace"    "cbirthm" 
#>  [7] "cbirthd"  "cbirthyr" "reqtrans" "numtrans"
# Print a few obs and selected variables from joined dataframe
nls_stu_stu %>% select(id, bysex, crace, reqtrans, numtrans) %>% as_factor()
#> # A tibble: 14,759 x 5
#>       id bysex                 crace           reqtrans numtrans
#>    <dbl> <fct>                 <fct>              <dbl>    <dbl>
#>  1    18 2. female             4. PUERTO RICAN        1        0
#>  2    67 1. male               2. black               1        1
#>  3    83 2. female             7. white               1        0
#>  4   315 1. male               7. white               2        2
#>  5   414 1. male               7. white               1        0
#>  6   430 1. male               4. PUERTO RICAN        1        0
#>  7   802 2. female             7. white               2        1
#>  8   836 1. male               7. white               2        1
#>  9   935 99. {LEGITIMATE SKIP} 7. white               3        2
#> 10  1040 1. male               7. white               2        2
#> # … with 14,749 more rows
# Compare number of obs in input datasets to joined dataset
nrow(nls_stu)
#> [1] 22652
nrow(nls_stu_pets)
#> [1] 14759
# Remember inner join only keeps obs that are common in both tables, so it makes sense that the joined dataset cannot have more rows than either input datasets
nrow(nls_stu_stu)
#> [1] 14759

3.1.1 Duplicate keys

There are times when the key variable(s) we join by do not uniquely identify observations in both the tables. For example, the table on the right (y) below has unique values for key but the table on the left (x) contains multiple observations with the same value: