NLS72 data
The datasets used for this lecture comes from the National
Longitudinal Survey of 1972 (NLS72):
- Student level data
- Student-transcript level data
- Student-transcript-term level data
These datasets are good for teaching “joining” because you get
practice joining data sources with different “observational levels”
(e.g., join student level and student-transcript level data)
Investigating student level nls_stu
dataframe
id
|
schcode
|
bysex
|
csex
|
crace
|
cbirthm
|
cbirthd
|
cbirthyr
|
18
|
3000
|
2
|
1
|
4
|
12
|
9
|
53
|
67
|
3000
|
1
|
1
|
2
|
10
|
14
|
53
|
83
|
2518
|
2
|
2
|
7
|
3
|
10
|
54
|
174
|
2911
|
1
|
1
|
7
|
5
|
11
|
54
|
190
|
800
|
2
|
2
|
7
|
1
|
5
|
55
|
# Get a feel for the data
names(nls_stu)
#> [1] "id" "schcode" "bysex" "csex" "crace" "cbirthm" "cbirthd"
#> [8] "cbirthyr"
glimpse(nls_stu)
#> Rows: 22,652
#> Columns: 8
#> $ id <dbl> 18, 67, 83, 174, 190, 232, 315, 380, 414, 430, 497, 521, 554,…
#> $ schcode <dbl> 3000, 3000, 2518, 2911, 800, 7507, 3000, 9516, 2518, 2701, 28…
#> $ bysex <dbl+lbl> 2, 1, 2, 1, 2, 2, 1, 1, 1, 1, 99, 1, 1, 2, …
#> $ csex <dbl+lbl> 1, 1, 2, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 2, 1, …
#> $ crace <dbl+lbl> 4, 2, 7, 7, 7, 7, 7, 7, 7, 4, 7, 7, 7, 7, 7, 7, 7, 7, 7, …
#> $ cbirthm <dbl> 12, 10, 3, 5, 1, 7, 3, 10, 9, 4, 6, 2, 5, 1, 10, 5, 9, 8, 12,…
#> $ cbirthd <dbl> 9, 14, 10, 11, 5, 8, 1, 24, 3, 17, 11, 13, 28, 24, 9, 24, 20,…
#> $ cbirthyr <dbl> 53, 53, 54, 54, 55, 54, 54, 53, 53, 54, 54, 54, 54, 54, 54, 5…
nls_stu %>% var_label()
#> $id
#> [1] "unique school code of high school student attended"
#>
#> $schcode
#> [1] "SCHOOL CODE"
#>
#> $bysex
#> [1] "SEX OF STUDENT FROM BASE YEAR INSTRUMENT"
#>
#> $csex
#> [1] "CSEX - SEX OF RESPONDENT"
#>
#> $crace
#> [1] "RACE COMPOSITE"
#>
#> $cbirthm
#> [1] "COMPOSITE BIRTH MONTH"
#>
#> $cbirthd
#> [1] "COMPOSITE BIRTH DAY"
#>
#> $cbirthyr
#> [1] "COMPOSITE BIRTH YEAR"
# We can investigate individual variables (e.g., bysex variable)
class(nls_stu$bysex)
#> [1] "haven_labelled" "vctrs_vctr" "double"
str(nls_stu$bysex)
#> dbl+lbl [1:22652] 2, 1, 2, 1, 2, 2, 1, 1, 1, 1, 99, 1, 1, 2, ...
#> @ label : chr "SEX OF STUDENT FROM BASE YEAR INSTRUMENT"
#> @ format.stata: chr "%23.0g"
#> @ labels : Named num [1:4] 1 2 98 99
#> ..- attr(*, "names")= chr [1:4] "1. male" "2. female" "98. {ILLEGITIMATE SKIP}" "99. {LEGITIMATE SKIP}"
nls_stu %>% select(bysex) %>% var_label()
#> $bysex
#> [1] "SEX OF STUDENT FROM BASE YEAR INSTRUMENT"
nls_stu %>% select(bysex) %>% val_labels()
#> $bysex
#> 1. male 2. female 98. {ILLEGITIMATE SKIP}
#> 1 2 98
#> 99. {LEGITIMATE SKIP}
#> 99
nls_stu %>% count(bysex)
#> # A tibble: 4 × 2
#> bysex n
#> <dbl+lbl> <int>
#> 1 1 [1. male] 8208
#> 2 2 [2. female] 8340
#> 3 98 [98. {ILLEGITIMATE SKIP}] 135
#> 4 99 [99. {LEGITIMATE SKIP}] 5969
nls_stu %>% count(bysex) %>% as_factor()
#> # A tibble: 4 × 2
#> bysex n
#> <fct> <int>
#> 1 1. male 8208
#> 2 2. female 8340
#> 3 98. {ILLEGITIMATE SKIP} 135
#> 4 99. {LEGITIMATE SKIP} 5969
Investigating student level nls_stu_pets
dataframe
id
|
reqtrans
|
numtrans
|
18
|
1
|
0
|
67
|
1
|
1
|
83
|
1
|
0
|
315
|
2
|
2
|
414
|
1
|
0
|
names(nls_stu_pets)
#> [1] "id" "reqtrans" "numtrans"
glimpse(nls_stu_pets)
#> Rows: 14,759
#> Columns: 3
#> $ id <dbl> 18, 67, 83, 315, 414, 430, 802, 836, 935, 1040, 1057, 1099, 1…
#> $ reqtrans <dbl> 1, 1, 1, 2, 1, 1, 2, 2, 3, 2, 1, 3, 1, 2, 1, 3, 2, 3, 1, 1, 1…
#> $ numtrans <dbl> 0, 1, 0, 2, 0, 0, 1, 1, 2, 2, 1, 1, 1, 2, 1, 0, 2, 3, 1, 1, 1…
nls_stu_pets %>% var_label()
#> $id
#> [1] "unique student identification variable"
#>
#> $reqtrans
#> [1] "NUMBER OF TRANSCRIPTS REQUESTED"
#>
#> $numtrans
#> [1] "NUMBER OF TRANSCRIPTS RECEIVED"
Investigating student-transcript level nls_tran
dataframe
id
|
transnum
|
findisp
|
trnsflag
|
terms
|
fice
|
state
|
cofcon
|
instype
|
itype
|
18
|
1
|
6
|
0
|
99
|
1009
|
1
|
6
|
3
|
1
|
67
|
1
|
1
|
1
|
8
|
5694
|
1
|
2
|
1
|
5
|
83
|
1
|
3
|
0
|
99
|
1166
|
5
|
4
|
4
|
4
|
315
|
1
|
1
|
1
|
20
|
1009
|
1
|
6
|
3
|
1
|
315
|
2
|
1
|
1
|
1
|
1057
|
1
|
6
|
3
|
2
|
names(nls_tran)
#> [1] "id" "transnum" "findisp" "trnsflag" "terms" "fice"
#> [7] "state" "cofcon" "instype" "itype"
glimpse(nls_tran)
#> Rows: 24,253
#> Columns: 10
#> $ id <dbl> 18, 67, 83, 315, 315, 414, 430, 802, 802, 836, 836, 935, 935,…
#> $ transnum <dbl> 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 2, 1, 2, 3, 1, 2, 1, 1, 2, 3, 1…
#> $ findisp <dbl+lbl> 6, 1, 3, 1, 1, 3, 4, 1, 2, 4, 1, 1, 1, 3, 1, 1, 1, 4, 4, …
#> $ trnsflag <dbl+lbl> 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, …
#> $ terms <dbl> 99, 8, 99, 20, 1, 99, 99, 2, 99, 99, 6, 11, 6, 99, 1, 10, 8, …
#> $ fice <dbl> 1009, 5694, 1166, 1009, 1057, 1166, 2089, 2865, 3687, 1666, 2…
#> $ state <dbl> 1, 1, 5, 1, 1, 5, 21, 33, 46, 14, 23, 15, 23, 15, 5, 33, 11, …
#> $ cofcon <dbl+lbl> 6, 2, 4, 6, 6, 4, 4, 4, 3, 5, 3, 5, 6, 1, 6, 5, 6, 4, 4, …
#> $ instype <dbl+lbl> 3, 1, 4, 3, 3, 4, 4, 4, 2, 2, 2, 2, 3, 1, 3, 2, 3, 4, 4, …
#> $ itype <dbl+lbl> 1, 5, 4, 1, 2, 4, 4, 4, 3, 6, 4, 2, 2, 5, 1, 1, 2, 4, 4, …
nls_tran %>% var_label()
#> $id
#> [1] "unique student identification variable"
#>
#> $transnum
#> [1] "TRANSCRIPT NUMBER"
#>
#> $findisp
#> [1] "FINAL DISPOSITION"
#>
#> $trnsflag
#> [1] "TRANSCRIPT FLAG"
#>
#> $terms
#> [1] "NUMBER OF TERMS ON THIS TRANSCRIPT"
#>
#> $fice
#> [1] "POSTSECONDARY INSTITUTION ID CODE"
#>
#> $state
#> [1] "state"
#>
#> $cofcon
#> [1] "OFFERING & CONTROL"
#>
#> $instype
#> [1] "INSTITUTION TYPE"
#>
#> $itype
#> [1] "INSTITUTION TYPE"
nls_tran %>% val_labels()
#> $id
#> NULL
#>
#> $transnum
#> NULL
#>
#> $findisp
#> 1. TRANSCRIPT RECEIVED 2. SCHOOL REFUSED
#> 1 2
#> 3. STUDENT NEVER ATTENDED 4. SCHOOL LOST OR DESTROYED
#> 3 4
#> 5. SCHOOL CLOSED 6. NO RESPONSE FROM SCHOOL
#> 5 6
#>
#> $trnsflag
#> 0. DUMMY TRANSCRIPT 1. REQUESTED & RECEIVED
#> 0 1
#>
#> $terms
#> NULL
#>
#> $fice
#> NULL
#>
#> $state
#> NULL
#>
#> $cofcon
#> 1. proprietary 2. PUBLIC < 2-YEAR 3. PRIVATE NFP < 4-YEAR
#> 1 2 3
#> 4. PUBLIC 2-YEAR 5. PRIVATE NFP 4-YEAR 6. PUBLIC 4-YEAR
#> 4 5 6
#>
#> $instype
#> 1. proprietary 2. PRIVATE NFP 3. PUBLIC 4-YEAR 4. PUBLIC 2-YEAR
#> 1 2 3 4
#> 5. foreign
#> 5
#>
#> $itype
#> 1. Research & Doctoral 2. Comprehensive 3. Liberal Arts
#> 1 2 3
#> 4. 2-year 5. Less than 2-year 6. Specialized
#> 4 5 6
#> 8. Special comprehensive
#> 8
Investigating student-transcript-term level
nls_term
dataframe
id
|
transnum
|
termnum
|
courses
|
termtype
|
season
|
sortdate
|
gradcode
|
transfer
|
67
|
1
|
1
|
3
|
4
|
2
|
8101
|
1
|
0
|
67
|
1
|
2
|
2
|
4
|
3
|
8104
|
1
|
0
|
67
|
1
|
3
|
2
|
4
|
1
|
8109
|
1
|
0
|
67
|
1
|
4
|
2
|
4
|
2
|
8201
|
1
|
0
|
67
|
1
|
5
|
2
|
4
|
3
|
8204
|
1
|
0
|
names(nls_term)
#> [1] "id" "transnum" "termnum" "courses" "termtype" "season" "sortdate"
#> [8] "gradcode" "transfer"
glimpse(nls_term)
#> Rows: 120,885
#> Columns: 9
#> $ id <dbl> 67, 67, 67, 67, 67, 67, 67, 67, 315, 315, 315, 315, 315, 315,…
#> $ transnum <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
#> $ termnum <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12…
#> $ courses <dbl> 3, 2, 2, 2, 2, 2, 2, 2, 4, 3, 5, 4, 4, 4, 4, 5, 1, 4, 5, 5, 5…
#> $ termtype <dbl+lbl> 4, 4, 4, 4, 4, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, …
#> $ season <dbl+lbl> 2, 3, 1, 2, 3, 4, 1, 2, 1, 2, 3, 2, 3, 1, 2, 3, 1, 2, 4, …
#> $ sortdate <dbl> 8101, 8104, 8109, 8201, 8204, 8207, 8209, 8301, 7209, 7301, 7…
#> $ gradcode <dbl+lbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
#> $ transfer <dbl+lbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
nls_term %>% var_label()
#> $id
#> [1] "unique student identification variable"
#>
#> $transnum
#> [1] "TRANSCRIPT NUMBER"
#>
#> $termnum
#> [1] "TERM NUMBER"
#>
#> $courses
#> [1] "NUMBER OF COURSES TERM"
#>
#> $termtype
#> [1] "TYPE OF TERM"
#>
#> $season
#> [1] "SEASON OF TERM"
#>
#> $sortdate
#> [1] "DATE OF TERM"
#>
#> $gradcode
#> [1] "GRADE SCALE TYPE"
#>
#> $transfer
#> [1] "TRANSFER COURSES FLAG"
nls_term %>% val_labels()
#> $id
#> NULL
#>
#> $transnum
#> NULL
#>
#> $termnum
#> NULL
#>
#> $courses
#> NULL
#>
#> $termtype
#> 1. VARIABLE LENGTH OR NONCOURSE TERM 2. semester
#> 1 2
#> 3. trimester 4. quarter
#> 3 4
#> 5. CREDIT BY EXAMINATION 7. {UNKNOWN}
#> 5 7
#>
#> $season
#> 1. fall 2. winter 3. spring 4. summer 9. {MISSING}
#> 1 2 3 4 9
#>
#> $sortdate
#> NULL
#>
#> $gradcode
#> 1. LETTER GRADES 2. NUMERIC GRADES
#> 1 2
#>
#> $transfer
#> 0. NOT TRANSFER 1. TRANSFER TERM
#> 0 1