Lecture overview
The College Scorecard is a tool created by the US Department of Education that seeks to help prospective students investigate/compare postsecondary institutions and degree programs
Recently, the US Department of Education released new College Scorecard data on debt and earnings associated with postsecondary degree programs
Some values are reported as 'PrivacySuppressed' because the number of graduates is small enough that there are concerns about being able to identify individual students.
In the following sub-sections, we “load” the data, create modified datasets, investigate the data, and run some basic descriptive statistics.
Load data frame (i.e., dataset)
load(file = url('https://github.com/anyone-can-cook/educ152/raw/main/data/college_scorecard/output_data/df_debt_earn_panel_labelled.RData'))
Create subset data frame:
# Build the lecture data frame from the full labelled panel: one filter step,
# one column-dropping step, one step that derives/shortens variables, and a
# final re-ordering of columns
df_scorecard <- df_debt_earn_panel_labelled %>%
  filter(
    # keep most recent year of data
    field_ay == '2017-18',
    # keep master's degrees
    credlev == 5,
    # carnegie categories to keep: 15 = Doctoral Universities: Very High Research Activity; 16 = Doctoral Universities: High Research Activity
    ccbasic %in% c(15, 16)
  ) %>%
  # drop "parent plus" loan variables and other vars we won't use in this lecture
  select(
    -contains('_pp'), -contains('_any'),
    -field_ay, -st_fips, -zip, -longitude, -latitude, -locale2, -highdeg,
    -accredagency, -relaffil, -hbcu, -annhi, -tribal, -aanapii, -hsi, -nanti,
    -main, -numbranch
  ) %>%
  mutate(
    # create variable for broad field of degree (e.g., education, business)
    cipdig2 = str_sub(string = cipcode, start = 1, end = 2),
    # shorten variable cipdesc to make it more suitable for printing
    cipdesc = str_sub(string = cipdesc, start = 1, end = 50)
  ) %>%
  # re-order variables
  relocate(opeid6, unitid, instnm, control, ccbasic, stabbr, city, cipdig2)
# "glimpse" data frame
df_scorecard %>% glimpse()
#> Rows: 15,336
#> Columns: 24
#> $ opeid6 <chr> "001009", "001009", "001009", "001009...
#> $ unitid <dbl> 100858, 100858, 100858, 100858, 10085...
#> $ instnm <chr> "Auburn University", "Auburn Universi...
#> $ control <chr+lbl> Public, Public, Public, Public, P...
#> $ ccbasic <dbl+lbl> 15, 15, 15, 15, 15, 15, 15, 15, 1...
#> $ stabbr <chr> "AL", "AL", "AL", "AL", "AL", "AL", "...
#> $ city <chr> "Auburn", "Auburn", "Auburn", "Auburn...
#> $ cipdig2 <chr> "01", "01", "01", "01", "01", "03", "...
#> $ cipcode <chr> "0101", "0103", "0109", "0111", "0181...
#> $ cipdesc <chr> "Agricultural Business and Management...
#> $ credlev <dbl+lbl> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ...
#> $ creddesc <chr> "Master's Degree", "Master's Degree",...
#> $ ipedscount1 <chr> "7", "15", "17", "24", "NULL", "7", "...
#> $ ipedscount2 <chr> "3", "14", "11", "38", "NULL", "4", "...
#> $ debt_all_stgp_eval_n <chr> "PrivacySuppressed", "PrivacySuppress...
#> $ debt_all_stgp_eval_mean <chr> "PrivacySuppressed", "PrivacySuppress...
#> $ debt_all_stgp_eval_mdn <chr> "PrivacySuppressed", "PrivacySuppress...
#> $ debt_all_stgp_eval_mdn10yrpay <chr> "PrivacySuppressed", "PrivacySuppress...
#> $ earn_count_wne_hi_1yr <chr> "PrivacySuppressed", "15", "12", "11"...
#> $ earn_mdn_hi_1yr <chr> "PrivacySuppressed", "46478", "43426"...
#> $ earn_count_wne_hi_2yr <chr> "PrivacySuppressed", "13", "PrivacySu...
#> $ earn_mdn_hi_2yr <chr> "PrivacySuppressed", "44942", "Privac...
#> $ region <dbl+lbl> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ...
#> $ locale <dbl+lbl> 13, 13, 13, 13, 13, 13, 13, 13, 1...
# For debt and earnings variables, convert from character to numeric variables.
# "PrivacySuppressed" is recoded to NA explicitly before the conversion, so
# as.numeric() only warns ("NAs introduced by coercion") if some other,
# unexpected non-numeric value is present.
# NOTE: as.numeric() drops attributes, so the variable labels of these columns
# are lost by this step.
df_scorecard <- df_scorecard %>%
  mutate(
    across(
      all_of(c(
        'debt_all_stgp_eval_n',
        'debt_all_stgp_eval_mean',
        'debt_all_stgp_eval_mdn',
        'debt_all_stgp_eval_mdn10yrpay',
        'earn_count_wne_hi_1yr',
        'earn_mdn_hi_1yr',
        'earn_count_wne_hi_2yr',
        'earn_mdn_hi_2yr'
      )),
      function(x) as.numeric(na_if(x, 'PrivacySuppressed'))
    )
  )
# add variable label to variable cipdig2
attr(df_scorecard[['cipdig2']], which = 'label') <- 'broad degree field code = 2-digit classification of instructional programs (CIP) degree code'
# add variable label attribute back to debt and earnings variables (and to
# cipdesc) by copying each label from the original labelled data frame
for (v in c(
  'debt_all_stgp_eval_n',
  'debt_all_stgp_eval_mean',
  'debt_all_stgp_eval_mdn',
  'debt_all_stgp_eval_mdn10yrpay',
  'earn_count_wne_hi_1yr',
  'earn_mdn_hi_1yr',
  'earn_count_wne_hi_2yr',
  'earn_mdn_hi_2yr',
  'cipdesc'
)) {
  attr(df_scorecard[[v]], which = 'label') <-
    attr(df_debt_earn_panel_labelled[[v]], which = 'label')
}
df_scorecard %>% glimpse()
#> Rows: 15,336
#> Columns: 24
#> $ opeid6 <chr> "001009", "001009", "001009", "001009...
#> $ unitid <dbl> 100858, 100858, 100858, 100858, 10085...
#> $ instnm <chr> "Auburn University", "Auburn Universi...
#> $ control <chr+lbl> Public, Public, Public, Public, P...
#> $ ccbasic <dbl+lbl> 15, 15, 15, 15, 15, 15, 15, 15, 1...
#> $ stabbr <chr> "AL", "AL", "AL", "AL", "AL", "AL", "...
#> $ city <chr> "Auburn", "Auburn", "Auburn", "Auburn...
#> $ cipdig2 <chr> "01", "01", "01", "01", "01", "03", "...
#> $ cipcode <chr> "0101", "0103", "0109", "0111", "0181...
#> $ cipdesc <chr> "Agricultural Business and Management...
#> $ credlev <dbl+lbl> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ...
#> $ creddesc <chr> "Master's Degree", "Master's Degree",...
#> $ ipedscount1 <chr> "7", "15", "17", "24", "NULL", "7", "...
#> $ ipedscount2 <chr> "3", "14", "11", "38", "NULL", "4", "...
#> $ debt_all_stgp_eval_n <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, 1...
#> $ debt_all_stgp_eval_mean <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, 6...
#> $ debt_all_stgp_eval_mdn <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
#> $ debt_all_stgp_eval_mdn10yrpay <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
#> $ earn_count_wne_hi_1yr <dbl> NA, 15, 12, 11, NA, NA, NA, NA, NA, 2...
#> $ earn_mdn_hi_1yr <dbl> NA, 46478, 43426, 43314, NA, NA, NA, ...
#> $ earn_count_wne_hi_2yr <dbl> NA, 13, NA, NA, NA, NA, NA, 11, NA, 2...
#> $ earn_mdn_hi_2yr <dbl> NA, 44942, NA, NA, NA, NA, NA, 42712,...
#> $ region <dbl+lbl> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ...
#> $ locale <dbl+lbl> 13, 13, 13, 13, 13, 13, 13, 13, 1...
Investigate dataset, show variable labels
df_scorecard %>% var_label()
#> $opeid6
#> [1] "6-digit OPE ID for institution"
#>
#> $unitid
#> [1] "Unit ID for institution"
#>
#> $instnm
#> [1] "Institution name"
#>
#> $control
#> [1] "Control of institution"
#>
#> $ccbasic
#> [1] "Carnegie Classification -- basic"
#>
#> $stabbr
#> [1] "State postcode"
#>
#> $city
#> [1] "City"
#>
#> $cipdig2
#> [1] "broad degree field code = 2-digit classification of instructional programs (CIP) degree code"
#>
#> $cipcode
#> [1] "Classification of Instructional Programs (CIP) code for the field of study"
#>
#> $cipdesc
#> [1] "Text description of the field of study CIP Code"
#>
#> $credlev
#> [1] "Level of credential"
#>
#> $creddesc
#> [1] "Text description of the level of credential"
#>
#> $ipedscount1
#> [1] "Number of awards to all students in year 1 of the pooled debt cohort"
#>
#> $ipedscount2
#> [1] "Number of awards to all students in year 2 of the pooled debt cohort"
#>
#> $debt_all_stgp_eval_n
#> [1] "Borrower count for average/median Stafford and Grad PLUS loan debt disbursed at this institution"
#>
#> $debt_all_stgp_eval_mean
#> [1] "Average Stafford and Grad PLUS loan debt disbursed at this institution"
#>
#> $debt_all_stgp_eval_mdn
#> [1] "Median Stafford and Grad PLUS loan debt disbursed at this institution"
#>
#> $debt_all_stgp_eval_mdn10yrpay
#> [1] "Median estimated monthly payment for Stafford and Grad PLUS loan debt disbursed at this institution"
#>
#> $earn_count_wne_hi_1yr
#> [1] "Number of graduates working and not enrolled 1 year after completing highest credential"
#>
#> $earn_mdn_hi_1yr
#> [1] "Median earnings of graduates working and not enrolled 1 year after completing highest credential"
#>
#> $earn_count_wne_hi_2yr
#> [1] "Number of graduates working and not enrolled 2 years after completing highest credential"
#>
#> $earn_mdn_hi_2yr
#> [1] "Median earnings of graduates working and not enrolled 2 years after completing highest credential"
#>
#> $region
#> [1] "Region (IPEDS)"
#>
#> $locale
#> [1] "Locale of institution"
Investigate dataset, show “value labels” for variables that have value labels
Investigate debt and earnings from a few fancy universities
UC-Berkeley: unitid == 110635; opeid6 == 001312
UCLA: unitid == 110662; opeid6 == 001315
USC: unitid == 123961; opeid6 == 001328
Stanford: unitid == 243744; opeid6 == 001305
Columbia: unitid == 190150; opeid6 == 002707
Columbia, Teacher's College: unitid == 196468; opeid6 == 003979
NYU: unitid == 193900; opeid6 == 002785
Harvard: unitid == 166027; opeid6 == 002155
Examine debt and earnings from master’s programs at some fancy universities
# Debt and earnings for master's programs at a handful of selective universities
df_scorecard %>%
  # opeid6: UC-Berkeley = 001312; UCLA = 001315; USC = 001328; Stanford = 001305; Columbia = 002707; Columbia, Teacher's College = 003979; NYU = 002785; Harvard = 002155
  filter(opeid6 %in% c('001312','001315','001328','001305','002707','003979','002785','002155')) %>%
  # keep observations where debt_all_stgp_eval_n is not missing (NA)
  filter(!is.na(debt_all_stgp_eval_n)) %>%
  select(instnm, cipdig2, cipcode, cipdesc, debt_all_stgp_eval_n, debt_all_stgp_eval_mean, earn_mdn_hi_2yr)
#> # A tibble: 220 x 7
#> instnm cipdig2 cipcode cipdesc debt_all_stgp_e~ debt_all_stgp_e~
#> <chr> <chr> <chr> <chr> <dbl> <dbl>
#> 1 Stanf~ 05 0501 Area S~ 14 41792
#> 2 Stanf~ 09 0901 Commun~ 11 45456
#> 3 Stanf~ 11 1107 Comput~ 22 43580
#> 4 Stanf~ 13 1301 Educat~ 171 35264
#> 5 Stanf~ 14 1402 Aerosp~ 13 63721
#> 6 Stanf~ 14 1408 Civil ~ 51 47367
#> 7 Stanf~ 14 1410 Electr~ 25 50608
#> 8 Stanf~ 14 1419 Mechan~ 20 52503
#> 9 Stanf~ 15 1515 Engine~ 20 56004
#> 10 Stanf~ 51 5122 Public~ 11 60488
#> # ... with 210 more rows, and 1 more variable: earn_mdn_hi_2yr <dbl>
#%>% print(n=200)
Examine debt and earnings from master’s programs in education at some fancy universities
# Debt and earnings for master's programs in education at the same universities
df_scorecard %>%
  # opeid6: UC-Berkeley = 001312; UCLA = 001315; USC = 001328; Stanford = 001305; Columbia = 002707; Columbia, Teacher's College = 003979; NYU = 002785; Harvard = 002155
  filter(opeid6 %in% c('001312','001315','001328','001305','002707','003979','002785','002155')) %>%
  # keep observations where debt_all_stgp_eval_n is not missing (NA)
  filter(!is.na(debt_all_stgp_eval_n)) %>%
  # keep degree programs in education
  filter(cipdig2 == '13') %>%
  select(instnm, cipdig2, cipcode, cipdesc, debt_all_stgp_eval_n, debt_all_stgp_eval_mean, earn_mdn_hi_2yr)
#> # A tibble: 26 x 7
#> instnm cipdig2 cipcode cipdesc debt_all_stgp_e~ debt_all_stgp_e~
#> <chr> <chr> <chr> <chr> <dbl> <dbl>
#> 1 Stanf~ 13 1301 "Educa~ 171 35264
#> 2 Unive~ 13 1301 "Educa~ 221 32803
#> 3 Unive~ 13 1304 "Educa~ 59 67159
#> 4 Unive~ 13 1306 "Educa~ 17 61611
#> 5 Unive~ 13 1311 "Stude~ 48 84293
#> 6 Unive~ 13 1313 "Teach~ 273 66099
#> 7 Unive~ 13 1314 "Teach~ 71 61120
#> 8 Harva~ 13 1301 "Educa~ 641 35492
#> 9 New Y~ 13 1304 "Educa~ 17 73952
#> 10 New Y~ 13 1305 "Educa~ 17 86476
#> # ... with 16 more rows, and 1 more variable: earn_mdn_hi_2yr <dbl>
Create a new data frame that contains data about Education MA programs in which debt variables are not suppressed
# Education master's programs whose debt data are not privacy-suppressed
df_scorecard_edu <- df_scorecard %>%
  # filter: degree programs in education; and debt_all_stgp_eval_n is not missing (NA)
  filter(cipdig2 == '13', !is.na(debt_all_stgp_eval_n))
Investigate new data frame df_scorecard_edu
#df_scorecard_edu %>% glimpse()
# investigate data structure
# one observation per opeid6-cipcode: count rows for each opeid6-cipcode key,
# then tabulate how many keys have each row count
df_scorecard_edu %>%
  count(opeid6, cipcode, name = 'n_per_key') %>%
  count(n_per_key)
#> # A tibble: 1 x 2
#> n_per_key n
#> <int> <int>
#> 1 1 773
#df_scorecard_edu %>% group_by(unitid,cipcode) %>% summarise(n_per_key=n()) %>% ungroup() %>% count(n_per_key)
# name of institutions: keep the first row within each instnm group, then show only the instnm column
df_scorecard_edu %>% group_by(instnm) %>% slice(1) %>% ungroup() %>% select(instnm)
#> # A tibble: 216 x 1
#> instnm
#> <chr>
#> 1 American University
#> 2 Arizona State University-Tempe
#> 3 Arkansas State University-Main Campus
#> 4 Auburn University
#> 5 Azusa Pacific University
#> 6 Ball State University
#> 7 Baylor University
#> 8 Binghamton University
#> 9 Boise State University
#> 10 Boston College
#> # ... with 206 more rows
# control (public, private): keep one row per institution (opeid6), then count
# institutions by control and show value labels instead of underlying values
df_scorecard_edu %>%
  distinct(opeid6, .keep_all = TRUE) %>%
  count(control) %>%
  as_factor()
#> # A tibble: 2 x 2
#> control n
#> <fct> <int>
#> 1 Private, nonprofit 51
#> 2 Public 165
# carnegie classification: keep one row per institution (opeid6), then count
# institutions by ccbasic and show value labels instead of underlying values
df_scorecard_edu %>%
  distinct(opeid6, .keep_all = TRUE) %>%
  count(ccbasic) %>%
  as_factor()
#> # A tibble: 2 x 2
#> ccbasic n
#> <fct> <int>
#> 1 Doctoral Universities: Very High Research Activity 108
#> 2 Doctoral Universities: High Research Activity 108
# level of urbanization: keep one row per institution (opeid6), then count
# institutions by locale and show value labels instead of underlying values
df_scorecard_edu %>%
  distinct(opeid6, .keep_all = TRUE) %>%
  count(locale) %>%
  as_factor()
#> # A tibble: 9 x 2
#> locale n
#> <fct> <int>
#> 1 City: Large (population of 250,000 or more) 76
#> 2 City: Midsize (population of at least 100,000 but less than 250,000) 40
#> 3 City: Small (population less than 100,000) 39
#> 4 Suburb: Large (outside principal city, in urbanized area with populatio~ 29
#> 5 Suburb: Midsize (outside principal city, in urbanized area with populat~ 6
#> 6 Suburb: Small (outside principal city, in urbanized area with populatio~ 7
#> 7 Town: Fringe (in urban cluster up to 10 miles from an urbanized area) 3
#> 8 Town: Distant (in urban cluster more than 10 miles and up to 35 miles f~ 9
#> 9 Town: Remote (in urban cluster more than 35 miles from an urbanized are~ 7
# which education degrees
# NOTE: slice(1) keeps a single row per cipdesc before counting, so n is 1 for
# every degree field; this only lists the distinct fields. Calling
# count(cipdesc) directly on df_scorecard_edu would instead give the number of
# programs in each field.
df_scorecard_edu %>% group_by(cipdesc) %>% slice(1) %>% ungroup %>% count(cipdesc)
#> # A tibble: 13 x 2
#> cipdesc n
#> <chr> <int>
#> 1 "Bilingual, Multilingual, and Multicultural Educati" 1
#> 2 "Curriculum and Instruction." 1
#> 3 "Education, General." 1
#> 4 "Education, Other." 1
#> 5 "Educational Administration and Supervision." 1
#> 6 "Educational Assessment, Evaluation, and Research." 1
#> 7 "Educational/Instructional Media Design." 1
#> 8 "International and Comparative Education." 1
#> 9 "Social and Philosophical Foundations of Education." 1
#> 10 "Special Education and Teaching." 1
#> 11 "Student Counseling and Personnel Services." 1
#> 12 "Teacher Education and Professional Development, Sp" 1
#> 13 "Teaching English or French as a Second or Foreign " 1
Investigate data frame df_scorecard_edu, debt and earnings variables
# mean of mean debt from Stafford and Grad PLUS loans
df_scorecard_edu %>% summarize(
mean_debt = mean(debt_all_stgp_eval_mean, na.rm = TRUE)
)
#> # A tibble: 1 x 1
#> mean_debt
#> <dbl>
#> 1 35415.
# separate for public vs. private
df_scorecard_edu %>% group_by(control) %>% summarize(
mean_debt = mean(debt_all_stgp_eval_mean, na.rm = TRUE)
)
#> # A tibble: 2 x 2
#> control mean_debt
#> <chr+lbl> <dbl>
#> 1 Private, nonprofit 47341.
#> 2 Public 32447.
# mean of median earnings, 2 years after graduation
df_scorecard_edu %>% summarize(
mean_earn = mean(earn_mdn_hi_2yr, na.rm = TRUE)
)
#> # A tibble: 1 x 1
#> mean_earn
#> <dbl>
#> 1 48194.
# separate for public vs. private
df_scorecard_edu %>% group_by(control) %>% summarize(
mean_earn = mean(earn_mdn_hi_2yr, na.rm = TRUE)
)
#> # A tibble: 2 x 2
#> control mean_earn
#> <chr+lbl> <dbl>
#> 1 Private, nonprofit 52645.
#> 2 Public 47104.
In general, the goal of statistical inference is to infer something about a population based on a sample from that population
Population Parameter: a measure of the population
Sample: a part (i.e., a subset) of the population
Estimator (or Statistic): a formula or procedure used to estimate the value of the population parameter using a sample of the population
Point Estimate: numeric value generated from calculating an estimator from a specific sample of data
Introductory statistics class
Multivariate regression class
Population Parameters
Estimators of Population Parameters (two general approaches)
In statistics, we are usually working with a dataset (called a ‘data frame’ in R) that contains variables (columns) and observations (rows)
df_scorecard_edu %>% glimpse() # a dataset
#> Rows: 773
#> Columns: 24
#> $ opeid6 <chr> "001009", "001009", "001009", "001009...
#> $ unitid <dbl> 100858, 100858, 100858, 100858, 10085...
#> $ instnm <chr> "Auburn University", "Auburn Universi...
#> $ control <chr+lbl> Public, Public, Public, Public, P...
#> $ ccbasic <dbl+lbl> 15, 15, 15, 15, 15, 15, 15, 15, 1...
#> $ stabbr <chr> "AL", "AL", "AL", "AL", "AL", "AL", "...
#> $ city <chr> "Auburn", "Auburn", "Auburn", "Auburn...
#> $ cipdig2 <chr> "13", "13", "13", "13", "13", "13", "...
#> $ cipcode <chr> "1304", "1310", "1311", "1312", "1313...
#> $ cipdesc <chr> "Educational Administration and Super...
#> $ credlev <dbl+lbl> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ...
#> $ creddesc <chr> "Master's Degree", "Master's Degree",...
#> $ ipedscount1 <chr> "53", "34", "27", "59", "61", "36", "...
#> $ ipedscount2 <chr> "30", "39", "36", "64", "56", "22", "...
#> $ debt_all_stgp_eval_n <dbl> 34, 38, 32, 52, 43, 45, 20, 40, 25, 2...
#> $ debt_all_stgp_eval_mean <dbl> 29768, 43336, 41572, 31462, 28726, 35...
#> $ debt_all_stgp_eval_mdn <dbl> 30942, 41000, 43500, 30000, 20500, 32...
#> $ debt_all_stgp_eval_mdn10yrpay <dbl> 318, 421, 447, 308, 210, 335, 295, 21...
#> $ earn_count_wne_hi_1yr <dbl> 46, 34, 26, 56, 82, 51, 32, 52, 21, 3...
#> $ earn_mdn_hi_1yr <dbl> 46321, 42905, 37568, 44495, 49179, 52...
#> $ earn_count_wne_hi_2yr <dbl> 36, 24, 30, 40, 83, 40, 34, 36, 20, 4...
#> $ earn_mdn_hi_2yr <dbl> 41844, 44021, 34603, 48041, 45249, 53...
#> $ region <dbl+lbl> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ...
#> $ locale <dbl+lbl> 13, 13, 13, 13, 13, 12, 12, 12, 1...
In general, a variable is something that varies
# the variable 'cipdesc' from the dataset df_scorecard_edu
df_scorecard_edu %>% select(cipdesc)
#> # A tibble: 773 x 1
#> cipdesc
#> <chr>
#> 1 Educational Administration and Supervision.
#> 2 Special Education and Teaching.
#> 3 Student Counseling and Personnel Services.
#> 4 Teacher Education and Professional Development, Sp
#> 5 Teacher Education and Professional Development, Sp
#> 6 Educational Administration and Supervision.
#> 7 Special Education and Teaching.
#> 8 Teacher Education and Professional Development, Sp
#> 9 Teacher Education and Professional Development, Sp
#> 10 Educational Administration and Supervision.
#> # ... with 763 more rows
# a different approach to showing the same variable
df_scorecard_edu$cipdesc[1:10]
#> [1] "Educational Administration and Supervision."
#> [2] "Special Education and Teaching."
#> [3] "Student Counseling and Personnel Services."
#> [4] "Teacher Education and Professional Development, Sp"
#> [5] "Teacher Education and Professional Development, Sp"
#> [6] "Educational Administration and Supervision."
#> [7] "Special Education and Teaching."
#> [8] "Teacher Education and Professional Development, Sp"
#> [9] "Teacher Education and Professional Development, Sp"
#> [10] "Educational Administration and Supervision."
Continuous Variables
Variable ipedscount2 = number of degrees awarded in most recent academic year reported
df_scorecard_edu %>% select(ipedscount2) %>% var_label() # variable label
#> $ipedscount2
#> [1] "Number of awards to all students in year 2 of the pooled debt cohort"
df_scorecard_edu %>% select(instnm,opeid6,cipdesc,ipedscount2)
#> # A tibble: 773 x 4
#> instnm opeid6 cipdesc ipedscount2
#> <chr> <chr> <chr> <chr>
#> 1 Auburn University 001009 Educational Administration and S~ 30
#> 2 Auburn University 001009 Special Education and Teaching. 39
#> 3 Auburn University 001009 Student Counseling and Personnel~ 36
#> 4 Auburn University 001009 Teacher Education and Profession~ 64
#> 5 Auburn University 001009 Teacher Education and Profession~ 56
#> 6 The University of Alaba~ 001051 Educational Administration and S~ 22
#> 7 The University of Alaba~ 001051 Special Education and Teaching. 28
#> 8 The University of Alaba~ 001051 Teacher Education and Profession~ 65
#> 9 The University of Alaba~ 001051 Teacher Education and Profession~ 38
#> 10 University of Alabama a~ 001052 Educational Administration and S~ 30
#> # ... with 763 more rows
# showing just the first ten values of variable ipedscount2
df_scorecard_edu$ipedscount2[1:10]
#> [1] "30" "39" "36" "64" "56" "22" "28" "65" "38" "30"
Discrete Variables (or categorical variables)
(e.g., 1=very satisfied, 2=satisfied)
Variable region = geographic region (IPEDS definition) the university is located in
# the <dbl+lbl> type shown by glimpse() indicates this variable has "value labels"
df_scorecard_edu %>% select(region) %>% glimpse()
#> Rows: 773
#> Columns: 1
#> $ region <dbl+lbl> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,...
df_scorecard_edu$region %>% class()
#> [1] "haven_labelled"
df_scorecard_edu %>% select(region) %>% var_label() # variable label
#> $region
#> [1] "Region (IPEDS)"
df_scorecard_edu %>% select(region) %>% val_labels() # label assigned to variable values
#> $region
#> U.S. Service Schools
#> 0
#> New England (CT, ME, MA, NH, RI, VT)
#> 1
#> Mid East (DE, DC, MD, NJ, NY, PA)
#> 2
#> Great Lakes (IL, IN, MI, OH, WI)
#> 3
#> Plains (IA, KS, MN, MO, NE, ND, SD)
#> 4
#> Southeast (AL, AR, FL, GA, KY, LA, MS, NC, SC, TN, VA, WV)
#> 5
#> Southwest (AZ, NM, OK, TX)
#> 6
#> Rocky Mountains (CO, ID, MT, UT, WY)
#> 7
#> Far West (AK, CA, HI, NV, OR, WA)
#> 8
#> Outlying Areas (AS, FM, GU, MH, MP, PR, PW, VI)
#> 9
# print a few observations
df_scorecard_edu %>% select(instnm,opeid6,cipdesc,region)
#> # A tibble: 773 x 4
#> instnm opeid6 cipdesc region
#> <chr> <chr> <chr> <dbl+lbl>
#> 1 Auburn University 001009 Educational Administrat~ 5 [Southeast (AL, AR, FL, ~
#> 2 Auburn University 001009 Special Education and T~ 5 [Southeast (AL, AR, FL, ~
#> 3 Auburn University 001009 Student Counseling and ~ 5 [Southeast (AL, AR, FL, ~
#> 4 Auburn University 001009 Teacher Education and P~ 5 [Southeast (AL, AR, FL, ~
#> 5 Auburn University 001009 Teacher Education and P~ 5 [Southeast (AL, AR, FL, ~
#> 6 The University o~ 001051 Educational Administrat~ 5 [Southeast (AL, AR, FL, ~
#> 7 The University o~ 001051 Special Education and T~ 5 [Southeast (AL, AR, FL, ~
#> 8 The University o~ 001051 Teacher Education and P~ 5 [Southeast (AL, AR, FL, ~
#> 9 The University o~ 001051 Teacher Education and P~ 5 [Southeast (AL, AR, FL, ~
#> 10 University of Al~ 001052 Educational Administrat~ 5 [Southeast (AL, AR, FL, ~
#> # ... with 763 more rows
# frequency count of the variable
df_scorecard_edu %>% count(region)
#> # A tibble: 8 x 2
#> region n
#> <dbl+lbl> <int>
#> 1 1 [New England (CT, ME, MA, NH, RI, VT)] 36
#> 2 2 [Mid East (DE, DC, MD, NJ, NY, PA)] 117
#> 3 3 [Great Lakes (IL, IN, MI, OH, WI)] 138
#> 4 4 [Plains (IA, KS, MN, MO, NE, ND, SD)] 50
#> 5 5 [Southeast (AL, AR, FL, GA, KY, LA, MS, NC, SC, TN, VA, WV)] 232
#> 6 6 [Southwest (AZ, NM, OK, TX)] 93
#> 7 7 [Rocky Mountains (CO, ID, MT, UT, WY)] 37
#> 8 8 [Far West (AK, CA, HI, NV, OR, WA)] 70
# frequency count of the variable, but show value labels rather than underlying variable value
df_scorecard_edu %>% count(region) %>% as_factor()
#> # A tibble: 8 x 2
#> region n
#> <fct> <int>
#> 1 New England (CT, ME, MA, NH, RI, VT) 36
#> 2 Mid East (DE, DC, MD, NJ, NY, PA) 117
#> 3 Great Lakes (IL, IN, MI, OH, WI) 138
#> 4 Plains (IA, KS, MN, MO, NE, ND, SD) 50
#> 5 Southeast (AL, AR, FL, GA, KY, LA, MS, NC, SC, TN, VA, WV) 232
#> 6 Southwest (AZ, NM, OK, TX) 93
#> 7 Rocky Mountains (CO, ID, MT, UT, WY) 37
#> 8 Far West (AK, CA, HI, NV, OR, WA) 70
Several types of “variation” exist
Cross sectional data: data on different “observations” (e.g., students, classrooms, universities) for a single point in time [focus of this course]
Time-Series Data: data on a single “observation” collected at multiple time points
Longitudinal Data (or panel data): data on multiple “observations” at multiple time points
10 X 10 = 100 observations
Experimental Data: obtained from experiments designed to assess the causal effect of a “treatment” on an outcome
Observational Data: obtained from surveys, administrative records [focus of this course]
Descriptive statistics describe data
Sample mean of Y, denoted as \(\bar{Y}\): \(\bar{Y} = \frac{\sum_{i=1}^N Y_i}{N}\)
Example: Variable Y has the following six observations (\(Y_1, \ldots, Y_6\)): 5, 2, 13, 11, 18, 22
Calculate sample mean in R
# six example observations
x <- c(5, 2, 13, 11, 18, 22)
length(x)
#> [1] 6
mean(x)
#> [1] 11.83333
# "by hand": divide the sum of the values by the number of observations
sum(x) # sum of x
#> [1] 71
length(x) # number of observations
#> [1] 6
sum(x) / length(x)
#> [1] 11.83333
#> [1] 11.83333
Calculate mean value of (mean) debt from Stafford and Grad PLUS loans, using first 10 observations
df_scorecard_edu %>% select(debt_all_stgp_eval_mean) %>% var_label()
#> $debt_all_stgp_eval_mean
#> [1] "Average Stafford and Grad PLUS loan debt disbursed at this institution"
df_scorecard_edu$debt_all_stgp_eval_mean[1:10]
#> [1] 29768 43336 41572 31462 28726 35316 28032 32904 39299 25015
debt_i <- df_scorecard_edu$debt_all_stgp_eval_mean[1:10]
debt_i
#> [1] 29768 43336 41572 31462 28726 35316 28032 32904 39299 25015
# calculate mean
mean(debt_i)
#> [1] 33543
# calculate by hand
sum(debt_i) # sum of values
#> [1] 335430
length(debt_i) # number of observations
#> [1] 10
sum(debt_i)/length(debt_i)
#> [1] 33543
Other measures of central tendency:
Sample standard deviation of Y, denoted as \(\hat\sigma_Y\)
Standard deviation is, on average, how far away a random observation, \(Y_i\), is from the sample mean, \(\bar{Y}\)
\(\hat\sigma_Y = \sqrt{\frac{\sum_{i=1}^N (Y_i - \overline{Y})^2}{N-1}}\)
Example: Variable Y has the following six observations (\(Y_1, \ldots, Y_6\)): 5, 2, 13, 11, 18, 22
Calculate standard deviation for sample data
x
#> [1] 5 2 13 11 18 22
sd(x) # on average, each value of x is roughly 7.6 away from the mean
#> [1] 7.574079
# by hand
mean(x)
#> [1] 11.83333
x
#> [1] 5 2 13 11 18 22
(x-mean(x))^2 # squared deviations
#> [1] 46.6944444 96.6944444 1.3611111 0.6944444 38.0277778 103.3611111
sum((x-mean(x))^2) # sum squared deviation
#> [1] 286.8333
sqrt(sum((x-mean(x))^2)/(length(x)-1))
#> [1] 7.574079
Calculate standard deviation of debt (first ten obs)
debt_i
#> [1] 29768 43336 41572 31462 28726 35316 28032 32904 39299 25015
sd(debt_i) # on average, each value of debt is roughly 6164 away from the mean
#> [1] 6163.816
# by hand
(debt_i-mean(debt_i))^2 # squared deviations
#> [1] 14250625 95902849 64464841 4330561 23203489 3143529 30371121 408321
#> [9] 33131536 72726784
sum((debt_i-mean(debt_i))^2) # sum squared deviation
#> [1] 341933656
sqrt(sum((debt_i-mean(debt_i))^2)/(length(debt_i)-1)) # standard deviation
#> [1] 6163.816
Other measures of dispersion: