link <- "https://chung-jiwoong.github.io/FMB819-Slides/chapter_causality/data/toy_data_2.csv"
toy_data <- read.csv(link)
library(tidyverse)
toy_data <- toy_data %>%
mutate(Y = Y1 * group_dummy + (1 - group_dummy) * Y0,
delta = Y1 - Y0)Causality
Task 1: SDO, ATE and Randomization
π 1. λ°μ΄ν° λΆλ¬μ€κΈ° λ° λ³μ μμ±
λ°μ΄ν° λ€μ΄λ‘λ λ§ν¬.
read.csv()λ₯Ό μ¬μ©νμ¬ λ°μ΄ν°λ₯Ό λΆλ¬μ€κΈ°.λ³μ μ€λͺ
group_dummy: μ²μΉλ₯Ό λ°μλμ§ μ¬λΆ (1 = μ²μΉκ΅°,0 = ν΅μ κ΅°).
Y0: μ²μΉλ₯Ό λ°μ§ μμμ κ²½μ°μ μ μ¬μ κ²°κ³Ό \((Y_i^0)\).
Y1: μ²μΉλ₯Ό λ°μμ κ²½μ°μ μ μ¬μ κ²°κ³Ό \((Y_i^1)\).
λ€μ λ³μλ€μ μμ±νμμ€.
- κ΄μΈ‘λ κ²°κ³Ό \(Y_i = D_i \times Y_i^1 + (1 - D_i) \times Y_i^0\)
- κ°λ³ μ²μΉ ν¨κ³Ό \(\delta_i = Y_i^1 - Y_i^0\)
π 2. ATE λ° SDO κ³μ°
- ATE (Average Treatment Effect) κ³μ°νμμ€.
- SDO (Simple Difference in Mean Outcomes) κ³μ°νμμ€.
- Biasκ° μ‘΄μ¬νλκ°? ν¬κΈ°λ μΌλ§λ ν°κ°?
ATE = mean(toy_data$delta)
ATE## [1] 2.10418
treatment_mean <- mean(toy_data$Y[toy_data$group == "treatment"])
control_mean <- mean(toy_data$Y[toy_data$group == "control"])
SDO <- treatment_mean - control_meanSDCκ° ATEλ³΄λ€ μ½ 50%μ λ ν½λλ€.
π 3. 무μμ λ°°μ λ λ°μ΄ν°μμ SDO κ³μ°
μλ‘μ΄ λ°μ΄ν° λ€μ΄λ‘λ λ§ν¬. μ΄ λ°μ΄ν°μμλ λμΌν κ°μΈμ μμλ‘ λ¬΄μμ λ°°μ (random assignment)νμμ.
\(Y_i\) μ¬κ³μ° νμμ€. μλ‘μ΄ μ²μΉ λ°°μ μ λ§μΆ° λ€μ κ³μ°ν΄μΌ ν¨.
무μμ λ°°μ μμ SDO κ³μ°
- νΈν₯μ΄ κ±°μ 0μ κ°κΉμμΌ ν¨.
- νμ§λ§ μ νν 0μ΄ λμ§ μλ μ΄μ λ 무μμΌκΉ?
- νΈν₯μ΄ κ±°μ 0μ κ°κΉμμΌ ν¨.
link_rand <- "https://chung-jiwoong.github.io/FMB819-Slides/chapter_causality/data/toy_data_random.csv"
toy_data_random <- read.csv(link_rand)
toy_data_random <- toy_data_random %>%
mutate(Y = Y1 * group_random_dummy + (1 - group_random_dummy) * Y0)
SDO_random = mean(toy_data_random$Y[toy_data_random$group_random == "treatment"]) - mean(toy_data_random$Y[toy_data_random$group_random == "control"])
SDO_random## [1] 2.047158
ATE - SDO_random## [1] 0.05702187
νΈν₯μ 0.057μ κ°μ΅λλ€. μ΄λ 0μ΄ μλλλ€. κ·Έ μ΄μ λ νλ³Έ ν¬κΈ°κ° μΆ©λΆν ν¬μ§ μμ λ κ·Έλ£Ή κ° μ°¨μ΄μ μ‘΄μ¬νλ 무μμ λ³λμ±μ μμ ν μμν μ μκΈ° λλ¬Έμ λλ€.
π 4. (Optional) νΈν₯ μμ νμΈ
- μ ν νΈν₯(Selection Bias) κ³μ°νμμ€.
- μ΄μ§μ μ²μΉ ν¨κ³Ό νΈν₯(Heterogeneous Treatment Effect Bias) κ³μ°νμμ€.
- μλ μμ΄ μ±λ¦½νλμ§ νμΈ: \[ SDO = ATE + \text{Selection Bias} + \text{Heterogeneous Treatment Effect Bias} \]
selection_bias = mean(toy_data$Y0[toy_data$group == "treatment"]) - mean(toy_data$Y0[toy_data$group == "control"])
het_trt_effect_bias = (1 - sum(toy_data$group == "treatment") / nrow(toy_data)) * (mean(toy_data$delta[toy_data$group == "treatment"]) - mean(toy_data$delta[toy_data$group == "control"]))
SDO## [1] 3.208584
ATE + selection_bias + het_trt_effect_bias## [1] 3.208584
Task 2: STAR data
π 1. λ°μ΄ν° λΆλ¬μ€κΈ°: λ°μ΄ν° λ€μ΄λ‘λ λ§ν¬
- λ°μ΄ν°λ₯Ό star_df κ°μ²΄μ μ μ₯. λ³μ μ€λͺ
λμλ§ νμΈ.
(πλ°μ΄ν°κ° μ¬κ΅¬μ±(reshaped)λμ΄, λ³μλͺ
λμ βkβ, β1β λ±μ μ«μλ 무μ.)
star_df = read_csv("https://chung-jiwoong.github.io/FMB819-Slides/chapter_causality/data/star_data.csv")π 2. λ°μ΄ν°μ κΈ°λ³Έ μ 보 νμΈ
- κ΄μ°° λ¨μ(Unit of observation)λ 무μμΈκ°?
str(star_df)κ΄μΈ‘λ¨μλ νμ-νκΈ (student-grade).
- λλ€ νκΈ λ°°μ (random class assignment):
star, (ii) νμμ νλ (class grade):grade, (iii) κ²°κ³Ό λ³μ(outcomes of interest):read&math
- λλ€ νκΈ λ°°μ (random class assignment):
π 3. λ°μ΄ν° ν¬κΈ° λ° κ²°μΈ‘κ°(NA) λΆμ
- μ΄ κ΄μΈ‘μΉ μλ λͺ κ°μΈκ°? μλμλ νμλ³λ‘ κ΄μΈ‘λ¨μμλλ°, νλ
-νμ λ¨μλ‘ λ°μ΄ν°λ₯Ό μ¬κ΅¬μ‘°ννμκΈ° λλ¬Έμ NA κ°μ΄ λ§μ. λν
NAλ μ¬λ¬ κ°μ§ μ΄μ λ‘ μ€νμ λ λκ² λ νμλ€μ.
# μ΄ κ΄μΈ‘μΉ μ νμΈ
nrow(star_df)
# κ²°μΈ‘κ° κ°μ νμΈ
sum(is.na(star_df))
# κ²°μΈ‘κ°μ΄ ν¬ν¨λ λ³μ νμΈ
colSums(is.na(star_df))π 4. κ²°μΈ‘κ° μ²λ¦¬ (NA μ κ±°)
- λ€μ μ½λ μ€ννμ¬ κ²°μΈ‘κ°μ΄ μλ κ²½μ°λ§ μ μ§:
star_df <- star_df[complete.cases(star_df),] # λλ
star_df <- na.omit(star_df)π 5. 무μμ λ°°μ νμΈ (Balancing Checks)
- λλ€ λ°°μ μ΄ μ μ΄λ£¨μ΄μ‘λμ§ νμΈνκΈ° μν΄, κ·Έλ£Ήλ³ κΈ°μ΄ ν΅κ³λμ κ³μ°.
- λ€μ νλͺ©λ³ νκ· λΉμ¨(%)μ νλ λ³(grade) λ° μ²μΉκ΅°λ³(treatment class)λ‘ λΉκ΅: 1οΈβ£ μ¬νμ λΉμ¨ (percentage of girls), 2οΈβ£ μνλ¦¬μΉ΄κ³ λ―Έκ΅μΈ λΉμ¨ (percentage of African Americans), 3οΈβ£ λ¬΄λ£ κΈμ λμ λΉμ¨ (percentage of free lunch qualifiers)μ μ΄ν΄λ³΄μ.
(π ννΈ: λ€μ μ½λλ‘ μ¬νμ λΉμ¨ κ³μ° κ°λ₯ (dplyr νμ© νμ): share_female = mean(gender == "female") * 100.)
star_df %>%
group_by(grade, star) %>%
summarise(
share_female = mean(gender == "female") * 100,
share_african_american = mean(ethnicity == "afam") * 100,
share_free_lunch = mean(lunch == "free") * 100)## # A tibble: 12 Γ 5
## # Groups: grade [4]
## grade star share_female share_african_american share_free_lunch
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 1 regular 48.7 37.3 51.5
## 2 1 regular+aide 47.8 29.8 50.8
## 3 1 small 48.6 31.9 47.9
## 4 2 regular 48.3 36.7 50.6
## 5 2 regular+aide 47.7 33.8 48.2
## 6 2 small 49.1 33.3 46.6
## 7 3 regular 48.9 35.2 49.7
## 8 3 regular+aide 47.2 34.3 49.1
## 9 3 small 50.1 31.6 46.8
## 10 k regular 48.5 28.5 46.0
## 11 k regular+aide 48.8 32.2 49.3
## 12 k small 48.0 30.2 46.7
Task 3
π 1. μλ μ½λλ₯Ό μ€ννμ¬ 1νλ (grade == β1β)μ΄λ©°, μΌλ° νκΈ(regular) λλ μκ·λͺ¨ νκΈ(small)μ μν νμλ€λ§ μ ν.
star_df_1_small <- star_df %>%
filter(star %in% c("small","regular") & grade == "1")π 2. λ κ·Έλ£Ήμ νκ· μν μ μ λ° μ°¨μ΄ κ³μ° (Base R μ¬μ©)
mean_small = mean(star_df_1_small$math[star_df_1_small$star == "small"])
mean_small## [1] 539.0885
mean_regular = mean(star_df_1_small$math[star_df_1_small$star == "regular"])
mean_regular## [1] 526.4434
ATE = mean_small - mean_regular
ATE## [1] 12.64506
π 3. λλ―Έ λ³μ μμ±: μκ·λͺ¨ νκΈ(small) = 1 (TRUE), μΌλ° νκΈ(regular) = 0 (FALSE) ννΈ: treatment = (star == "small").
star_df_1_small <- star_df_1_small %>%
mutate(treatment = (star == "small"))
table(star_df_1_small$treatment)##
## FALSE TRUE
## 2359 1786
π 4. νκ· λΆμ μ€ν
β 5. κ²°κ³Ό ν΄μ: νκ· λΆμ κ²°κ³Όκ° 2λ² μ§λ¬Έμμ ꡬν νκ· μ°¨μ΄μ μΌμΉνλκ°?
μ νΈμ ν΅μ μ§λ¨(μ¦, μΌλ° ν¬κΈ°μ νκΈ)μ μν 1νλ νμλ€μ μμ μν μ μλ₯Ό μλ―Έν©λλ€. μ¦, ν΅μ μ§λ¨μ μν 1νλ νμλ€μ μμ μν μ μλ 526.44μ μ λλ€. μ΄λ μ§λ¬Έ 2μμ κ³μ°λ λμΌν νκ· κ³Ό κ³μλ₯Ό λΉκ΅νλ©΄ μ§μ νμΈν μ μμ΅λλ€.
κΈ°μΈκΈ° κ³μλ μ€ν μ§λ¨κ³Ό ν΅μ μ§λ¨μ μν 1νλ νμλ€μ μμ μν μ μ μ°¨μ΄λ₯Ό λνλ λλ€. μ¦, μκ·λͺ¨ νκΈμ μν 1νλ νμλ€μ μΌλ° ν¬κΈ°μ νκΈ νμλ€λ³΄λ€ νκ· μ μΌλ‘ 12.65μ λ λμ μ μλ₯Ό λ°μ κ²μΌλ‘ μμλ©λλ€. μ΄ κ³μ μμ μ§λ¬Έ 2μμ κ³μ°λ νκ· μ°¨μ΄μ λΉκ΅ν μ μμ΅λλ€.