Tidying, Visualising and Summarising Data - Tasks

Task 1

다음 코드를 실행하여 데이터를 로드합니다.

library(dslabs)
data(polls_us_election_2016)

library(tidyverse)
  1. grade 값이 결측치(NA)인 여론조사는?

문서가 너무 길어지는 것을 방지하기 위해 head를 사용하여 처음 6개 행만 표시합니다.

# Polls whose grade is missing; only the first six rows are printed.
polls_us_election_2016 %>%
  filter(is.na(grade)) %>%
  head(n = 6)
         state  startdate    enddate                 pollster grade samplesize
1   New Mexico 2016-11-06 2016-11-06                 Zia Poll  <NA>       8439
2         U.S. 2016-11-05 2016-11-07 The Times-Picayune/Lucid  <NA>       2521
3         U.S. 2016-11-01 2016-11-07    USC Dornsife/LA Times  <NA>       2972
4     Virginia 2016-11-01 2016-11-02                Remington  <NA>       3076
5    Wisconsin 2016-11-01 2016-11-02                Remington  <NA>       2720
6 Pennsylvania 2016-11-01 2016-11-02                Remington  <NA>       2683
  population rawpoll_clinton rawpoll_trump rawpoll_johnson rawpoll_mcmullin
1         lv           46.00         44.00               6               NA
2         lv           45.00         40.00               5               NA
3         lv           43.61         46.84              NA               NA
4         lv           46.00         44.00              NA               NA
5         lv           49.00         41.00              NA               NA
6         lv           46.00         45.00              NA               NA
  adjpoll_clinton adjpoll_trump adjpoll_johnson adjpoll_mcmullin
1        44.82594      41.59978        7.870127               NA
2        45.13966      42.26495        3.679914               NA
3        45.32156      43.38579              NA               NA
4        45.27399      41.91459              NA               NA
5        48.22713      38.86464              NA               NA
6        45.30896      42.94988              NA               NA
  1. 다음 조건을 모두 만족하는 여론조사는? (i) American Strategies, GfK Group, Merrill Poll에서 조사한 경우, (ii) 표본 크기가 1,000명 이상인 경우, (iii) 2016년 10월 20일에 시작된 경우. 힌트: (i) 조건에서는 %in% 연산자가 유용, 벡터는 c()함수로 만들 수 있음. (iii)에서는 날짜 변수의 형식을 확인
# Polls run by one of the three listed pollsters, with a sample of at least
# 1,000 respondents, whose fieldwork started on 2016-10-20.
# Comma-separated conditions inside filter() are combined with AND.
polls_us_election_2016 %>%
  filter(
    pollster %in% c("American Strategies", "GfK Group", "Merrill Poll"),
    # The task wording is "at least 1,000", so the bound is inclusive, like
    # the `samplesize >= 2000` filter used for the state ranking.
    samplesize >= 1000,
    # Compare against a real Date rather than relying on string coercion.
    startdate == as.Date("2016-10-20")
  )
  state  startdate    enddate  pollster grade samplesize population
1  U.S. 2016-10-20 2016-10-24 GfK Group    B+       1212         lv
  rawpoll_clinton rawpoll_trump rawpoll_johnson rawpoll_mcmullin
1              51            37               6               NA
  adjpoll_clinton adjpoll_trump adjpoll_johnson adjpoll_mcmullin
1        50.28058      39.98632        4.733277               NA
  1. 다음 조건을 모두 만족하는 여론조사는? (i) Johnson 후보의 여론조사 데이터가 누락되지 않은 경우, (ii) 트럼프와 클린턴의 원본 여론조사 지지율 합이 95%를 초과하는 경우, (iii) 오하이오(OH) 주에서 실시된 경우 힌트: 트럼프와 클린턴의 지지율 합계를 계산하는 새로운 변수를 생성한 후 filter() 를 적용
# Ohio polls that report a Johnson figure and in which the raw Clinton and
# Trump shares together exceed 95. The combined share is stored in a helper
# column so it also shows up in the result.
polls_us_election_2016 %>%
  mutate(rawpoll_clintontrump = rawpoll_clinton + rawpoll_trump) %>%
  filter(
    state == "Ohio",
    !is.na(rawpoll_johnson),
    rawpoll_clintontrump > 95
  )
 [1] state                startdate            enddate             
 [4] pollster             grade                samplesize          
 [7] population           rawpoll_clinton      rawpoll_trump       
[10] rawpoll_johnson      rawpoll_mcmullin     adjpoll_clinton     
[13] adjpoll_trump        adjpoll_johnson      adjpoll_mcmullin    
[16] rawpoll_clintontrump
<0 rows> (or 0-length row.names)
  1. 표본 크기가 2,000명 이상인 여론조사에서 트럼프의 평균 지지율이 가장 높은 주는? 힌트: filter(), group_by(), summarise(), arrange() 사용. 내림차순 정렬하려면 arrange() 함수 사용.
# Average raw Trump share per state among polls with at least 2,000
# respondents, sorted from highest to lowest.
polls_us_election_2016 %>%
  filter(samplesize >= 2000) %>%
  group_by(state) %>%
  summarise(mean_trump = mean(rawpoll_trump)) %>%
  arrange(-mean_trump)
# A tibble: 26 × 2
   state          mean_trump
   <fct>               <dbl>
 1 Alabama              62.5
 2 Missouri             48.5
 3 Indiana              47  
 4 Texas                46.0
 5 South Carolina       45.5
 6 Georgia              45.3
 7 Kansas               44  
 8 New Mexico           44  
 9 Florida              44.0
10 Ohio                 43.9
# ℹ 16 more rows

Task 2

다음 코드를 실행하여 데이터를 로드합니다.

library(dslabs)
data(gapminder, package = "dslabs")
  1. 대륙/연도별 평균 인구(변수명: mean_pop)를 계산하고, 결과를 새로운 객체 gapminder_mean에 저장하시오. 힌트: 각 연도별 대륙당 하나의 관측치(행)만 있어야 함. group_by 및 summarise 사용
# Mean population for every continent-year pair (one row per pair).
# NOTE(review): mean() is called without na.rm, so a continent-year that
# contains a missing population value yields NA -- confirm this is intended.
gapminder_mean <- gapminder %>%
  group_by(continent, year) %>%
  summarise(mean_pop = mean(population)) %>%
  # Remove the grouping so later verbs operate on the whole table.
  ungroup()
  • 만약 ungroup()을 하지 않으면 이후에 수행하는 mutate(), summarise(), filter() 등의 연산이 여전히 그룹 단위로 적용됩니다. 예를 들면 아래와 같습니다. ungroup() 대신 summarise 함수에서 .groups="drop" 옵션을 써도 됩니다.
# Toy data: two groups with two values each.
df <- tibble(
  group = c("A", "A", "B", "B"),
  value = c(1, 2, 3, 4)
)

# Mean of `value` within each group.
df_summary <- df %>%
  group_by(group) %>%
  summarise(mean_value = mean(value))

# Add a column with mutate() without calling ungroup() first.
# NOTE(review): with a single grouping variable, summarise() already returns
# an ungrouped table, so overall_mean is taken over both rows (2.5 for A and
# B in the printed result). A second grouping variable would be needed to
# actually demonstrate leftover grouping.
df_summary %>%
  mutate(overall_mean = mean(mean_value))
# A tibble: 2 × 3
  group mean_value overall_mean
  <chr>      <dbl>        <dbl>
1 A            1.5          2.5
2 B            3.5          2.5

Task 3

gapminder 데이터를 사용하여 다음의 그래프를 ggplot2로 생성하시오

  1. 2015년 기대수명(Life Expectancy)의 히스토그램을 작성하시오. 힌트: 히스토그램을 만들 때 aes()에 y 값을 지정해야 할까? geom_* 내에서 다음 옵션을 설정하시오: binwidth = 5, boundary = 45, colour = “white”, fill = “#d90502”. 이러한 옵션이 무엇을 의미하는지 설명하시오.

기본 히스토그램:

# Histogram of 2015 life expectancy with ggplot2's default bin settings.
# Only x is mapped; geom_histogram() computes the counts itself.
gapminder %>%
  filter(year == 2015) %>%
  ggplot(aes(x = life_expectancy)) +
  geom_histogram()

수정된 히스토그램(축 레이블 포함):

# Histogram of 2015 life expectancy, stored so it can be reused (e.g. faceted).
# Bins are 5 years wide and aligned so that a bin edge falls on 45; bars are
# filled with "#d90502" and outlined in white.
life_exp_hist <- gapminder %>%
  filter(year == 2015) %>%
  ggplot(aes(x = life_expectancy)) +
  geom_histogram(
    binwidth = 5,
    boundary = 45,
    colour = "white",
    fill = "#d90502"
  ) +
  # Axis labels restored to readable Korean ("life expectancy", "frequency");
  # the previous strings were mis-decoded UTF-8 and rendered as garbage.
  labs(x = "기대 수명", y = "빈도")
life_exp_hist

facet된 히스토그램:

# Same histogram, split into one panel row per continent.
life_exp_hist +
  facet_grid(continent ~ .)

  1. 연도/대륙별 평균 기대수명을 구하고 대륙별로 평균 기대수명에 대한 Boxplot 으로 그리시오. geom_* 옵션: colour = “black”, fill = “#d90502”.
    힌트: continent 및 year 두 개의 변수로 그룹화해야함.
# Boxplot, per continent, of the yearly mean life expectancy.
# Each box summarises one value per year for that continent.
gapminder %>%
  group_by(continent, year) %>%
  # .groups = "drop" returns an ungrouped table and avoids the regrouping
  # message; the plot itself does not depend on the grouping.
  summarise(mean_life_exp = mean(life_expectancy), .groups = "drop") %>%
  ggplot(aes(x = continent, y = mean_life_exp)) +
  geom_boxplot(colour = "black", fill = "#d90502") +
  # Axis labels restored to readable Korean ("continent", "life expectancy");
  # the previous strings were mis-decoded UTF-8.
  labs(x = "대륙", y = "기대 수명")

3. 2015년의 영아 사망률(x축)에 대한 출산율(y축)의 산점도. 산점도를 만든 후 해당 geom_* 내에서 size를 3으로, alpha를 0.5로, colour를 “#d90502”로 설정합니다.

기본 산점도:

# Scatter plot of fertility (y) against infant mortality (x) for 2015,
# using default point aesthetics.
gapminder %>%
  filter(year == 2015) %>%
  ggplot(aes(x = infant_mortality, y = fertility)) +
  geom_point()

축 레이블이 있는 산점도:

# Scatter plot of fertility (y) against infant mortality (x) for 2015, with
# larger semi-transparent points in "#d90502" and labelled axes.
gapminder %>%
  filter(year == 2015) %>%
  ggplot(aes(x = infant_mortality, y = fertility)) +
  geom_point(size = 3, alpha = 0.5, colour = "#d90502") +
  # Axis labels restored to readable Korean ("infant mortality", "fertility
  # rate"); the previous strings were mis-decoded UTF-8.
  labs(x = "영아 사망률", y = "출산율")


Task 4

  1. 2011년 GDP의 평균을 계산하고 mean이라는 객체에 할당하시오. 결측값은 제외. 힌트: mean 함수의 도움말을 읽고 NA를 제거하는 방법을 확인
# Mean 2011 GDP across countries, ignoring missing values. The result is a
# one-row, one-column data frame whose column is named after the expression.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned; the printed header below reflects the new expression text.
mean_GDP <- gapminder %>%
  filter(year == 2011) %>%
  summarise(mean(gdp, na.rm = TRUE))
mean_GDP
  mean(gdp, na.rm = TRUE)
1            246954895975
  1. 2011년 GDP의 중앙값을 계산하고 median이라는 객체에 할당하시오. 마찬가지로 결측값을 제외. 중앙값이 평균보다 큰가, 작은가?
# Median 2011 GDP across countries, ignoring missing values. The result is a
# one-row, one-column data frame whose column is named after the expression.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned; the printed header below reflects the new expression text.
median_GDP <- gapminder %>%
  filter(year == 2011) %>%
  summarise(median(gdp, na.rm = TRUE))
median_GDP
  median(gdp, na.rm = TRUE)
1               16031265699

중앙값은 평균보다 훨씬 작습니다.

  1. geom_density를 사용하여 2011년 GDP의 밀도 그래프(density plot)를 생성하시오. 밀도 그래프는 숫자형 변수의 분포를 나타내는 방법임. 또한 다음 코드를 추가하여 평균과 중앙값을 수직선으로 표시하시오.

    geom_vline(xintercept = as.numeric(mean), colour = "red") +
    geom_vline(xintercept = as.numeric(median), colour = "orange")
# Density of 2011 GDP with vertical reference lines: red at the mean and
# orange at the median. mean_GDP and median_GDP are 1x1 data frames, so they
# are converted to plain numbers before being used as intercepts.
gdp_density <- gapminder %>%
  filter(year == 2011) %>%
  ggplot(aes(x = gdp)) +
  geom_density() +
  geom_vline(xintercept = as.numeric(mean_GDP), colour = "red") +
  geom_vline(xintercept = as.numeric(median_GDP), colour = "orange")
gdp_density

GDP 분포는 매우 왜곡(Skewed)되어 있습니다. GDP가 적은 국가가 많고 GDP가 매우 큰 국가(미국, 일본, 중국)는 적습니다. 이러한 경우 평균은 중앙값보다 (상당히) 클 것입니다. 이것을 더 명확하게 보기 위해 각 눈금이 이전 눈금보다 10배 더 크게 x축을 변환한 그래프가 아래입니다(따라서 척도는 선형이 아닙니다. 즉, 첫 번째 눈금은 100,000, 두 번째는 100만, 세 번째는 1천만 등입니다.).

# Redraw the density with a base-10 logarithmic x axis.
gdp_density + scale_x_log10()

  1. 2015년 출산율(fertility)과 유아 사망률(infant mortality)의 상관관계를 계산하시오. NA 값을 제외하려면 cor() 함수의 use 인수를 “pairwise.complete.obs”로 설정. 이 상관관계 값이 Task 3에서 생성한 그래프와 일치하는가?
# Correlation between fertility and infant mortality in 2015, computed on
# the rows where both values are present.
gapminder %>%
  filter(year == 2015) %>%
  summarise(cor(fertility, infant_mortality, use = "pairwise.complete.obs"))
  cor(fertility, infant_mortality, use = "pairwise.complete.obs")
1                                                       0.8286402

이 상관 관계는 양수이고 강합니다(1에 비교적 가깝습니다). 이는 Task 3에서 생성된 그래프와 일치합니다. 실제로 해당 그래프는 이러한 두 변수 간의 양의 관계를 보여 주었고 점이 그렇게 분산되지 않았습니다.

Task (Optional): 주가 데이터 분석

  1. tidyquant를 이용하여 한국의 대표 주식인 삼성전자(“005930.KS”)와 미국의 시장 지수인 S&P500 ETF(“SPY”)의 2021년 1월 1일 이후 주가 데이터를 다운로드하시오. (tq_get에 티커를 벡터 c() 형태로 입력)

  2. group_by()와 mutate(), lag() 함수를 사용하여 두 종목의 일별 수익률(daily_return)을 각각 계산하고 drop_na()로 결측치를 제거하시오.

# Load required packages
library(tidyquant)
library(tidyverse)

# 1. Download price data for Samsung Electronics and the S&P 500 ETF (SPY)
tickers <- c("005930.KS", "SPY")
stock_data <- tq_get(tickers, from = "2021-01-01")

# 2. Compute daily returns per ticker and drop rows with a missing return
stock_returns <- stock_data %>%
  # one group per ticker, so the lag below never crosses tickers
  group_by(symbol) %>%
  # ascending by date; arrange() sorts across groups, so the tickers end up
  # interleaved in the result
  arrange(date) %>%
  # simple return from adjusted prices: today's price / previous price - 1
  mutate(daily_return = adjusted / dplyr::lag(adjusted) - 1) %>%
  # the first day of each ticker has no previous price, hence NA
  drop_na(daily_return)

# Inspect the result
head(stock_returns)
# A tibble: 6 × 9
# Groups:   symbol [2]
  symbol    date         open   high    low  close  volume adjusted daily_return
  <chr>     <date>      <dbl>  <dbl>  <dbl>  <dbl>   <dbl>    <dbl>        <dbl>
1 005930.KS 2021-01-05 81600  83900  81600  83900   3.53e7   75474.      0.0108 
2 SPY       2021-01-05   368.   372.   368.   371.  6.64e7     347.      0.00689
3 005930.KS 2021-01-06 83300  84500  82100  82200   4.21e7   73945.     -0.0203 
4 SPY       2021-01-06   370.   377.   369.   374.  1.08e8     349.      0.00598
5 005930.KS 2021-01-07 82800  84200  82700  82900   3.26e7   74574.      0.00852
6 SPY       2021-01-07   376.   380.   376.   379.  6.88e7     354.      0.0149 
  1. ggplot2를 사용하여 두 종목의 일별 수익률 분포를 비교하는 밀도 그래프(geom_density)를 겹쳐서(alpha 값 조정) 그리시오.
# 3. Overlaid density curves of the daily returns, one fill colour per
#    ticker; alpha = 0.5 keeps the overlapping region visible.
# Title, axis and legend text restored to readable Korean; the previous
# strings were mis-decoded UTF-8 and rendered as garbage on the plot.
ggplot(stock_returns, aes(x = daily_return, fill = symbol)) +
  geom_density(alpha = 0.5) +
  labs(
    title = "삼성전자 vs S&P 500 일별 수익률 분포 (2021~)",
    x = "일별 수익률",
    y = "밀도(Density)",
    fill = "종목"
  ) +
  theme_minimal()

  1. summarise()를 활용하여 두 종목 각각의 일평균 수익률(mean)과 변동성(sd)을 계산하시오.
# 4. Per-ticker summary statistics of the daily returns
summary_stats <- stock_returns %>%
  # Group explicitly instead of relying on grouping left over from the step
  # that built stock_returns; regrouping by the same column is harmless.
  group_by(symbol) %>%
  summarise(
    mean_return = mean(daily_return),
    # in finance the standard deviation of daily returns is read as the
    # daily volatility (risk)
    sd_return = sd(daily_return),
    .groups = "drop"
  )

print(summary_stats)
# A tibble: 2 × 3
  symbol    mean_return sd_return
  <chr>           <dbl>     <dbl>
1 005930.KS    0.000924    0.0192
2 SPY          0.000555    0.0107
  1. 두 종목의 수익률 간 상관계수(cor)를 계산하시오. (상관관계를 구하려면 두 수익률을 가로(열)로 나란히 배치해야 하는데, 한국과 미국의 휴장일이 달라 데이터의 길이가 어긋나는 현상이 발생하기 때문에 날짜를 기준으로 데이터를 맞추는 과정(pivot_wider 또는 inner_join)이 필요함.)
# Correlation via a join: build one date/return table per ticker, keep only
# the dates present in both, then correlate the two return columns.
spy_returns <- stock_returns %>%
  # Drop the grouping first: select() on a grouped tibble silently keeps the
  # grouping column, which would surface as symbol.x / symbol.y after the join.
  ungroup() %>%
  filter(symbol == "SPY") %>%
  # One row per date, matching the de-duplication of the pivot_wider variant.
  distinct(date, .keep_all = TRUE) %>%
  select(date, spy = daily_return)

ks_returns <- stock_returns %>%
  ungroup() %>%
  filter(symbol == "005930.KS") %>%
  distinct(date, .keep_all = TRUE) %>%
  select(date, ks = daily_return)

# inner_join keeps only dates on which both markets have a return
returns_wide <- inner_join(spy_returns, ks_returns, by = "date")

correlation <- cor(returns_wide$ks, returns_wide$spy)
print(correlation)
[1] 0.05643054
# 5. Reshape to wide format (one return column per ticker), then correlate
returns_wide <- stock_returns %>%
  select(date, symbol, daily_return) %>%
  # keep one row per date and ticker (drops the duplicated 005930.KS return
  # on 2026-03-20)
  distinct(date, symbol, .keep_all = TRUE) %>%
  pivot_wider(names_from = symbol, values_from = daily_return) %>%
  # drop dates on which either market was closed and its return is NA
  drop_na()

# Correlation coefficient; the ticker column name starts with a digit, so it
# has to be quoted with backticks
correlation <- cor(returns_wide$`005930.KS`, returns_wide$SPY)
print(correlation)
[1] 0.05643054

(참고: 티커명에 숫자가 포함되어 열 이름으로 바뀔 경우, R에서는 역따옴표(`)로 감싸주어야 변수명으로 정상 인식됨. 예시에서는 005930.KS가 `005930.KS`로 열 이름이 되었기 때문에, 상관계수 계산 시 returns_wide$`005930.KS`와 같이 역따옴표로 감싸주어야 함.)