All Materials Here

Use the sharks dataset for this HW. This should be slightly more difficult than the first homework. You will likely have to do some googling, but I have hinted at some functions to look into. After each line, try to view the data to see if the results of your code match what you wanted it to do.

library("stargazer")
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library("scales")
# Load the shark-attack data, keeping text columns as character vectors
# rather than factors.
# NOTE(review): the path is absolute and machine-specific -- point it at
# wherever sharks.csv lives before running on another computer.
sharks <- read.csv("C:/Users/alexc/Desktop/Empirical Workshop/data/sharks.csv", stringsAsFactors = FALSE)
#View(sharks)

1 - Make a time series plot of the number of attacks per year.

# Count the attacks recorded in each year, then draw the counts as a line.
timeSeries <- table(sharks[["Year"]])
plot(timeSeries, xlab = "Year", ylab = "Frequency", type = "l")

2 - Make a frequency distribution for the ages of those being attacked.

# Frequency of each numeric age. Age is stored as text, so entries that are
# not plain numbers are coerced to NA by as.numeric() (hence the coercion
# warning) and table() leaves them out of the counts.
plot(table(as.numeric(sharks$Age)), ylab = "Frequency", xlab = "Age")
## Warning in table(as.numeric(sharks$Age)): NAs introduced by coercion

3 - Plot the locations of the shark attacks. Color the attacks by whether or not the attack was fatal.

# Encode the fatality flag as numeric factor codes (one code per distinct
# value of Fatal..Y.N.) so the codes can be used as palette colours.
sharks$fatalColor <- as.numeric(as.factor(sharks[["Fatal..Y.N."]]))
# Filled circles (pch 19) drawn at alpha 0.1, i.e. mostly transparent.
plot(sharks$Longitude, sharks$Latitude,
     pch = 19, col = alpha(sharks$fatalColor, .1))

4 - Make a new binary variable for whether the attack was on the west coast.

# 1 when the attack's longitude is below -100, 0 otherwise.
sharks$westCoast <- as.numeric(sharks$Longitude < -100)
table(sharks$westCoast)
## 
##   0   1 
## 761 237

5 - Make a binary variable for if the attack was unprovoked.

# List the distinct attack types and how often each occurs before recoding.
table(sharks$Type)
## 
##             Invalid            Provoked        Questionable 
##                  61                  57                   4 
##        Sea Disaster Under investigation          Unprovoked 
##                   2                   1                 837 
##          Watercraft 
##                  36
# 1 when Type is exactly "Unprovoked", 0 for every other type.
sharks$unprovoked <- as.numeric(sharks$Type == "Unprovoked")
table(sharks$unprovoked)
## 
##   0   1 
## 161 837

6 - Make a binary variable for if there was “No injury” or a “minor” injury

# 0 when the Injury text contains "No injury" (case-sensitive) or "minor"
# (any case); 1 for every other description.
sharks$severe_injury <- as.numeric(
  !grepl("No injury", sharks$Injury) &
    !grepl("minor", tolower(sharks$Injury))
)
table(sharks$severe_injury)
## 
##   0   1 
## 229 769

7 - Make a binary variable for if the person attacked was male.

# Some rows carry the sex in the Name column ("male"/"female"); use that to
# fill in Sex, otherwise keep the recorded Sex value.
sharks$Sex <- ifelse(sharks$Name == "male", "M",
                     ifelse(sharks$Name == "female", "F", sharks$Sex))
# Treat an empty Sex string as missing.
sharks$Sex[which(sharks$Sex == "")] <- NA
# 1 for "M", 0 for any other recorded value, NA where Sex is missing.
sharks$male <- as.numeric(sharks$Sex == "M")
table(sharks$male)
## 
##   0   1 
## 224 755

8 - Make summary stats for the sharks data.

# Print a plain-text summary-statistics table for the sharks data frame,
# including the indicator columns created above.
stargazer(sharks, type = "text")
## 
## =============================================================================
## Statistic            N    Mean    St. Dev.   Min    Pctl(25) Pctl(75)   Max  
## -----------------------------------------------------------------------------
## Year                998 2,009.572  5.588    2,000    2,005    2,014    2,019 
## original.order      998 5,297.683 640.417   4,239   4,739.2   5,842    6,427 
## Latitude            998  29.611    5.116    5.889    27.355   32.819  60.691 
## Longitude           998  -94.624   26.284  -169.534 -96.548  -80.327  -64.621
## Maximum.Temperature 980  82.559    7.805    42.200   79.100   87.900  98.500 
## Minimum.Temperature 980  68.324    9.093    23.100   63.750   74.600  84.800 
## Temperature         980  75.079    7.986    35.900   71.900   80.500  91.400 
## Precipitation       980   0.154    0.653    0.000    0.000    0.000   11.300 
## Precipitation.Cover 980   7.362    16.883   0.000    0.000    8.300   100.000
## Wind.Speed          979  14.200    4.556    0.000    11.200   16.400  44.700 
## Wind.Gust           126  32.548    7.686    4.500    30.000   36.425  57.500 
## Visibility          975   9.233    1.147    1.500    8.800    10.000  12.800 
## fatalColor          998   1.964    0.277      1        2        2        3   
## westCoast           998   0.237    0.426      0        0        0        1   
## unprovoked          998   0.839    0.368      0        1        1        1   
## severe_injury       998   0.771    0.421      0        1        1        1   
## male                979   0.771    0.420    0.000    1.000    1.000    1.000 
## -----------------------------------------------------------------------------

9 - Given an attack, what factors lead to more serious injuries?

# Weather-driven specifications: (1) coast only, (2) plus weather controls,
# (3) additionally letting the temperature slope differ by coast.
reg1 <- lm(formula = severe_injury ~ westCoast, data = sharks)
reg2 <- lm(
  formula = severe_injury ~ westCoast + Temperature + Precipitation +
    Wind.Speed + Visibility,
  data = sharks
)
reg3 <- lm(
  formula = severe_injury ~ westCoast * Temperature + Precipitation +
    Wind.Speed + Visibility,
  data = sharks
)
# Text table without the residual standard error and F statistic rows.
stargazer(reg1, reg2, reg3,
          omit.stat = c("ser", "f"), type = "text")
## 
## ===================================================
##                            Dependent variable:     
##                       -----------------------------
##                               severe_injury        
##                          (1)       (2)       (3)   
## ---------------------------------------------------
## westCoast             -0.297*** -0.256*** -0.914***
##                        (0.030)   (0.035)   (0.271) 
##                                                    
## Temperature                     0.006***    0.003  
##                                  (0.002)   (0.002) 
##                                                    
## Precipitation                    -0.002    -0.001  
##                                  (0.020)   (0.019) 
##                                                    
## Wind.Speed                        0.003     0.002  
##                                  (0.003)   (0.003) 
##                                                    
## Visibility                        0.002    -0.002  
##                                  (0.011)   (0.011) 
##                                                    
## westCoast:Temperature                      0.009** 
##                                            (0.004) 
##                                                    
## Constant              0.841***   0.288*   0.627*** 
##                        (0.015)   (0.164)   (0.214) 
##                                                    
## ---------------------------------------------------
## Observations             998       975       975   
## R2                      0.090     0.111     0.116  
## Adjusted R2             0.089     0.106     0.111  
## ===================================================
## Note:                   *p<0.1; **p<0.05; ***p<0.01
# Person- and encounter-level specifications, all keeping the
# westCoast x Temperature interaction.
reg1 <- lm(severe_injury ~ westCoast*Temperature + unprovoked, data = sharks)
reg2 <- lm(severe_injury ~ westCoast*Temperature + male, data = sharks)
reg3 <- lm(severe_injury ~ westCoast*Temperature + unprovoked + male, data = sharks)
# Age is stored as text; as.numeric() turns non-numeric entries into NA
# (hence the coercion warnings) and lm() drops those rows.
reg4 <- lm(severe_injury ~ westCoast*Temperature + unprovoked + male + as.numeric(Age), data = sharks)
## Warning in eval(predvars, data, env): NAs introduced by coercion
# Year enters as a single numeric (linear) term, not as one dummy per year;
# use factor(Year) instead if year fixed effects are wanted.
reg5 <- lm(severe_injury ~ westCoast*Temperature + unprovoked + male + as.numeric(Age) + Year, data = sharks)
## Warning in eval(predvars, data, env): NAs introduced by coercion
# The Year coefficient is hidden via omit and flagged in an added row. The
# row is labelled "Year trend" because the control is a linear trend, not FE.
stargazer(reg1, reg2, reg3, reg4, reg5, 
          add.lines = list(c("Year trend", rep("N", 4), "Y")),
          type = "text", omit.stat = c("ser", "f"), omit = "Year")
## 
## =======================================================================
##                                      Dependent variable:               
##                       -------------------------------------------------
##                                         severe_injury                  
##                          (1)       (2)       (3)       (4)       (5)   
## -----------------------------------------------------------------------
## westCoast             -0.972*** -0.998*** -1.021*** -1.050*** -1.052***
##                        (0.264)   (0.263)   (0.263)   (0.297)   (0.294) 
##                                                                        
## Temperature             0.002     0.002     0.002     0.002     0.002  
##                        (0.002)   (0.002)   (0.002)   (0.003)   (0.003) 
##                                                                        
## unprovoked             0.078**             0.066*    -0.018    -0.028  
##                        (0.035)             (0.035)   (0.044)   (0.043) 
##                                                                        
## male                             -0.048    -0.046    -0.057*  -0.068** 
##                                  (0.030)   (0.030)   (0.033)   (0.033) 
##                                                                        
## as.numeric(Age)                                       0.001     0.001  
##                                                      (0.001)   (0.001) 
##                                                                        
## westCoast:Temperature 0.010***  0.011***  0.011***  0.012***  0.012*** 
##                        (0.004)   (0.004)   (0.004)   (0.004)   (0.004) 
##                                                                        
## Constant              0.618***  0.710***  0.682***  0.746***  20.865***
##                        (0.185)   (0.189)   (0.189)   (0.211)   (4.907) 
##                                                                        
## -----------------------------------------------------------------------
## Year trend                N         N         N         N         Y    
## Observations             980       963       963       761       761   
## R2                      0.120     0.116     0.119     0.086     0.106  
## Adjusted R2             0.116     0.112     0.114     0.078     0.097  
## =======================================================================
## Note:                                       *p<0.1; **p<0.05; ***p<0.01

10 - Is there a variable that significantly changes the sample size of your regressions? Why?

# Including age decreases the sample size by about 200.

# Tabulate the raw Age strings that as.numeric() cannot parse; these become
# NA and are dropped from any regression that uses age.
table(sharks$Age[is.na(as.numeric(sharks$Age))])
## Warning in table(sharks$Age[is.na(as.numeric(sharks$Age))]): NAs introduced
## by coercion
## 
##          12 or 13      20s      30s      50s      60s       6½  a minor 
##      198        1        3        3        1        2        1        1 
##  mid-30s     teen     Teen    Teens 
##        1        4        4        1
# Maybe we can start by replacing "teen" with ~16 and stripping the trailing "s" from values such as "20s".  Still, 198 are missing altogether.

12 - Are serious injuries more or less likely given missingness for the variable?

# lim is TRUE for rows whose Age is blank or cannot be parsed as a number.
lim <- is.na(as.numeric(sharks$Age))
## Warning: NAs introduced by coercion
# Welch t-test: mean of severe_injury for rows with missing age (lim) versus
# rows with a usable age (!lim).
t.test(sharks$severe_injury[lim], sharks$severe_injury[!lim])
## 
##  Welch Two Sample t-test
## 
## data:  sharks$severe_injury[lim] and sharks$severe_injury[!lim]
## t = -5.0123, df = 305.38, p-value = 9.129e-07
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.247812 -0.108089
## sample estimates:
## mean of x mean of y 
## 0.6318182 0.8097686
# Welch t-test: share of west-coast attacks for rows with missing age (lim)
# versus rows with a usable age (!lim).
t.test(sharks$westCoast[lim], sharks$westCoast[!lim])
## 
##  Welch Two Sample t-test
## 
## data:  sharks$westCoast[lim] and sharks$westCoast[!lim]
## t = 3.2974, df = 318.61, p-value = 0.001086
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.04646003 0.18392090
## sample estimates:
## mean of x mean of y 
## 0.3272727 0.2120823
#So it appears that missing ages are less likely to be severely injured and more likely to be on the west coast.  Perhaps this is spurious, but it might be a signal of something non-random.