HW2 Solutions

Here is that USA analytics website again. Find the top domains over the past 7 days and over the past 30 days. Then, for each domain, find the ratio of its 30-day visits to its 7-day visits.

library("rvest")

## Loading required package: xml2

library("magrittr")
library("jsonlite")

# Download the "top domains" feeds for the trailing 7-day and 30-day windows.
pages_30days <- fromJSON("https://analytics.usa.gov/data/live/top-domains-30-days.json")
pages_7days <- fromJSON("https://analytics.usa.gov/data/live/top-domains-7-days.json")

# For every domain in the 30-day feed, find the position of the same domain
# in the 7-day feed (NA when the domain does not appear there).
m <- match(pages_30days$data$domain, pages_7days$data$domain)

# Visit counts are coerced to numeric before dividing; the 7-day counts are
# reordered with `m` so both vectors line up domain-by-domain.
visits_30days <- as.numeric(pages_30days$data$visits)
visits_7days <- as.numeric(pages_7days$data$visits)[m]
ratios <- visits_30days / visits_7days

# Frequency plot of the ratios after rounding to one decimal place.
rounded_ratios <- round(ratios, 1)
plot(table(rounded_ratios))

Here is the National Cyber Awareness System website. For each year, for each alert, find the date of the event, title, link, and ID.

# Scrape the NCAS alert index, then visit each alert page to read its
# original release date.
site_root <- "https://us-cert.cisa.gov"

myHTML <- read_html(paste0(site_root, "/ncas/alerts"))

# The alert links are read from the third <ul> on the index page. This is a
# positional selection, so it must be revisited if the page layout changes.
# The anchors are located once and reused for both the titles and the hrefs.
alert_anchors <- myHTML %>%
  html_nodes("ul") %>%
  `[`(3) %>%
  html_nodes("li") %>%
  html_nodes("a")

namez <- html_text(alert_anchors)
linkz <- html_attr(alert_anchors, "href")

# The alert ID is whatever remains of the href after the path prefix.
idz <- gsub("/ncas/alerts/", "", linkz)

# The hrefs already begin with "/", so strip any leading slashes before
# joining them to the site root; otherwise the URL contains "//ncas/...".
alert_urls <- paste0(site_root, "/", sub("^/+", "", linkz))

# Read one alert page and return its release date as a single string.
# The "div.submitted" text is split on newlines and "|"; the second piece
# holds the "Original release date: ..." label. Returns NA when the page has
# no such piece, so the result always has exactly one element per alert.
get_release_date <- function(alert_url) {
  pieces <- alert_url %>%
    read_html() %>%
    html_nodes("div.submitted") %>%
    html_text() %>%
    strsplit("\\\n|\\|") %>%
    unlist()

  if (length(pieces) < 2) {
    return(NA_character_)
  }

  gsub("Original release date: ", "", trimws(pieces[2]))
}

# vapply() fills a result of known length and type instead of growing a
# vector with c() on every iteration. USE.NAMES = FALSE keeps the URLs from
# becoming element names (and, in turn, data frame row names).
datez <- vapply(alert_urls, get_release_date, character(1), USE.NAMES = FALSE)

# Keep every requested field together; only id and date are printed below.
alerts <- data.frame(id = idz, title = namez, link = alert_urls, date = datez)

head(alerts[, c("id", "date")])

##          id           date
## 1 aa21-201a  July 20, 2021
## 2 aa21-200b  July 19, 2021
## 3 aa21-200a  July 19, 2021
## 4 aa21-148a   May 28, 2021
## 5 aa21-131a   May 11, 2021
## 6 aa21-116a April 26, 2021

Here are the top 250 movies according to IMDb. For each movie, get the title, year, link, rating, and number of users who reviewed it.

# Scrape the IMDb Top 250 chart into `top250`, one row per movie.
url <- "https://www.imdb.com/chart/top"
myHTML <- read_html(url)

# Locate the chart table once; every field below is read from this node
# instead of re-running the same CSS selector for each column.
chart_table <- html_nodes(
  myHTML,
  "#main > div > span > div > div > div.lister > table"
)

top250 <- chart_table %>%
  html_table() %>%
  as.data.frame()

# The "Rank...Title" cell holds the rank, title, and "(year)" together.
# Keep only what sits inside the parentheses.
top250$year <- gsub(".*\\(|\\)", "", top250$Rank...Title)

# Each href appears on more than one anchor in the table, so de-duplicate
# to get one link per movie.
top250$link <- chart_table %>%
  html_nodes("a") %>%
  html_attr("href") %>%
  unique()

# The rating's title attribute contains the vote count just before the word
# "user". The count is taken from character 14 up to that point, which
# assumes the text preceding the count is exactly 13 characters long.
rating_titles <- chart_table %>%
  html_nodes("strong") %>%
  html_attr("title")
vote_text <- substr(rating_titles, 14, regexpr("user", rating_titles) - 2)
top250$numReviews <- as.numeric(gsub(",", "", vote_text))

# Clean movie title: drop the leading "<rank>." and the trailing "(<year>)".
# Added as the last column so existing column positions are unchanged.
title_text <- sub("^\\s*[0-9]+\\.\\s*", "", top250$Rank...Title)
title_text <- sub("\\s*\\([0-9]{4}\\)\\s*$", "", title_text)
top250$title <- trimws(title_text)

# Select by name rather than by position so the preview does not depend on
# the order in which columns were added.
head(top250[, c("Rank...Title", "IMDb.Rating", "year", "numReviews")])

##                                         Rank...Title IMDb.Rating year
## 1 1.\n      The Shawshank Redemption\n        (1994)         9.2 1994
## 2            2.\n      The Godfather\n        (1972)         9.1 1972
## 3   3.\n      The Godfather: Part II\n        (1974)         9.0 1974
## 4          4.\n      The Dark Knight\n        (2008)         9.0 2008
## 5             5.\n      12 Angry Men\n        (1957)         8.9 1957
## 6         6.\n      Schindler's List\n        (1993)         8.9 1993
##   numReviews
## 1    2425398
## 2    1678439
## 3    1165000
## 4    2381701
## 5     715249
## 6    1248210

HW2 Solutions

Alexander Cardazzi

Quarantine, 2020