- Here is the USA analytics website again. Find the top domains over the past 7 days and over the past 30 days. Then, for each domain, find the ratio of its 30-day visits to its 7-day visits.
library("rvest")
## Loading required package: xml2
library("magrittr")
library("jsonlite")
# Download the "top domains" reports for the trailing 7-day and 30-day windows.
# Each parsed result carries its rows under `$data` (inspect with
# `pages_7days$data` / `pages_30days$data`).
pages_7days <- fromJSON(
  "https://analytics.usa.gov/data/live/top-domains-7-days.json"
)
pages_30days <- fromJSON(
  "https://analytics.usa.gov/data/live/top-domains-30-days.json"
)

# For every domain in the 30-day report, the row position of the same domain
# in the 7-day report (NA when the domain is absent from the 7-day report).
m <- match(pages_30days$data$domain, pages_7days$data$domain)

# Line the 7-day visit counts up with the 30-day rows before dividing, so each
# ratio compares one domain with itself. Unmatched domains give NA.
visits_30days <- as.numeric(pages_30days$data$visits)
visits_7days_aligned <- as.numeric(pages_7days$data$visits[m])
ratios <- visits_30days / visits_7days_aligned

# Distribution of the ratios, rounded to one decimal place.
ratio_counts <- table(round(ratios, 1))
plot(ratio_counts)
- Here is the National Cyber Awareness System alerts website. For each year, for each alert, find the date of the event, title, link, and ID.
# Scrape the NCAS alerts listing. For every alert collect its title, link and
# ID from the listing page, then visit the alert's own page to read its
# original release date.
myHTML <- read_html("https://us-cert.cisa.gov/ncas/alerts")

# Select the alert anchors once and reuse them for both titles and links.
# NOTE(review): the alert list is picked purely by position (third <ul> on the
# page); confirm this still holds if the site layout changes.
alert_anchors <- myHTML %>%
  html_nodes("ul") %>%
  `[`(3) %>%
  html_nodes("li") %>%
  html_nodes("a")

namez <- html_text(alert_anchors)
linkz <- html_attr(alert_anchors, "href")

# The ID is what remains of the href once the listing path is removed.
idz <- gsub("/ncas/alerts/", "", linkz)

# Strip leading slashes from the href before joining it to the site root, so
# the request URL contains a single "/" between host and path.
alert_urls <- paste0("https://us-cert.cisa.gov/", sub("^/+", "", linkz))

# vapply() returns exactly one string per alert. A page whose "div.submitted"
# text is missing or has fewer than two fields yields NA instead of silently
# contributing nothing, which would shift every later date onto the wrong ID.
datez <- vapply(alert_urls, function(alert_url) {
  # The "submitted" text is split on newlines and "|"; the second piece is
  # the one carrying the "Original release date: " label.
  fields <- alert_url %>%
    read_html() %>%
    html_nodes("div.submitted") %>%
    html_text() %>%
    strsplit("\\\n|\\|") %>%
    unlist()
  if (length(fields) < 2) {
    return(NA_character_)
  }
  gsub("Original release date: ", "", trimws(fields[2]))
}, character(1), USE.NAMES = FALSE)

# All four requested fields together; the preview keeps the id/date columns.
alerts <- data.frame(id = idz, title = namez, link = linkz, date = datez)
head(alerts[, c("id", "date")])
## id date
## 1 aa21-201a July 20, 2021
## 2 aa21-200b July 19, 2021
## 3 aa21-200a July 19, 2021
## 4 aa21-148a May 28, 2021
## 5 aa21-131a May 11, 2021
## 6 aa21-116a April 26, 2021
- Here are the top 250 movies according to IMDb. Get the movie title, year, link, rating, and number of users who rated each movie.
# Fetch the IMDb Top 250 chart once; everything below is read from this
# parsed page.
url <- "https://www.imdb.com/chart/top"
myHTML <- read_html(url)

# The chart's <table> node, reused for the table itself, the links and the
# rating details.
chart_table <- html_nodes(
  myHTML,
  "#main > div > span > div > div > div.lister > table"
)

# Parse the table into a data frame.
top250 <- as.data.frame(html_table(chart_table))

# Year: delete everything up to the last "(" and every ")" from the combined
# rank/title cell, leaving the text that was inside the parentheses.
top250$year <- gsub(".*\\(|\\)", "", top250$Rank...Title)

# Links: de-duplicated hrefs of the anchors inside the table, in page order.
# NOTE(review): this assignment relies on exactly one distinct href per row.
anchor_hrefs <- html_attr(html_nodes(chart_table, "a"), "href")
top250$link <- unique(anchor_hrefs)

# Number of reviews: cut out of the "title" attribute of each <strong> node,
# from character 14 up to two characters before the word "user", then strip
# the thousands separators.
# NOTE(review): the fixed start position assumes a constant-width prefix in
# that attribute -- confirm against the live page.
rating_titles <- html_attr(html_nodes(chart_table, "strong"), "title")
count_stop <- regexpr("user", rating_titles) - 2
count_text <- substr(rating_titles, 14, count_stop)
top250$numReviews <- as.numeric(gsub(",", "", count_text))

# Preview columns 2, 3, 6 and 8: rank/title, IMDb rating, year, numReviews.
head(top250[, c(2, 3, 6, 8)])
## Rank...Title IMDb.Rating year
## 1 1.\n The Shawshank Redemption\n (1994) 9.2 1994
## 2 2.\n The Godfather\n (1972) 9.1 1972
## 3 3.\n The Godfather: Part II\n (1974) 9.0 1974
## 4 4.\n The Dark Knight\n (2008) 9.0 2008
## 5 5.\n 12 Angry Men\n (1957) 8.9 1957
## 6 6.\n Schindler's List\n (1993) 8.9 1993
## numReviews
## 1 2425398
## 2 1678439
## 3 1165000
## 4 2381701
## 5 715249
## 6 1248210