-
Notifications
You must be signed in to change notification settings - Fork 0
/
rvest and pepys.Rmd
151 lines (126 loc) · 5.22 KB
/
rvest and pepys.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
---
title: "rvest and pepys"
output: html_notebook
---
```{r setup}
library(tidyverse)
library(rvest)
```
https://www.pepysdiary.com/diary/1665/03/19/
```{r}
people_lines <- read_lines("https://www.pepysdiary.com/encyclopedia/people/") %>%
keep(function(x) str_detect(x, "/encyclopedia/\\d")) # verify link is to encyclopedia entry
people_entry_id <- people_lines %>%
str_extract("[0-9]+")
people_entry_text <- people_lines %>%
# extract uses a positive look-behind assertion and also a positive look-ahead assertion
str_extract("(?<=[0-9]/\\\">).+(?=</a>)")
people_name_parens <- people_entry_text %>%
str_extract("(?<=\\().+(?=\\))")
people_name <- ifelse(is.na(people_name_parens), people_entry_text,
str_extract(people_entry_text, ".+(?=\\()"))
people_df <- data_frame(entry_id = people_entry_id, name = people_name, parens = people_name_parens)
```
Next, code that will get the ID of an encyclopedia entry
```{r}
# just read in the lines for an entry. Thee's a line like this near top:
# <title>Exchequer offices (The Diary of Samuel Pepys)</title>
# Find that line and extract the title.
# test: entry_title("6627x") entry_title("6627") entry_title("art")
entry_title <- function(id) {
# possibly_url = possibly(url, otherwise = NULL)
# xx <- possibly_url(str_c("https://www.pepysdiary.com/encyclopedia/", id, "/"), open = "r")
# if (is.null(xx)) return(NA_character_)
# possibly_readLines <- possibly(readLines, NA_character_, quiet = TRUE)
title <- readLines(str_c("https://www.pepysdiary.com/encyclopedia/", id, "/"), n = 15)
#if (is.na(title)) return(NA_character_)
title <- title %>%
keep(function(x) str_detect(x, "<title>")) %>%
str_extract("(?<=<title>).+(?= \\(The Diary)")
if (title == "") return(NA_character_)
if (is.null(title)) return(NA_character_)
return(title)
}
```
```{r pepys_entry_page}
#test: pepys_entry_page(entry_df$date[20])
pepys_entry_page_original <- function(d) {
read_html(str_c("https://www.pepysdiary.com/diary/", sprintf("%04d/%02d/%02d/", year(d), month(d), day(d)))) %>%
html_nodes(".manuscript-large a") %>% # see vignette("selectorgadget")
html_attrs() %>%
keep(function(x) str_detect(x, "www.pepysdiary.com/encyclopedia")) %>% # verify link is to encyclopedia entry
str_extract("\\d+") # extract id from entry
}
# redo, but this time using api
# test: xx <- pepys_entry_page(ymd("1666-09-02"))
pepys_entry_page <- function(d) {
# API URL: https://www.pepysdiary.com/api/v1/entries/1666-09-02
fromJSON(str_c("https://www.pepysdiary.com/api/v1/entries/", sprintf("%04d-%02d-%02d", year(d), month(d), day(d))))$entry_html %>%
read_html() %>%
html_nodes("a") %>% # see vignette("selectorgadget")
html_attrs() %>%
keep(function(x) str_detect(x, "www.pepysdiary.com/encyclopedia")) %>% # verify link is to encyclopedia entry
str_extract("\\d+") # extract id from entry
}
# Return df rather than vector of ids
# test: xx <- pepys_entry_page_df(ymd("1666-09-02"))
pepys_entry_page_df <- function(d) {
ids <- fromJSON(str_c("https://www.pepysdiary.com/api/v1/entries/", sprintf("%04d-%02d-%02d", year(d), month(d), day(d))))$entry_html %>%
read_html() %>%
html_nodes("a") %>% # see vignette("selectorgadget")
html_attrs() %>%
keep(function(x) str_detect(x, "www.pepysdiary.com/encyclopedia")) %>% # verify link is to encyclopedia entry
str_extract("\\d+") # extract id from entry
return(data_frame(date = d, id = ids))
}
```
```{r}
```
```{r}
# accumulate info for 150 entries
several <- map(entry_df$date[1:150], pepys_entry_page)
save(several, file = "several list of 150 entries with ids.RData")
```
OK, so we have our building blocks. Let's use purrr to apply the pepys_entry_page function to a number of entries and gather up the result.
```{r}
# This code works, but it takes an unreasonably long amount of
# time to run. For practical purposes, I need something faster.
# Whoa, I forgot that pepys_entry_page goes off and reads
# an hrml page each time it is called. That's what is slow.
refs <- map2_dfr(
entry_df$date[1:150],
map(entry_df$date[1:150], pepys_entry_page),
function(x, y) {
data_frame(date = x, id = y)
})
save(refs, file = "refs for 1-150.RData")
tidy_pepys_objects <- refs %>% group_by(date, id) %>% count() %>% left_join(people_df, by = c("id" = "entry_id"))
```
```{r}
stop("This is busted and just wastes time.")
# let's try an alternate version. this has an execution error
system.time(refs2 <- map_dfr(
entry_df$date,
pepys_entry_page_df
))
```
```{r}
# find names for common non-people objects
objects1 <- tidy_pepys_objects %>% group_by(id) %>% filter(is.na(name)) %>%
tally() %>% arrange(desc(nn))
system.time(object_names <- map(objects1$id, entry_title))
add_names <- data_frame(id = objects1$id, object_names = flatten_chr(object_names) )
save(add_names, file = "add_names of non-people.RData")
tidy_pepys_objects <- tidy_pepys_objects %>% left_join(add_names, by = c("id")) %>%
mutate(name = ifelse(is.na(name), object_names, name))
```
```{r}
#date exploration
d1 <- ymd_hm("16600101 00-00")
r1 <- as.double(d1)
r2 <- as.double(entry_df$date[1])
as.POSIXct(r1, origin = lubridate::origin, tz = "UTC")
```
```{r}
zz <- GET("https://www.pepysdiary.com/api/v1/entries/1666-09-02")
```