-
Notifications
You must be signed in to change notification settings - Fork 0
/
pdf_scraping.R
36 lines (29 loc) · 1.26 KB
/
pdf_scraping.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
library(tidyverse)
library(pdftools)
library(fs)
# Tibble printing limits: a tibble with more than 60 rows is printed
# truncated to its first 50 rows; shorter tibbles are printed in full.
options(
  tibble.print_min = 50,
  tibble.print_max = 60
)
# Paths of the PDF files sitting directly in the "pdfs" directory.
files <- dir_ls("pdfs", glob = "*.pdf")

# Fail early with a clear message instead of a confusing "column not found"
# error further down the pipeline when there is nothing to parse.
if (length(files) == 0) {
  stop("No PDF files found in the 'pdfs' directory.", call. = FALSE)
}

# Extract the text of every PDF: one list element per file, each holding the
# page texts returned by pdf_text(). Mapping over `files` (rather than over
# the whole directory) keeps non-PDF entries from being handed to pdf_text().
raw_cm <- map(files, pdf_text)
# Stack the page texts of all documents into one long tibble with columns
# `page` (index of the page within its document) and `comment` (page text).
# The first page of every document is discarded (presumably a cover or
# title page -- confirm against the source PDFs).
cm <- bind_rows(
  map(raw_cm, function(doc_pages) {
    page_tbl <- enframe(unlist(doc_pages), name = "page", value = "comment")
    slice(page_tbl, -1)
  })
)
# Build one row per comment from the per-page text in `cm`.
#
# Steps:
#   1. A page whose text starts with "COMMENT #: <digits>" opens a new
#      comment; its number is pulled out and converted to numeric.
#   2. The running "Sept 2022 ... Little Cottonwood Canyon Final EIS" line is
#      stripped from every page.
#   3. Pages without a header inherit the number of the preceding comment
#      (pages before the very first header keep NA), and all pages of a
#      comment are pasted together.
#   4. The combined text is split into its labelled fields.
#
# Result columns: comment_num (numeric), date, source, name, text.
parsed <- cm %>%
  mutate(
    # Evaluated in order: the number is read from the untouched page text
    # before `comment` itself is cleaned below.
    comment_num = str_extract(comment, "^COMMENT \\#:\\s*\\d{1,5}"),
    comment_num = str_remove(comment_num, "^COMMENT \\#:\\s*"),
    comment_num = as.numeric(comment_num),
    comment = str_remove_all(comment, "Sept 2022.*Little Cottonwood Canyon Final EIS")
  ) %>%
  fill(comment_num, .direction = "down") %>%
  group_by(comment_num) %>%
  summarize(comment = paste(comment, collapse = " ")) %>%
  ungroup() %>%
  separate(
    comment,
    into = c("cnum", "date", "source", "name", "text"),
    sep = "DATE:|\nSOURCE:|\nNAME:|\nCOMMENT:",
    # Split at most into the five expected fields. Without this, a comment
    # body that itself contains a delimiter (e.g. "UPDATE:" matches "DATE:")
    # would be cut off at that point and the remainder silently dropped.
    extra = "merge"
  ) %>%
  # Squish whitespace in the text fields only, so comment_num stays numeric
  # instead of being coerced back to character.
  mutate(across(where(is.character), str_squish)) %>%
  select(-cnum)
# Long table of the parenthesised "(32.x...)" codes found in each comment's
# text: one row per code occurrence, keyed by comment_num, date and source.
# Comments in which no code is found contribute no rows.
codes <- parsed %>%
  mutate(
    codes = str_extract_all(text, "\\(32\\.\\d*\\w\\.?\\d?\\w?\\)")
  ) %>%
  select(-name, -text) %>%
  unnest(cols = codes)
# Write both result tables to CSV files in the working directory.
walk2(
  list(parsed, codes),
  c("comments table.csv", "comment codes.csv"),
  write_csv
)