-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathUnimelb_Scrape
84 lines (66 loc) · 3.3 KB
/
Unimelb_Scrape
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
library(tidyverse)
library(rvest)
library(googlesheets)
library(stringr)
# ---- Configuration ----------------------------------------------------------

# Look up the Google Sheet titled "FVAS_Staff". The returned handle is not
# referenced again in the code below.
FVASdatabase <- gs_title(x = "FVAS_Staff")

# Find an Expert "display/org..." pages that can be scraped.
FVAS_ID <- "https://www.findanexpert.unimelb.edu.au/display/org6277"
Ag_ID <- "https://www.findanexpert.unimelb.edu.au/display/org6398"
VetBS <- "https://www.findanexpert.unimelb.edu.au/display/org6400"
VetCS <- "https://www.findanexpert.unimelb.edu.au/display/org6399"
VetHospital <- "https://www.findanexpert.unimelb.edu.au/display/org184"
# Example person page:
# https://www.findanexpert.unimelb.edu.au/display/person412220

# `orgID` selects which page is scraped. It was previously read without ever
# being assigned, so a fresh session failed with "object 'orgID' not found".
# Keep a value set beforehand (e.g. interactively); otherwise use FVAS_ID.
if (!exists("orgID")) {
  orgID <- FVAS_ID
}

# ---- Organisation page ------------------------------------------------------

html <- read_html(x = orgID)

# Nodes carrying the "property-list" class; not used further below.
staffid <- html %>%
  html_nodes(css = ".property-list")

# Header-only placeholder; `dat` is rebuilt from the scraped columns at the end
# of the script. (No remove() first: the assignment already replaces any old
# value, and remove() warns when `dat` does not exist yet.)
dat <- rbind(c("Name", "Link", "Department", "Role", "PhD", "Year"))

# Every <a> inside an <li>, minus the first 7 and the last 4 matches.
# NOTE(review): the offsets presumably skip navigation/footer links -- confirm
# against the live page layout.
staff_anchors <- html %>%
  html_nodes(css = "li") %>%
  html_nodes("a") %>%
  tail(-7) %>%
  head(-4)

if (length(staff_anchors) == 0) {
  stop("No staff links found on ", orgID, call. = FALSE)
}

# Absolute URL of each person page.
LinkList <- paste0(
  "https://www.findanexpert.unimelb.edu.au",
  html_attr(staff_anchors, "href")
)

# Link text split at the first space: column 1 -> TitleList, rest -> NameList.
name_parts <- staff_anchors %>%
  html_text() %>%
  str_split(pattern = " ", n = 2, simplify = TRUE)
TitleList <- name_parts[, 1]
NameList <- name_parts[, 2]

# Page heading (h1.fn) with newlines, carets, double quotes and repeated
# whitespace collapsed to single spaces.
Department <- html %>%
  html_nodes(css = "h1.fn") %>%
  html_text() %>%
  str_replace_all(pattern = "\n", replacement = " ") %>%
  str_replace_all(pattern = "[\\^]", replacement = " ") %>%
  str_replace_all(pattern = "\"", replacement = " ") %>%
  str_replace_all(pattern = "\\s+", replacement = " ") %>%
  str_trim(side = "both")

# Text after the first comma of each <li>, minus the first 7 and last 6 items.
position_parts <- html %>%
  html_nodes(css = "li") %>%
  tail(-7) %>%
  head(-6) %>%
  html_text() %>%
  str_split(pattern = ",", n = 2, simplify = TRUE)
Position <- position_parts[, 2] %>%
  str_replace_all(pattern = "\n", replacement = "") %>%
  str_trim(side = "both")

# Position and LinkList are trimmed with different offsets (-6 vs -4); cbind()
# would recycle the shorter one, so flag any mismatch here.
if (length(Position) != length(LinkList)) {
  warning(
    "Found ", length(Position), " positions but ", length(LinkList),
    " staff links; columns will be misaligned when combined.",
    call. = FALSE
  )
}
# ---- PhD details for each staff member --------------------------------------

# XPath for the list(s) following the "Education and training" heading on a
# person page.
education_xpath <-
  "//h3[contains(., 'Education and training')]/following-sibling::ul"

# One row per entry in LinkList, filled in by index below (no rbind() growth).
# NA in both columns means the person page could not be read.
PhDdetails <- data.frame(
  PhD = rep(NA, length(LinkList)),
  Year = rep(NA_character_, length(LinkList)),
  stringsAsFactors = FALSE
)

for (i in seq_along(LinkList)) {
  link <- LinkList[[i]]
  print(link)

  # A single unreachable page should not abort the whole (slow) scrape.
  personhtml <- tryCatch(
    read_html(link),
    error = function(e) {
      warning(
        "Could not read ", link, ": ", conditionMessage(e),
        call. = FALSE
      )
      NULL
    }
  )

  if (!is.null(personhtml)) {
    # Whitespace-normalised text of every education list item.
    education <- personhtml %>%
      html_nodes(xpath = education_xpath) %>%
      html_nodes(css = "li") %>%
      html_text() %>%
      str_replace_all(pattern = "\\s+", replacement = " ") %>%
      str_trim(side = "both")

    # Entries mentioning "PhD", "Ph.D" or "Ph D" in any letter case.
    is_phd <- str_detect(
      education,
      regex("\\bPh\\.?\\s?D\\b", ignore_case = TRUE)
    )
    phd_entries <- education[is_phd]

    # Always a single TRUE/FALSE, even when several entries match.
    hasPhD <- length(phd_entries) > 0
    print(hasPhD)

    PhDYear <- NA_character_
    if (hasPhD) {
      # Last four-digit year (19xx/20xx) in the first PhD entry, if any.
      years <- str_extract_all(phd_entries[[1]], "\\b(?:19|20)\\d{2}\\b")[[1]]
      if (length(years) > 0) {
        PhDYear <- years[[length(years)]]
      }
    }

    PhDdetails$PhD[i] <- hasPhD
    PhDdetails$Year[i] <- PhDYear
  }

  # Pause between requests.
  Sys.sleep(10)
}

# Final character matrix: the original five columns, followed by the PhD flag
# and year collected above.
dat <- cbind(
  TitleList, NameList, Position, Department, LinkList,
  PhD = PhDdetails$PhD,
  Year = PhDdetails$Year
)