# Scrape text data from People's Daily
# Import general packages:
library(rvest)
library(tidyverse)
library(stringr)
##############################################################
# Functions to scrape data and manage the subfolders
# 0. clean_folder()
# 1. create_folder()
# 2. find_title()  (helper)
# 3. scraping()
# 4. collecting()
##############################################################
# FUNCTION: clean_folder()
# Input: folder, date_start, date_end
#   date_start and date_end = string; the date format must be "XXXX(year)-XX(month)-XX(day)".
# Output: no return value; deletes the contents of each daily subfolder under data/<folder>/
clean_folder <- function(folder, date_start, date_end) {
  basedirct <- str_c("data/", folder, "/")
  dates <- seq(as.Date(date_start), as.Date(date_end), by = 1)
  for (d in seq_along(dates)) {
    path <- str_c(basedirct, dates[d])
    unlink(path, recursive = TRUE)
  }
}
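# Usage sketch (hypothetical date range): drop one week of cached raw data
# before re-scraping it.
# clean_folder("rawdata", "1988-01-01", "1988-01-07")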
# FUNCTION: create_folder()
# Input: string date "XXXX-XX-XX"
# Output: creates a new subfolder under data/rawdata/ if it does not already exist.
create_folder <- function(date) {
  basedirct <- "data/rawdata/"
  path <- str_c(basedirct, date)
  # create the subfolder for this day, unless it already exists
  if (!dir.exists(path)) {
    dir.create(path, recursive = TRUE)
  }
}
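# Usage sketch: create_folder("1988-01-01") creates data/rawdata/1988-01-01
# (and data/rawdata/ itself, if needed) and does nothing when it already exists.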
# FUNCTION: find_title()
# Input: a character vector, the article text split on whitespace
# Output: title = string, the title of the article
# Note: in the scraped data, the first non-empty element is always the description
# of the news item, e.g. "frontpage-important news" or "frontpage-photograph".
# Some articles have no title, so this function takes the second non-empty element
# as the title; if that element is longer than 30 characters, the article is
# considered untitled and "notitle" is assigned instead.
find_title <- function(list) {
  j <- 0
  for (i in seq_along(list)) {
    if (list[i] != "") {
      j <- j + 1
      if (j == 2) {
        # title-length threshold raised from 20 to 30 characters
        if (nchar(list[i]) < 30) {
          title <- list[i]
          break
        } else {
          title <- "notitle"
          break
        }
      } else {
        # "empty" instead of "empty_article": the underscore caused problems
        # when the title was later split and used as an index
        title <- "empty"
      }
    } else {
      title <- "empty"
    }
  }
  return(title)
}
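# Worked sketch (hypothetical input): the first non-empty element is the
# description; the second becomes the title when it is under 30 characters.
# rawtext <- c("", "frontpage-important", "", "Some-Title", "body", "text")
# find_title(rawtext)   # returns "Some-Title"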
# FUNCTION: scraping()
# Associated functions: create_folder(), find_title()
# Input: date = string "XXXX-XX-XX"; pagenumber = int.
# Output: none.
# File generated: writes the scraped text of each article, with leading and
# trailing whitespace (e.g. "\t\t" and "\t\t\n") stripped, into the folder
# created by create_folder() as a .txt file.
scraping <- function(date, pagenumber) {
  # create a folder for this date under data/rawdata/
  create_folder(date)
  # initialise the base directory and base url
  basedirct <- "data/rawdata/"
  baseurl <- "http://www.ziliaoku.org/rmrb/"
  # generate the url for scraping
  url <- str_c(baseurl, date, "-", toString(pagenumber))
  # extract the content of this page of People's Daily on the given date
  article <- read_html(url)
  text <- article %>% html_nodes(".article") %>% html_text()
  # text is a character vector; each element holds one article
  for (i in seq_along(text)) {
    # split the article on whitespace so find_title() can locate the title
    rawtext <- unlist(strsplit(text[i], '\\s'))
    # assign the second non-empty piece as the title
    title <- find_title(rawtext)
    filename <- str_c(date, "-", pagenumber, "_", toString(i), "_", title)
    # strip whitespace at the beginning and end of the article
    text[i] <- str_trim(text[i], side = 'both')
    # write the article into a .txt file in the date's subfolder
    filedirct <- str_c(basedirct, date, "/", filename, ".txt")
    # (alternative: write.table(text[i], file = filedirct, row.names = F, quote = F))
    writeLines(text[i], filedirct)
  }
}
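# Usage sketch (hypothetical date/page): scrape page 1 of 1988-01-01 into
# data/rawdata/1988-01-01/, one .txt file per article.
# scraping("1988-01-01", 1)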
# FUNCTION: collecting() scrapes the given pages of People's Daily over a time period.
# Associated function: scraping()
# Input:
#   date_start and date_end = string; the date format must be "XXXX(year)-XX(month)-XX(day)".
#   pagelist = a vector of page numbers
# Output:
#   saves the text of every article on the specified pages for each date
#   into the corresponding subfolder under data/rawdata/
collecting <- function(date_start, date_end, pagelist) {
  # create a date sequence from date_start to date_end
  dates <- seq(as.Date(date_start), as.Date(date_end), by = 1)
  for (d in seq_along(dates)) {
    for (p in seq_along(pagelist)) {
      scraping(dates[d], pagelist[p])
    }
  }
}
##############################################################
# Scraping Data from "http://www.ziliaoku.org/rmrb/"
##############################################################
# Note: scraping two years of data takes roughly 10-20 minutes.
# clean_folder("segtext", "1989-01-01", "1990-12-31")
# Fundamental parameters:
# basedirct <- "data/rawdata/"
# baseurl <- "http://www.ziliaoku.org/rmrb/"
# pagelist <- c(1)
# collecting("1988-01-01", "1988-12-31", pagelist)
# Known problem: 1986-01-06, 1986-02-02, ... fail because those days contain
# many empty articles.
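# A possible workaround for the failing dates (a sketch, not part of the
# original pipeline; collecting_safe is a hypothetical helper name): wrap
# scraping() in tryCatch() so one bad day does not abort the whole run.
collecting_safe <- function(date_start, date_end, pagelist) {
  dates <- seq(as.Date(date_start), as.Date(date_end), by = 1)
  for (d in seq_along(dates)) {
    for (p in seq_along(pagelist)) {
      tryCatch(
        scraping(dates[d], pagelist[p]),
        # log the failed date/page and keep going
        error = function(e) {
          message("skipped ", dates[d], " page ", pagelist[p], ": ",
                  conditionMessage(e))
        }
      )
    }
  }
}
# collecting_safe("1986-01-01", "1986-12-31", c(1))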