forked from Yukaii/CrawlerMaster
-
Notifications
You must be signed in to change notification settings - Fork 0
/
hcu_course_crawler.rb
117 lines (104 loc) · 3.81 KB
/
hcu_course_crawler.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
##
# Course crawler for Hsuan Chuang University (玄奘大學).
# http://hrs.hcu.edu.tw/strategy/std/index.asp
#
# Posts the academic year and term to the school's course-search form,
# decodes the Big5 response and turns every result row into a course hash.
#
module CourseCrawler::Crawlers
  class HcuCourseCrawler < CourseCrawler::Base
    # Endpoint that receives the search form and returns the course table.
    RESULT_URL = 'http://hrs.hcu.edu.tw/strategy/std/index2.asp'.freeze

    # Rows of the result table that are treated as course rows.
    ROW_SELECTOR = 'tbody tr[height="30"]'.freeze

    # One "<day><period>" token inside the time cell: a single digit for the
    # day, an optional padding zero, then the period number.
    TIME_SLOT_PATTERN = /(?<day>\d)[0]?(?<period>\d+)/

    # "單" / "雙" marker inside the time cell; it is appended to the
    # department name. NOTE(review): presumably odd/even-week courses - confirm.
    WEEK_MARK_PATTERN = /[單雙]/

    # Substring of the requirement cell that marks a required course.
    REQUIRED_MARK = '必'.freeze

    # Highest cell index read from a row is 12, so 13 cells are needed.
    MIN_CELLS = 13

    # Number of day_N / period_N / location_N columns emitted per course.
    MAX_SLOTS = 9

    # @param year [Integer, nil] Gregorian year; falls back to +current_year+
    #   (not defined in this file - presumably inherited from CourseCrawler::Base)
    # @param term [Integer, nil] term number (1 = first term, 2 = second term);
    #   falls back to +current_term+ (same caveat as +current_year+)
    # @param update_progress [#call, nil] stored as-is; this class never calls it
    # @param after_each [#call, nil] called with +course:+ after each row is parsed
    def initialize(year: nil, term: nil, update_progress: nil, after_each: nil)
      @year = year || current_year
      @term = term || current_term
      @update_progress_proc = update_progress
      @after_each_proc = after_each

      @query_url = 'http://hrs.hcu.edu.tw/strategy/std/index.asp'
      # The site serves Big5; undecodable bytes are dropped during conversion.
      @ic = Iconv.new('utf-8//IGNORE//translit', 'big5')
    end

    # Crawls the course list for the configured year and term.
    #
    # Rows that do not carry at least MIN_CELLS data cells are skipped with a
    # warning instead of aborting the whole crawl with a NoMethodError.
    #
    # @return [Array<Hash>] one hash per parsed course row (also kept in @courses)
    def courses
      @courses = []

      # The body of this first response is not used. The request is kept
      # because the crawler has always issued it before posting the form.
      # TODO(review): confirm whether the server actually requires it.
      RestClient.get(@query_url)
      puts "get url ..."

      doc = fetch_result_document
      puts "crawling data ..."

      doc.css(ROW_SELECTOR).each_with_index do |tr, index|
        count = index + 1
        # Progress trace: first row, then every fifth row.
        puts "data crawled : #{count}" if count == 1 || count % 5 == 0

        course = build_course(tr)
        if course.nil?
          warn "skipping row #{count}: fewer than #{MIN_CELLS} data cells"
          next
        end

        @after_each_proc.call(course: course) if @after_each_proc
        @courses << course
      end

      puts "Project finished !!!"
      @courses
    end

    private

    # Posts the search form and returns the response as a parsed HTML document.
    # Only the year and term fields are sent; the form's other fields
    # (s1, s22, s2, s6, s7, s8) are left unset.
    def fetch_result_document
      response = RestClient.post(RESULT_URL, {
        "yy" => @year - 1911, # the form takes the year minus 1911 (presumably the ROC calendar year)
        "mm" => @term,
      })
      Nokogiri::HTML(@ic.iconv(response))
    end

    # Builds the course hash for one table row, or returns nil when the row
    # has too few cells to be a course row.
    def build_course(tr)
      # Text of every cell from the third child onwards; the indexes below
      # are relative to this array.
      data = tr.css('td:nth-child(n+3)').map { |td| td.text }
      return nil if data.size < MIN_CELLS

      slots = data[5].scan(TIME_SLOT_PATTERN)
      days = slots.map { |day, _period| day.to_i }
      periods = slots.map { |_day, period| period.to_i }
      # Every slot of a course shares the single location cell, so it is
      # cleaned once and only when there is at least one slot.
      location = slots.empty? ? nil : power_strip(data[6])
      locations = Array.new(slots.size, location)

      # Short digest of the third <td>, used to keep otherwise equal codes apart.
      cla_code = Digest::MD5.hexdigest(tr.css('td')[2].text)[0..5]

      course = {
        year: @year,                                            # Gregorian year
        term: @term,                                            # 1 = first term, 2 = second term
        name: data[3],                                          # course name
        lecturer: data[4],                                      # lecturer
        credits: data[7].to_i,                                  # number of credits
        code: "#{@year}-#{@term}-#{data[1]}_#{data[2]}_#{cla_code}",
        general_code: data[2].to_s,                             # course selection code
        required: data[12].include?(REQUIRED_MARK),             # required (true) or elective (false)
        department: "#{data[0]}#{data[5][WEEK_MARK_PATTERN]}",  # offering department plus week marker
      }
      course.merge!(numbered_columns(:day, days))
      course.merge!(numbered_columns(:period, periods))
      course.merge!(numbered_columns(:location, locations))
      course
    end

    # Expands +values+ into { prefix_1: values[0], ..., prefix_9: values[8] },
    # using nil for slots beyond the end of +values+.
    def numbered_columns(prefix, values)
      (1..MAX_SLOTS).each_with_object({}) do |n, columns|
        columns[:"#{prefix}_#{n}"] = values[n - 1]
      end
    end
  end
end