-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathlowestCrawler.rb
105 lines (92 loc) · 2.59 KB
/
lowestCrawler.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
require "rubygems"
require "nokogiri"
require "open-uri"
require "csv"
require "date"
# Fetches one podcast page and writes one CSV line per "latest" episode to f2.
#
# The page at +url+ is downloaded and parsed with Nokogiri. The first <div>
# carrying a "podcast-id" attribute supplies the podcast id and name; when no
# such <div> exists nothing is written. Episode rows are the <tr> elements
# under div.tracklist-content-box > tbody. Only the leading rows that share
# the first row's release-date text are emitted.
#
# Each line has the shape:
#   podcastId,"podcastName","artist","link","episodeName","timestamp","description"
#
# @param url [String] address of the podcast page to fetch
# @param f2 [#puts] open output stream that receives the CSV lines
# @return [nil]
def processLowestLevelPage(url, f2)
  # URI.open (rather than bare Kernel#open) is required on Ruby 3.0+, and the
  # block form closes the downloaded handle as soon as parsing is done.
  page = URI.open(url) { |io| Nokogiri::HTML(io) }

  podcast_info = page.css("div").find { |div| div['podcast-id'] }
  return if podcast_info.nil?

  podcast_id = podcast_info['podcast-id']
  podcast_name = quote_optional_csv_field(podcast_info['podcast-name'])

  rows = page.css("div[class=tracklist-content-box]").css("tbody").css("tr")
  latest_episode_rows(rows).each_with_index do |row, index|
    # Progress marker printed once when more than one row shares the date.
    puts "duplicate!!" if index == 1
    f2.puts episode_csv_line(podcast_id, podcast_name, row)
  end
  nil
end

# Strips carriage returns, newlines, commas and double quotes from +text+,
# trims surrounding whitespace and wraps the result in double quotes.
# NOTE: the comma inside the character class means commas are removed as well.
def quote_csv_field(text)
  '"' + text.gsub(/[\r,\n,"]/, '').strip + '"'
end

# Quotes an optional attribute value; a missing (nil) value becomes an empty,
# unquoted field so one row never inherits another row's data.
def quote_optional_csv_field(value)
  value.nil? ? "" : quote_csv_field(value)
end

# Returns the stripped text of a row's release-date cell ("" when absent).
def release_date_text(row)
  row.css("td[class=release-date]").text.strip
end

# Returns the leading rows whose release-date text equals the first row's.
# Comparing text (not node sets) is what makes equal dates match, and
# take_while cannot run past the end of the list.
def latest_episode_rows(rows)
  first_row = rows[0]
  return [] if first_row.nil?

  newest_date = release_date_text(first_row)
  rows.take_while { |row| release_date_text(row) == newest_date }
end

# Builds the CSV line for a single episode row.
def episode_csv_line(podcast_id, podcast_name, row)
  [
    podcast_id,
    podcast_name,
    quote_optional_csv_field(row['preview-artist']),
    quote_optional_csv_field(row['audio-preview-url']),
    quote_csv_field(row.css("td[class='name flexible-col']").text),
    '"' + DateTime.now.to_s + '"',
    quote_csv_field(row.css("td[class='description flexible-col']").text)
  ].join(",")
end
# Script driver: reads crawl_data_table.csv and, for every row whose fifth
# column equals "1", passes the row's second column to processLowestLevelPage
# as the page URL. Results go to a new file named
# "dataForUser<timestamp>.csv" in the current directory.
# NOTE(review): column 4 presumably flags rows that should be crawled; confirm
# against whatever produces crawl_data_table.csv.
#
# The block form of File.open guarantees the output file is flushed and closed
# even when a page fetch or the CSV read raises part-way through.
File.open("dataForUser"+DateTime.now.to_s+".csv", "w") do |f2|
  count = 0
  CSV.foreach("crawl_data_table.csv") do |row|
    next unless row[4] == "1"

    count += 1
    print "processing num:"
    puts count
    processLowestLevelPage(row[1], f2)
  end
end