-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_socrata.php
114 lines (79 loc) · 2.8 KB
/
get_socrata.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
<?php
/*
This program scrapes the healthcare-related datasets from Socrata's meta site
and then creates a list of the datasets, organized by originating domain name.
Many of the items under the 'health' tag on the Socrata site have little to do with healthcare,
given that almost everything could impact health, including crime, waste management and ecology.
As a result, many of the 'health' tag results from the website are not really 'healthcare' per se.
To handle this, the socrata_name_filter.php file contains an array of string values that will cause
matching datasets to be filtered out.
NOTE(review): the original sentence was left unfinished, so "filtered out" is an assumption,
and this script does not appear to load socrata_name_filter.php - confirm.
*/
// ---------------------------------------------------------------------------
// Step 1: page through the Socrata catalog API (category "health") and group
// every returned dataset by the domain that published it.
// ---------------------------------------------------------------------------

// Number of results requested per call. The offset MUST advance by this same
// amount; stepping further than the page size silently skips catalog entries.
$page_size = 100;
$base_url = "http://api.us.socrata.com/api/catalog/v1?categories=health&limit={$page_size}&offset=";
$is_more = true;
$offset = 0;
// domain => list of ['name' => ..., 'description' => ..., 'permalink' => ...]
$dataset_by_domain = [];
// domain => [attribution => attribution]; keyed by the value itself so that
// repeated attributions for one domain collapse into a single entry.
$domain_attributions = [];

while($is_more){
    $url = $base_url . $offset;
    $json = file_get_contents($url);
    if($json === false){
        // Network/HTTP failure: keep whatever was collected so far and stop paging.
        error_log("Failed to fetch $url; stopping with the datasets collected so far.");
        break;
    }

    $data = json_decode($json,true);
    $has_results = is_array($data) && isset($data['results']) && is_array($data['results']);
    $results = $has_results ? $data['results'] : [];

    if(count($results) === 0){
        // An empty (or malformed) page marks the end of the catalog.
        $is_more = false;
    }

    foreach($results as $this_result){
        if(!isset($this_result['metadata']['domain'])){
            // Without a domain the dataset cannot be grouped; skip it.
            continue;
        }
        $domain = $this_result['metadata']['domain'];

        $resource = (isset($this_result['resource']) && is_array($this_result['resource']))
            ? $this_result['resource']
            : [];
        // These fields may be absent or null in the API response; normalise
        // them to empty strings so later string handling never sees null.
        $name = isset($resource['name']) ? $resource['name'] : '';
        $description = isset($resource['description']) ? $resource['description'] : '';
        $attribution = isset($resource['attribution']) ? $resource['attribution'] : '';
        $permalink = isset($this_result['permalink']) ? $this_result['permalink'] : '';

        $dataset_by_domain[$domain][] = [
            'name' => $name,
            'description' => $description,
            'permalink' => $permalink,
        ];
        $domain_attributions[$domain][$attribution] = $attribution;
    }

    $offset += $page_size;
    // Progress indicator: one dot per page requested.
    echo '.';
}
// Heading of the top-level index document; per-domain bullets are appended to
// this string further down.
$markdown = '
Socrata Healthcare Datasets
======================
';

// Order the domains from most datasets to fewest while keeping the domain
// names as array keys.
// Comparator approach based on https://stackoverflow.com/a/14704792/144364
uasort($dataset_by_domain, function ($first_list, $second_list) {
    // Negative when the first list is longer, so larger lists sort earlier.
    return count($second_list) - count($first_list);
});
// ---------------------------------------------------------------------------
// Step 3: write one markdown file per domain into $socrata_dir, plus a single
// index file (SocrataHealthDataSets.md) that links to each of them.
// ---------------------------------------------------------------------------
$socrata_dir = './socrata_markdown/';

// file_put_contents() does not create missing directories, so make sure the
// output directory exists before any per-domain file is written.
if(!is_dir($socrata_dir) && !mkdir($socrata_dir, 0777, true) && !is_dir($socrata_dir)){
    error_log("Could not create output directory $socrata_dir");
}

foreach($dataset_by_domain as $domain => $dataset_list){
    $dataset_count = count($dataset_list);
    $domain_file = $socrata_dir.$domain.'.md';

    // Index entry: link to the per-domain file and show how many datasets it lists.
    $markdown .= "\n* [$domain]($domain_file) $dataset_count health datasets\n";

    // Nested bullets naming the distinct, non-blank attributions for this domain.
    // $c is the separator: empty before the first bullet, a newline afterwards.
    $c = '';
    $attributions = isset($domain_attributions[$domain]) ? $domain_attributions[$domain] : [];
    foreach($attributions as $attribution){
        // Cast guards against null attributions, which trim() rejects on PHP 8.1+.
        $attribution = (string) $attribution;
        if(trim($attribution) !== ''){
            $markdown .= "$c * $attribution ";
            $c = "\n";
        }
    }

    // Per-domain document: a heading followed by one bullet per dataset.
    $domain_markdown = "# $domain health datasets\n";
    foreach($dataset_list as $this_dataset){
        $name = $this_dataset['name'];
        // Collapse runs of whitespace (including newlines) so that each
        // description stays on its bullet's single line.
        $description = trim(preg_replace('/\s\s+/', ' ', (string) $this_dataset['description']));
        $permalink = $this_dataset['permalink'];
        $domain_markdown .= "* [$name]($permalink) - $description\n";
    }

    if(file_put_contents($domain_file,$domain_markdown) === false){
        error_log("Could not write $domain_file");
    }
}

if(file_put_contents('SocrataHealthDataSets.md',$markdown) === false){
    error_log('Could not write SocrataHealthDataSets.md');
}