Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add delete_missing option to CKAN Harvester #542 #548

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion ckanext/harvest/harvesters/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,8 @@ def _create_or_update_package(self, package_dict, harvest_object,

# Check modified date
if 'metadata_modified' not in package_dict or \
package_dict['metadata_modified'] > existing_package_dict.get('metadata_modified'):
package_dict['metadata_modified'] > existing_package_dict.get('metadata_modified') or \
existing_package_dict['state'] == 'deleted':
log.info('Package with GUID %s exists and needs to be updated' % harvest_object.guid)
# Update package
context.update({'id': package_dict['id']})
Expand Down
59 changes: 56 additions & 3 deletions ckanext/harvest/harvesters/ckanharvester.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from ckan.lib.helpers import json
from ckan.plugins import toolkit

from ckanext.harvest.model import HarvestObject
from ckanext.harvest.model import HarvestObject, HarvestObjectExtra
from .base import HarvesterBase

import logging
Expand Down Expand Up @@ -159,7 +159,7 @@ def validate_config(self, config):
except NotFound:
raise ValueError('User not found')

for key in ('read_only', 'force_all'):
for key in ('read_only', 'force_all', 'delete_missing'):
if key in config_obj:
if not isinstance(config_obj[key], bool):
raise ValueError('%s must be boolean' % key)
Expand All @@ -184,6 +184,18 @@ def gather_stage(self, harvest_job):

self._set_config(harvest_job.source.config)

# If using delete_missing, get the previous package ids for this source
delete_missing = self.config.get('delete_missing', False)
if delete_missing:
query = model.Session.query(HarvestObject.package_id) \
.filter(HarvestObject.package_id != None) \
.filter(HarvestObject.current == True) \
.filter(HarvestObject.harvest_source_id == harvest_job.source.id)

package_ids_in_db = set()
for package_id, in query:
package_ids_in_db.add(package_id)

# Get source URL
remote_ckan_base_url = harvest_job.source.url.rstrip('/')

Expand All @@ -209,10 +221,12 @@ def gather_stage(self, harvest_job):

# Ideally we can request from the remote CKAN only those datasets
# modified since the last completely successful harvest.
# With the delete_missing option enabled we have to fetch all datasets, so that ones missing from the remote can be detected
last_error_free_job = self.last_error_free_job(harvest_job)
log.debug('Last error-free job: %r', last_error_free_job)
if (last_error_free_job and
not self.config.get('force_all', False)):
not self.config.get('force_all', False) and
not delete_missing):
get_all_packages = False

# Request only the datasets modified since
Expand Down Expand Up @@ -283,6 +297,25 @@ def gather_stage(self, harvest_job):
obj.save()
object_ids.append(obj.id)

# If using delete_missing option, check for datasets that no longer
# exist
if delete_missing:
ids_to_delete = package_ids_in_db - package_ids
for pkg_id in ids_to_delete:
log.debug('Creating HarvestObject to delete dataset %s',
pkg_id)
obj = HarvestObject(
guid=pkg_id,
package_id=pkg_id,
job=harvest_job,
content='',
extras=[
HarvestObjectExtra(key='missing', value='true')
]
)
obj.save()
object_ids.append(obj.id)

return object_ids
except Exception as e:
self._save_gather_error('%r' % e.message, harvest_job)
Expand Down Expand Up @@ -385,6 +418,26 @@ def import_stage(self, harvest_object):
self._set_config(harvest_object.job.source.config)

try:
# If the delete_missing option is enabled, check whether this object
# marks the dataset for deletion
delete_missing = self.config.get('delete_missing', False)
if (delete_missing and
'missing' in [item.key for item in harvest_object.extras]):
# Delete package
toolkit.get_action('package_delete')(
base_context.copy(), {'id': harvest_object.package_id})
log.info('Deleted package {0}'.format(harvest_object.package_id))

# Update previous harvest object
model.Session.query(HarvestObject).\
filter(HarvestObject.guid == harvest_object.guid).\
filter(HarvestObject.harvest_source_id ==
harvest_object.source.id).\
filter(HarvestObject.current == True).\
update({'current': False})

return True

package_dict = json.loads(harvest_object.content)

if package_dict.get('type') == 'harvest':
Expand Down
135 changes: 133 additions & 2 deletions ckanext/harvest/tests/harvesters/mock_ckan.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ def do_GET(self):
if params['start'] != '0':
datasets = []
elif set(params.keys()) == set(['rows', 'start']):
datasets = ['dataset1', DATASETS[1]['name']]
datasets = [d['name'] for d in DATASETS]
elif set(params.keys()) == set(['fq', 'rows', 'start']) and \
params['fq'] == '-organization:org1':
datasets = [DATASETS[1]['name']]
Expand Down Expand Up @@ -489,7 +489,138 @@ def convert_dataset_to_restful_form(dataset):
"revision_id": "3bd6ced3-35b2-4b20-94e2-c596e24bc375",
"date_released": "30/7/2010",
"theme-primary": "Towns & Cities"
}
},
{
"id": "243592d4-5a74-4139-8a9a-02c8a5fd0caf",
"name": "painel-integridade-publica",
"author": None,
"author_email": None,
"creator_user_id": "a0e37bbb-9859-43f8-836a-fc40406d83de",
"isopen": True,
"license_id": "odc-pddl",
"license_title": "Open Data Commons Public Domain Dedication and License (PDDL)",
"license_url": "http://www.opendefinition.org/licenses/odc-pddl",
"maintainer": "Coordenação-Geral de Monitoramento e Avaliação (CGMAV/DIPIN/SIP)",
"maintainer_email": "[email protected]",
"metadata_created": "2023-06-12T20:57:57.479383",
"metadata_modified": "2023-06-12T20:57:57.479390",
"notes": "O Painel Integridade Pública permite conferir informações sobre a estruturação, "
"a execução e o monitoramento de programas de integridade em órgãos e entidades "
"do Governo Federal",
"num_resources": 3,
"num_tags": 3,
"organization": {
"id": "539d7aca-df8e-4033-ad9d-191444f1322d",
"name": "controladoria-geral-da-uniao",
"title": "Controladoria-Geral da União",
"type": "organization",
"description": "A Controladoria-Geral da União (CGU) é o órgão"
"de controle interno do Governo Federal",
"image_url": "https://dados.gov.br/api/publico/s3/19987297-90c6-472b-9ee2-c2106fb6afbb.png",
"created": "2016-08-19T08:03:46.882221",
"is_organization": True,
"approval_status": "approved",
"state": "active"
},
"owner_org": "539d7aca-df8e-4033-ad9d-191444f1322d",
"private": False,
"state": "active",
"title": "Painel Integridade Pública",
"type": "dataset",
"url": None,
"version": "1.0",
"extras": [],
"groups": [],
"resources": [
{
"cache_last_updated": None,
"cache_url": None,
"created": "2023-06-12T20:57:57.618980",
"description": "Mapeamento",
"format": "CSV",
"hash": "",
"id": "309d3d05-3a30-45fd-a4b1-caf8c53e94fc",
"last_modified": None,
"mimetype": None,
"mimetype_inner": None,
"name": "2023",
"package_id": "243592d4-5a74-4139-8a9a-02c8a5fd0caf",
"position": 0,
"resource_type": None,
"size": 0,
"state": "active",
"url": "https://dadosabertos-download.cgu.gov.br/IntegridadePublica/Dados_Mapeamento_CSV.csv",
"url_type": None,
},
{
"cache_last_updated": None,
"cache_url": None,
"created": "2023-06-12T20:57:57.618985",
"description": "Dados coletados pela CGU",
"format": "CSV",
"hash": "",
"id": "71433fcb-a6a9-433e-a3fc-d9f8acbc75b8",
"idTipo": 1,
"last_modified": None,
"mimetype": None,
"mimetype_inner": None,
"name": "2022",
"package_id": "243592d4-5a74-4139-8a9a-02c8a5fd0caf",
"position": 1,
"resource_type": None,
"size": 0,
"state": "active",
"url": "https://dadosabertos-download.cgu.gov.br/IntegridadePublica/Dados_Etapa_2_CSV.csv",
"url_type": None,
},
{
"cache_last_updated": None,
"cache_url": None,
"created": "2023-06-12T20:57:57.618987",
"description": "Dados coletados pela CGU",
"format": "CSV",
"hash": "",
"id": "88554739-e83d-44fe-b6ad-f8e49ad7a1e6",
"last_modified": None,
"metadata_modified": "2023-06-12T20:57:57.380779",
"mimetype": None,
"mimetype_inner": None,
"name": "2021",
"package_id": "243592d4-5a74-4139-8a9a-02c8a5fd0caf",
"position": 2,
"resource_type": None,
"size": 0,
"state": "active",
"url": "https://dadosabertos-download.cgu.gov.br/IntegridadePublica/Dados_Etapa_1_CSV.csv",
"url_type": None,
},
],
"tags": [
{
"display_name": "Integridade",
"id": "a709b28c-936c-4714-99ef-cd77cb144b9f",
"name": "Integridade",
"state": "active",
"vocabulary_id": None,
},
{
"display_name": "Integridade Pública",
"id": "2b52014c-da45-4837-80de-0dfe467a9a51",
"name": "Integridade Pública",
"state": "active",
"vocabulary_id": None,
},
{
"display_name": "Programa de Integridade",
"id": "ee16b70a-41e2-41ec-a521-11049aecf767",
"name": "Programa de Integridade",
"state": "active",
"vocabulary_id": None,
},
],
"relationships_as_subject": [],
"relationships_as_object": [],
},
]

INVALID_TAGS = [
Expand Down
34 changes: 33 additions & 1 deletion ckanext/harvest/tests/harvesters/test_ckanharvester.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from ckanext.harvest.harvesters.ckanharvester import ContentFetchError
from ckanext.harvest.tests.factories import (HarvestSourceObj, HarvestJobObj,
HarvestObjectObj)
from ckanext.harvest.tests.lib import run_harvest
from ckanext.harvest.tests.lib import run_harvest, run_harvest_job
import ckanext.harvest.model as harvest_model
from ckanext.harvest.harvesters.base import HarvesterBase
from ckanext.harvest.harvesters.ckanharvester import CKANHarvester
Expand Down Expand Up @@ -357,3 +357,35 @@ def test_get_content_handles_http_error(
harvester._get_content("http://test.example.gov.uk")

assert str(context.value) == 'HTTP error: 404 http://test.example.gov.uk'

def test_delete_missing(self):
    """Harvest twice with ``delete_missing`` enabled and verify deletion.

    The first run harvests every dataset served by the mock CKAN. For the
    second run one dataset is removed from the mock server's list; the test
    then expects a harvest result for that dataset reported as ``deleted``
    and the local package to be in the ``deleted`` state.
    """
    harvester = CKANHarvester()

    # Harvest source pointing at the mock CKAN with the option switched on
    source = HarvestSourceObj(
        url='http://localhost:%s' % mock_ckan.PORT,
        config=json.dumps({'delete_missing': True}),
        source_type=harvester.info()['name'])

    # First pass: the dataset that will later disappear must be harvested
    first_job = HarvestJobObj(source=source, run=False)
    first_results = run_harvest_job(first_job, harvester)
    assert mock_ckan.DATASETS[2]['id'] in first_results

    # Second pass: serve a copy of the dataset list without the third entry
    remaining = copy.deepcopy(mock_ckan.DATASETS)
    removed_id = remaining.pop(2)['id']
    # NOTE(review): original indentation is not recoverable here; only the
    # harvest run is assumed to need the patched dataset list - confirm.
    with patch('ckanext.harvest.tests.harvesters.mock_ckan.DATASETS',
               remaining):
        second_job = HarvestJobObj(source=source, run=False)
        second_results = run_harvest_job(second_job, harvester)

    assert removed_id in second_results
    assert second_results[removed_id]['report_status'] == 'deleted'

    # The local package itself must now be in the deleted state
    local_package = call_action('package_show', None, id=removed_id)
    assert local_package['state'] == 'deleted'
Loading