-
Notifications
You must be signed in to change notification settings - Fork 1
/
load-all.sh
executable file
·738 lines (544 loc) · 33.2 KB
/
load-all.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
#!/bin/bash -
# Build the JaponicusDB Chado database, export data files from it and roll the
# result out to the website.
#
# Positional arguments (all six are required; with `set -u` a missing one
# aborts the script with an "unbound variable" error):
#   $1  HOST          database host passed to the loader scripts
#   $2  DATE          build date; becomes the version suffix of the database names
#   $3  USER          database user
#   $4  PASSWORD      database password
#   $5  PREV_VERSION  previous version, "<digits>" or "v<digits>"
#   $6  PREV_DATE     date of the previous build
date
# Abort on any failing command, unset variable or failing pipeline stage.
set -eu
set -o pipefail
HOST="$1"
DATE="$2"
# NOTE(review): this overwrites the shell's own USER variable for the rest of
# the script (and presumably for child processes) - confirm that is intended.
USER="$3"
PASSWORD="$4"
PREV_VERSION="$5"
# PREV_VERSION with its number incremented and a "v" prefix, or "vUNKNOWN" when
# PREV_VERSION is not of the form [v]<digits>.  Not referenced again in this script.
CURRENT_VERSION=`echo $PREV_VERSION | perl -ne 'if (/^v?(\d+)$/) { print "v" . ($1+1) . "\n"; } else { print "vUNKNOWN" }'`
# Not referenced again in this script.
PREV_DATE="$6"
# Print an error message on stderr and terminate the script with status 1.
# Arguments: the message (all arguments are joined with spaces).
# The message is printed verbatim: quoting stops the shell from collapsing
# whitespace or expanding glob characters in it, and printf (unlike echo)
# does not treat a message such as "-n" as an option.
die() {
  printf '%s\n' "$*" 1>&2
  exit 1
}
# Locations used throughout the build.  Everything under JBASE_HOME is relative
# to the directory the script is started from.
DATE_VERSION="${DATE}"
JBASE_HOME="$(pwd)"

# Checkouts and working directories below JBASE_HOME.
JAPONICUS_BUILD="${JBASE_HOME}/japonicus-build"
JAPONICUS_CURATION="${JBASE_HOME}/japonicus-curation"
JAPONICUS_CONFIG="${JBASE_HOME}/japonicus-config"
LOG_DIR="${JBASE_HOME}/logs"
POMBASE_CHADO="${JBASE_HOME}/pombase-chado"
POMBASE_LEGACY="${JBASE_HOME}/pombase-legacy"

# Configuration files read by the loaders and by the website JSON generator.
LOAD_CONFIG="${JAPONICUS_CONFIG}/load-japonicus-chado.yaml"
MAIN_CONFIG="${JAPONICUS_CONFIG}/japonicus_site_config.json"

# Fixed system locations for input data and web-visible output.
POMCUR="/var/pomcur"
SOURCES="${POMCUR}/sources"
JAPONICUS_SOURCES="${POMCUR}/japonicus_sources"
WWW_DIR="/var/www/pombase"
POMBASE_NIGHTLY="${WWW_DIR}/dumps"
# Bring every checkout used by the build up to date before loading anything.
# Each update runs in its own subshell so the working directory of the main
# script is unchanged.
# `cd ... && git pull` rather than `cd ...; git pull`: `set -e` has no effect
# inside a command list that is followed by `||`, so with `;` a failed `cd`
# would go unnoticed and `git pull` would run in the wrong directory.
(cd chobo/ && git pull) || die "Failed to update Chobo"
(cd pombase-chado && git pull) || die "Failed to update pombase-chado"
(cd pombase-legacy && git pull) || die "Failed to update pombase-legacy"
(cd pombase-website && git pull) || die "Failed to update pombase-website"
(cd "$JAPONICUS_BUILD" && git pull) || die "can't update $JAPONICUS_BUILD"
(cd "$JAPONICUS_CURATION" && git pull) || die "can't update $JAPONICUS_CURATION"
(cd "$JAPONICUS_CONFIG" && git pull) || die "can't update $JAPONICUS_CONFIG"
# Run japonicus-build/make-db from inside pombase-legacy, in a subshell so that
# the PATH / PERL5LIB / CHADO_CLOSURE_TOOL exports and the directory change do
# not leak into the rest of the script.
# The explicit `|| exit 1` on cd is needed because `set -e` is ignored inside a
# subshell whose result is tested with `||`.
(cd pombase-legacy || exit 1
# Put the Chobo scripts and OWLTools on PATH.  ($JBASE_HOME needs its "$";
# without it a relative directory literally named "JBASE_HOME" was searched.)
export PATH="$JBASE_HOME/chobo/script/:/usr/local/owltools-v0.3.0-74-gee0f8bbd/OWLTools-Runner/bin/:$PATH"
export CHADO_CLOSURE_TOOL="$JBASE_HOME/pombase-chado/script/relation-graph-chado-closure.pl"
export PERL5LIB="$POMBASE_CHADO/lib:$JBASE_HOME/chobo/lib/"
# Lowest scheduling priority; `time` reports how long make-db took.
time nice -19 "$JAPONICUS_BUILD/make-db" "$JBASE_HOME" "$DATE" "$HOST" "$USER" "$PASSWORD") || die "make-db failed"
# Create the build database as a copy of the base database (createdb -T uses
# the base database as template), then initialise it.
# NOTE(review): $BASE_DB is presumably produced by the make-db step - confirm.
BASE_DB=japonicusdb-base-$DATE_VERSION
DB=japonicusdb-build-$DATE_VERSION
createdb -T $BASE_DB $DB
# Perl library path for all loader scripts run from here on.
export PERL5LIB=$POMBASE_CHADO/lib:$POMBASE_LEGACY/lib
echo $PERL5LIB
echo initialising Chado with CVs and cvterms
$JBASE_HOME/pombase-chado/script/pombase-admin.pl $LOAD_CONFIG chado-init \
"$HOST" $DB $USER $PASSWORD || exit 1
echo loading organisms
# The organism list is read from stdin.
$JBASE_HOME/pombase-chado/script/pombase-import.pl $LOAD_CONFIG organisms \
"$HOST" $DB $USER $PASSWORD < $JAPONICUS_CONFIG/japonicus_organism_config.tsv
# Load GO references and the gene features of the other organisms that
# japonicus data refers to: human (taxon 9606), S. cerevisiae (4932) and
# S. pombe (4896).  Every import reads its input file from stdin.
#
# The PB reference load is currently disabled:
#echo loading PB refs
#$JBASE_HOME/pombase-chado/script/pombase-import.pl $LOAD_CONFIG references-file \
# "$HOST" $DB $USER $PASSWORD < $JBASE_HOME/svn-supporting-files/PB_references.txt
echo loading GO refs parsed from go-site/metadata/gorefs/
# Note: this import uses the pombase-legacy load config, not $LOAD_CONFIG.
$JBASE_HOME/pombase-chado/script/pombase-import.pl $POMBASE_LEGACY/load-pombase-chado.yaml references-file \
"$HOST" $DB $USER $PASSWORD < $SOURCES/pombe-embl/supporting_files/go_references.txt
echo loading human genes
$JBASE_HOME/pombase-chado/script/pombase-import.pl $LOAD_CONFIG features \
--organism-taxonid=9606 --uniquename-column=1 --name-column=2 --feature-type=gene \
--transcript-so-name=transcript --product-column=3 \
--ignore-lines-matching="^hgnc_id.symbol" --ignore-short-lines \
"$HOST" $DB $USER $PASSWORD < $SOURCES/hgnc_complete_set.txt
echo loading protein coding genes from SGD data file
$JBASE_HOME/pombase-chado/script/pombase-import.pl $LOAD_CONFIG features \
--organism-taxonid=4932 --uniquename-column=5 --name-column=6 \
--product-column=4 \
--column-filter="1=ORF,blocked_reading_frame,blocked reading frame" --feature-type=gene \
--transcript-so-name=transcript \
--feature-prop-from-column=sgd_identifier:3 \
"$HOST" $DB $USER $PASSWORD < $SOURCES/sgd_yeastmine_genes.tsv
# Same SGD file again, once per RNA gene type selected via the column-1 filter.
for so_type in ncRNA snoRNA
do
echo loading $so_type genes from SGD data file
$JBASE_HOME/pombase-chado/script/pombase-import.pl $LOAD_CONFIG features \
--organism-taxonid=4932 --uniquename-column=5 --name-column=6 \
--transcript-so-name=$so_type \
--column-filter="1=${so_type} gene" --feature-type=gene \
"$HOST" $DB $USER $PASSWORD < $SOURCES/sgd_yeastmine_genes.tsv
done
echo loading pombe genes
# Pombe genes come from the latest PomBase nightly build; protein coding genes
# first, then one pass per RNA gene type selected via the column-7 filter.
$JBASE_HOME/pombase-chado/script/pombase-import.pl $LOAD_CONFIG features \
--organism-taxonid=4896 --uniquename-column=1 --name-column=3 --feature-type=gene \
--product-column=5 --ignore-short-lines \
--transcript-so-name=mRNA --column-filter="7=protein coding gene" \
"$HOST" $DB $USER $PASSWORD < $POMBASE_NIGHTLY/latest_build/misc/gene_IDs_names_products.tsv
for so_type in ncRNA tRNA snoRNA rRNA snRNA
do
$JBASE_HOME/pombase-chado/script/pombase-import.pl $LOAD_CONFIG features \
--organism-taxonid=4896 --uniquename-column=1 --name-column=3 \
--product-column=5 --ignore-short-lines \
--transcript-so-name=$so_type \
--column-filter="7=${so_type} gene" --feature-type=gene \
"$HOST" $DB $USER $PASSWORD < $POMBASE_NIGHTLY/latest_build/misc/gene_IDs_names_products.tsv
done
# Load the japonicus (taxon 4897) contig files, then gene names/products and
# UniProt identifiers.
# The script stays in $LOG_DIR from here on; $log_file is a bare, timestamped
# file name that is written here and later referenced as $LOG_DIR/$log_file.*
cd $LOG_DIR
log_file=log.`date +'%Y-%m-%d-%H-%M-%S'`
# stdout and stderr of the main load go both to the terminal and to $log_file;
# with pipefail a loader failure makes the pipeline fail and the script exit.
$POMBASE_LEGACY/script/load-chado.pl --taxonid=4897 \
--gene-ex-qualifiers $JAPONICUS_CONFIG/gene_ex_qualifiers \
$LOAD_CONFIG $DATE_VERSION \
"$HOST" $DB $USER $PASSWORD $JAPONICUS_CURATION/contigs/*.contig 2>&1 | tee $log_file || exit 1
$POMBASE_LEGACY/etc/process-log.pl $log_file
# Refresh planner statistics after the bulk load.
PGPASSWORD=$PASSWORD psql -U $USER -h "$HOST" $DB -c 'analyze'
echo loading names_and_products.tsv
$POMBASE_CHADO/script/pombase-import.pl $LOAD_CONFIG names-and-products \
--dest-organism-taxonid=4897 \
"$HOST" $DB $USER $PASSWORD < $JAPONICUS_CURATION/names_and_products.tsv
echo loading systematic_id_uniprot_mapping.tsv
# UniProt identifiers for japonicus (4897) genes ...
$POMBASE_CHADO/script/pombase-import.pl $LOAD_CONFIG generic-property \
--property-name="uniprot_identifier" --organism-taxonid=4897 \
--feature-uniquename-column=1 --property-column=2 \
"$HOST" $DB $USER $PASSWORD < $JAPONICUS_CURATION/systematic_id_uniprot_mapping.tsv
# ... and for the pombe (4896) genes loaded earlier.
$POMBASE_CHADO/script/pombase-import.pl $LOAD_CONFIG generic-property \
--property-name="uniprot_identifier" --organism-taxonid=4896 \
--feature-uniquename-column=1 --property-column=2 \
"$HOST" $DB $USER $PASSWORD < $JAPONICUS_CURATION/pombe_systematic_id_uniprot_mapping.tsv
# Print the number of annotations per evidence code, smallest count first.
# Arguments: $1 - name of the database to query.
# Outputs:   the psql result table on stdout.
# The database name is kept in a local variable so that calling this function
# cannot overwrite the script-wide $DB, and is quoted when passed to psql.
# NOTE(review): psql is run without -h/-U here, unlike the 'analyze' calls
# elsewhere in this script - confirm the default connection is the intended one.
evidence_summary () {
  local db=$1
  # Output is piped through cat, presumably to stop psql from starting a pager.
  psql "$db" -c "select count(feature_cvtermprop_id), value from feature_cvtermprop where type_id in (select cvterm_id from cvterm where name = 'evidence') group by value order by count(feature_cvtermprop_id)" | cat
}
# Print the number of annotations per assigned_by value, smallest count first.
# Arguments: $1 - name of the database to query.
# Outputs:   the psql result table on stdout.
# The database name is kept in a local variable so that calling this function
# cannot overwrite the script-wide $DB, and is quoted when passed to psql.
# NOTE(review): psql is run without -h/-U here, unlike the 'analyze' calls
# elsewhere in this script - confirm the default connection is the intended one.
assigned_by_summary () {
  local db=$1
  # Output is piped through cat, presumably to stop psql from starting a pager.
  psql "$db" -c "select count(feature_cvtermprop_id), value from feature_cvtermprop where type_id in (select cvterm_id from cvterm where name = 'assigned_by') group by value order by count(feature_cvtermprop_id);" | cat
}
# Refresh every PomBase materialized view in the build database, in the order
# listed below.
# Globals:   DB (read) - name of the database to operate on.
# Arguments: none.
# The database name is quoted when passed to psql and the loop variable is
# local, so a call leaves no stray global behind.
refresh_views () {
  local view
  local -a views=(
    pombase_annotated_gene_features_per_publication
    pombase_feature_cvterm_with_ext_parents
    pombase_feature_cvterm_no_ext_terms
    pombase_feature_cvterm_ext_resolved_terms
    pombase_genotypes_alleles_genes_mrna
    pombase_extension_rels_and_values
    pombase_genes_annotations_dates
    pombase_annotation_summary
    pombase_publication_curation_summary
  )
  for view in "${views[@]}"; do
    psql "$DB" -c "REFRESH MATERIALIZED VIEW $view;"
  done
}
# Load GO annotation: UniProt-GOA (two passes with different filters), then the
# manually curated GAF file.  A compressed database dump is written to /tmp
# before and after each stage as a checkpoint.
echo annotation evidence counts before loading
evidence_summary $DB
pg_dump $DB | gzip -2 > /tmp/japonicus-chado-10-pre-goa.dump.gz
# CURRENT_GOA_GAF is not referenced again in this script.
CURRENT_GOA_GAF=$SOURCES/gene_association.goa_uniprot.gz
GOA_POMBE_AND_JAPONICUS="$SOURCES/gene_association.goa_uniprot.pombe+japonicus.gz"
# The GOA release is read from a sidecar file next to the GAF and stored in
# the database as the "UniProt-GOA_version" property.
GOA_VERSION=`cat $GOA_POMBE_AND_JAPONICUS.uniprot_version`
$POMBASE_CHADO/script/pombase-admin.pl $LOAD_CONFIG add-chado-prop \
"$HOST" $DB $USER $PASSWORD "UniProt-GOA_version" $GOA_VERSION
echo reading $GOA_POMBE_AND_JAPONICUS
# Only GAF lines carrying taxon:4897 or taxon:402676 are passed to the importer.
# The term-id filter is the concatenation of two files, supplied through
# process substitution.
gzip -d < $GOA_POMBE_AND_JAPONICUS | perl -ne 'print if /\ttaxon:(4897|402676)\t/' |
$POMBASE_CHADO/script/pombase-import.pl $LOAD_CONFIG gaf \
--taxon-filter=4897 --use-only-first-with-id \
--term-id-filter-filename=<(cat $SOURCES/pombe-embl/goa-load-fixes/filtered_GO_IDs $JAPONICUS_CURATION/japonicusdb_only_filtered_GO_IDs) \
--with-filter-filename=$SOURCES/pombe-embl/goa-load-fixes/filtered_mappings \
--assigned-by-filter=GOC,RNAcentral,InterPro,UniProtKB,UniProt "$HOST" $DB $USER $PASSWORD \
2>&1 | tee $LOG_DIR/$log_file.goa_gene_association_japonicus
echo load PANTHER annotation
# Second pass over the same file: GO_Central annotation with a "PANTHER:"
# with-prefix filter.
gzip -d < $GOA_POMBE_AND_JAPONICUS | perl -ne 'print if /\ttaxon:(4897|402676)\t/' |
$POMBASE_CHADO/script/pombase-import.pl $LOAD_CONFIG gaf \
--taxon-filter=4897 \
--with-prefix-filter="PANTHER:" \
--term-id-filter-filename=<(cat $SOURCES/pombe-embl/goa-load-fixes/filtered_GO_IDs $JAPONICUS_CURATION/japonicusdb_only_filtered_GO_IDs) \
--with-filter-filename=$SOURCES/pombe-embl/goa-load-fixes/filtered_mappings \
--assigned-by-filter=GO_Central "$HOST" $DB $USER $PASSWORD \
2>&1 | tee $LOG_DIR/$log_file.goa_gene_association_panther_japonicus
pg_dump $DB | gzip -2 > /tmp/japonicus-chado-20-after-goa.dump.gz
echo annotation count after GAF loading:
evidence_summary $DB
echo load manual annotation
$POMBASE_CHADO/script/pombase-import.pl $LOAD_CONFIG gaf \
--taxon-filter=4897 \
--assigned-by-filter=EnsemblFungi,GOC,RNAcentral,InterPro,UniProtKB,UniProt,JaponicusDB \
"$HOST" $DB $USER $PASSWORD < $JAPONICUS_CURATION/Gene_ontology/manual_go_annotation.gaf \
2>&1 | tee $LOG_DIR/$log_file.manual_go_annotation
pg_dump $DB | gzip -2 > /tmp/japonicus-chado-30-after-manual-annotation.dump.gz
# Load orthologs of japonicus (4897) genes: pombe (4896) orthologs from three
# sources, then cerevisiae (4932) and human (9606) orthologs, and finally copy
# names and products across from pombe.  Each step logs to its own
# $LOG_DIR/$log_file.* file.
echo load Compara pombe orthologs
$POMBASE_CHADO/script/pombase-import.pl $LOAD_CONFIG orthologs \
--publication=PMID:26896847 --organism_1_taxonid=4897 --organism_2_taxonid=4896 \
"$HOST" $DB $USER $PASSWORD < $JAPONICUS_CURATION/compara_pombe_orthologs.tsv 2>&1 | tee $LOG_DIR/$log_file.compara_pombe_orthologs
echo load Rhind pombe orthologs
$POMBASE_CHADO/script/pombase-import.pl $LOAD_CONFIG orthologs \
--publication=PMID:21511999 --organism_1_taxonid=4897 --organism_2_taxonid=4896 \
"$HOST" $DB $USER $PASSWORD < $JAPONICUS_CURATION/rhind_pombe_orthologs.tsv 2>&1 | tee $LOG_DIR/$log_file.rhind_pombe_orthologs
echo load manual pombe orthologs
$POMBASE_CHADO/script/pombase-import.pl $LOAD_CONFIG orthologs \
--publication=null --organism_1_taxonid=4897 --organism_2_taxonid=4896 \
"$HOST" $DB $USER $PASSWORD < $JAPONICUS_CURATION/manual_pombe_orthologs.tsv 2>&1 | tee $LOG_DIR/$log_file.manual_pombe_orthologs
echo load cerevisiae orthologs
# load via pombe orthologs first
echo " via pombe"
# project-pombe-orthologs.pl gets two inputs through process substitution: the
# pombe-cerevisiae ortholog table downloaded from curation.pombase.org, and the
# japonicus-pombe orthologs exported from the database just loaded.  Its
# output is piped straight into the ortholog importer.
$JAPONICUS_BUILD/project-pombe-orthologs.pl \
<(curl -s --http1.1 https://curation.pombase.org/dumps/latest_build/exports/pombe-cerevisiae-orthologs-with-systematic-ids.txt.gz | gzip -d) \
<($POMBASE_CHADO/script/pombase-export.pl $LOAD_CONFIG simple-orthologs --organism-taxon-id=4897 --other-organism-taxon-id=4896 "$HOST" $DB $USER $PASSWORD) |
$POMBASE_CHADO/script/pombase-import.pl $LOAD_CONFIG orthologs \
--publication=PMID:29761456 --organism_1_taxonid=4897 --organism_2_taxonid=4932 \
"$HOST" $DB $USER $PASSWORD 2>&1 | tee $LOG_DIR/$log_file.cerevisiae_orthologs_via_pombe
# Snapshot of the cerevisiae orthologs as they stand before the Compara load.
$POMBASE_CHADO/script/pombase-export.pl $LOAD_CONFIG simple-orthologs \
--organism-taxon-id=4897 --other-organism-taxon-id=4932 \
"$HOST" $DB $USER $PASSWORD > /tmp/japonicus_cerevisiae_orthologs_via_pombe.tsv
echo " from Compara"
$POMBASE_CHADO/script/pombase-import.pl $LOAD_CONFIG orthologs \
--publication=PMID:26896847 --organism_1_taxonid=4897 --organism_2_taxonid=4932 \
"$HOST" $DB $USER $PASSWORD < $JAPONICUS_CURATION/cerevisiae_orthologs.tsv 2>&1 | tee $LOG_DIR/$log_file.compara_cerevisiae_orthologs
echo load human orthologs
# Same projection as for cerevisiae, using the pombe-human ortholog table.
$JAPONICUS_BUILD/project-pombe-orthologs.pl \
<(curl -s --http1.1 https://curation.pombase.org/dumps/latest_build/exports/pombe-human-orthologs-with-systematic-ids.txt.gz | gzip -d) \
<($POMBASE_CHADO/script/pombase-export.pl $LOAD_CONFIG simple-orthologs --organism-taxon-id=4897 --other-organism-taxon-id=4896 "$HOST" $DB $USER $PASSWORD) |
$POMBASE_CHADO/script/pombase-import.pl $LOAD_CONFIG orthologs \
--publication=PMID:29761456 --organism_1_taxonid=4897 --organism_2_taxonid=9606 \
"$HOST" $DB $USER $PASSWORD 2>&1 | tee $LOG_DIR/$log_file.human_orthologs
echo transfer names and products from pombe to japonicus
$JBASE_HOME/pombase-chado/script/pombase-process.pl \
$LOAD_CONFIG transfer-names-and-products \
--source-organism-taxonid=4896 --dest-organism-taxonid=4897 \
"$HOST" $DB $USER $PASSWORD 2>&1 | tee $LOG_DIR/$log_file.transfer_names_and_products
# Checkpoint dump, then refresh planner statistics.
pg_dump $DB | gzip -2 > /tmp/japonicus-chado-40-after-orthologs-and-names.dump.gz
PGPASSWORD=$PASSWORD psql -U $USER -h "$HOST" $DB -c 'analyze'
# Transfer GO annotation from pombe, load PDBe identifiers, add reciprocal IPI
# annotations, update allele names and re-map GO terms.
echo transfer GO annotation from pombe
# The pombe GAF is read from the local web dump directory; the commented-out
# lines are the alternative of downloading it.
#curl -s --http1.1 https://curation.pombase.org/dumps/latest_build/pombase-latest.gaf.gz |
# gzip -d |
# Three-stage pipeline: decompress the pombe GAF, rewrite it for japonicus
# (skipping ND evidence and GO:0005515), then import the result.
gzip -d < $WWW_DIR/dumps/latest_build/pombase-latest.gaf.gz |
$POMBASE_CHADO/script/pombase-process.pl $LOAD_CONFIG transfer-gaf-annotations \
--source-organism-taxonid=4896 --dest-organism-taxonid=4897 \
--evidence-codes-to-ignore=ND --terms-to-ignore="GO:0005515" \
"$HOST" $DB $USER $PASSWORD |
$POMBASE_CHADO/script/pombase-import.pl $LOAD_CONFIG gaf \
--term-id-filter-filename=$JAPONICUS_CURATION/japonicusdb_only_filtered_GO_IDs \
--taxon-filter=4897 "$HOST" $DB $USER $PASSWORD \
2>&1 | tee $LOG_DIR/$log_file.transfer_pombe_go_annotation
pg_dump $DB | gzip -2 > /tmp/japonicus-chado-50-after-pombe-go-annotation.gz
echo load PDBe IDs
# Keep columns 1 and 2 of the rows whose third column is 4897, de-duplicate,
# and load column 2 as the "pdb_identifier" property of the feature in column 1.
# Note: this import uses the pombase-legacy load config, not $LOAD_CONFIG.
perl -ne '
($id, $pdb_id, $taxon_id) = split /\t/;
print "$id\t$pdb_id\n" if $taxon_id == 4897;
' < $SOURCES/pombe-embl/external_data/protein_structure/systematic_id_to_pdbe_mapping_japonicus.tsv |
sort | uniq |
$POMBASE_CHADO/script/pombase-import.pl $POMBASE_LEGACY/load-pombase-chado.yaml generic-property \
--property-name="pdb_identifier" --organism-taxonid=4897 \
--feature-uniquename-column=1 --property-column=2 \
"$HOST" $DB $USER $PASSWORD
refresh_views
# run this before loading the Canto data because the Canto loader creates
# reciprocals automatically
# See: https://github.com/pombase/pombase-chado/issues/723
# and: https://github.com/pombase/pombase-chado/issues/788
$JBASE_HOME/pombase-chado/script/pombase-process.pl \
$LOAD_CONFIG add-reciprocal-ipi-annotations \
--organism-taxonid=4897 "$HOST" $DB $USER $PASSWORD 2>&1 | tee $LOG_DIR/$log_file.add_reciprocal_ipi_annotations
pg_dump $DB | gzip -2 > /tmp/japonicus-chado-60-after-reciprocal-ipi-annotations.gz
PGPASSWORD=$PASSWORD psql -U $USER -h "$HOST" $DB -c 'analyze'
refresh_views
echo update out of date allele names
$POMBASE_CHADO/script/pombase-process.pl $LOAD_CONFIG update-allele-names "$HOST" $DB $USER $PASSWORD
echo do GO term re-mapping
# Annotations carrying a canto_session property are excluded from the re-mapping.
$POMBASE_CHADO/script/pombase-process.pl $LOAD_CONFIG change-terms \
--exclude-by-fc-prop="canto_session" \
--mapping-file=$SOURCES/pombe-embl/chado_load_mappings/GO_mapping_to_specific_terms.txt \
"$HOST" $DB $USER $PASSWORD 2>&1 | tee $LOG_DIR/$log_file.go-term-mapping
# Load the Canto curation data, filter redundant GO annotation, fetch
# publication details and run the consistency checks.
CURATION_TOOL_DATA=$POMCUR/backups/japonicus-current.json
echo
echo load Canto data
$POMBASE_CHADO/script/pombase-import.pl $LOAD_CONFIG canto-json \
--organism-taxonid=4897 --db-prefix=JaponicusDB --all-curation-is-community \
"$HOST" $DB $USER $PASSWORD < $CURATION_TOOL_DATA 2>&1 | tee $LOG_DIR/$log_file.curation_tool_data
echo annotation count after loading curation tool data:
evidence_summary $DB
pg_dump $DB | gzip -2 > /tmp/japonicus-chado-70-after-canto-data.gz
echo
echo counts of assigned_by before filtering:
assigned_by_summary $DB
PGPASSWORD=$PASSWORD psql -U $USER -h "$HOST" $DB -c 'analyze'
echo
echo filtering redundant annotations - `date`
$JBASE_HOME/pombase-chado/script/pombase-process.pl $LOAD_CONFIG go-filter "$HOST" $DB $USER $PASSWORD
echo done GO filtering - `date`
echo
# Second filtering pass (go-filter-with-not); the progress message printed
# before it is the same text as for the first pass.
echo filtering redundant annotations - `date`
$JBASE_HOME/pombase-chado/script/pombase-process.pl $LOAD_CONFIG go-filter-with-not "$HOST" $DB $USER $PASSWORD
echo done filtering using NOT annotations - `date`
pg_dump $DB | gzip -2 > /tmp/japonicus-chado-80-go-filtering.gz
echo
echo counts of assigned_by after filtering:
assigned_by_summary $DB
echo
echo annotation count after filtering redundant GO annotations
evidence_summary $DB
echo
echo query PubMed for publication details, then store
$POMBASE_CHADO/script/pubmed_util.pl $LOAD_CONFIG \
"$HOST" $DB $USER $PASSWORD --add-missing-fields \
--organism-taxonid=4897 2>&1 | tee $LOG_DIR/$log_file.pubmed_query
echo
echo loading finished
PGPASSWORD=$PASSWORD psql -U $USER -h "$HOST" $DB -c 'analyze'
refresh_views
echo
echo running consistency checks
# The checker runs as an `if` condition, so a failure is recorded in
# CHADO_CHECKS_STATUS instead of aborting the script under `set -e`.
# Its output goes to the .chado_checks log only.
if $POMBASE_CHADO/script/check-chado.pl $LOAD_CONFIG $MAIN_CONFIG "$HOST" $DB $USER $PASSWORD > $LOG_DIR/$log_file.chado_checks 2>&1
then
CHADO_CHECKS_STATUS=passed
else
CHADO_CHECKS_STATUS=failed
fi
echo Chado checks $CHADO_CHECKS_STATUS
# Create the directory for this build under the web dump area and write the
# export files (orthologs, GAF, PHAF, modifications, publication list) into it.
DUMPS_DIR=$WWW_DIR/japonicus_nightly
BUILDS_DIR=$DUMPS_DIR/builds
CURRENT_BUILD_DIR=$BUILDS_DIR/$DB
mkdir -p $CURRENT_BUILD_DIR
mkdir -p $CURRENT_BUILD_DIR/logs
mkdir -p $CURRENT_BUILD_DIR/exports
# Everything the exporters print on stdout/stderr (apart from the data that is
# redirected to files) is collected in the .export_warnings log.
(
echo starting orthologs export at `date`
# Note: --other-organism-field-name=uniquename is given twice on the next command.
$POMBASE_CHADO/script/pombase-export.pl $LOAD_CONFIG orthologs --organism-taxon-id=4897 --other-organism-taxon-id=4896 --other-organism-field-name=uniquename --other-organism-field-name=uniquename --sensible-ortholog-direction "$HOST" $DB $USER $PASSWORD | gzip -9 > $CURRENT_BUILD_DIR/$DB.japonicus-pombe-orthologs.txt.gz
$POMBASE_CHADO/script/pombase-export.pl $LOAD_CONFIG orthologs --organism-taxon-id=4897 --other-organism-taxon-id=9606 --sensible-ortholog-direction "$HOST" $DB $USER $PASSWORD | gzip -9 > $CURRENT_BUILD_DIR/$DB.japonicus-human-orthologs.txt.gz
$POMBASE_CHADO/script/pombase-export.pl $LOAD_CONFIG orthologs --organism-taxon-id=4897 --other-organism-taxon-id=4932 --other-organism-field-name=uniquename --sensible-ortholog-direction "$HOST" $DB $USER $PASSWORD | gzip -9 > $CURRENT_BUILD_DIR/$DB.japonicus-cerevisiae-orthologs.txt.gz
echo starting gaf export at `date`
$POMBASE_CHADO/script/pombase-export.pl $LOAD_CONFIG gaf --organism-taxon-id=4897 "$HOST" $DB $USER $PASSWORD | gzip -9 > $CURRENT_BUILD_DIR/$DB.gaf.gz
echo starting phaf export at `date`
$POMBASE_CHADO/script/pombase-export.pl $LOAD_CONFIG phaf --organism-taxon-id=4897 "$HOST" $DB $USER $PASSWORD | gzip -9 > $CURRENT_BUILD_DIR/$DB.phaf.gz
echo starting modifications export at `date`
$POMBASE_CHADO/script/pombase-export.pl $LOAD_CONFIG modifications --organism-taxon-id=4897 "$HOST" $DB $USER $PASSWORD | gzip -9 > $CURRENT_BUILD_DIR/$DB.modifications.gz
# PMIDs of publications that have at least one feature_cvterm or
# feature_relationship_pub row, sorted by the numeric part of the PMID.
psql $DB -t --no-align -c "
SELECT uniquename FROM pub WHERE uniquename LIKE 'PMID:%'
AND pub_id IN (SELECT pub_id FROM feature_cvterm UNION SELECT pub_id FROM feature_relationship_pub)
ORDER BY substring(uniquename FROM 'PMID:(\d+)')::integer;" > $CURRENT_BUILD_DIR/publications_with_annotations.txt
) > $LOG_DIR/$log_file.export_warnings 2>&1
# Copy every log written so far for this run into the build directory.
cp $LOG_DIR/$log_file.* $CURRENT_BUILD_DIR/logs/
# Summary reports written into the build's logs directory.  Each report is a
# subshell whose whole stdout is redirected to one file.
#
# Report 1: counts of annotation-extension relations per relation name and
# base CV, followed by the number of annotations using extensions per CV.
(
echo extension relation counts:
psql $DB -c "select count(id), name, base_cv_name from (select p.cvterm_id::text || '_cvterm' as id,
substring(type.name from 'annotation_extension_relation-(.*)') as name, base_cv_name
from pombase_feature_cvterm_ext_resolved_terms fc
join cvtermprop p on p.cvterm_id = fc.cvterm_id
join cvterm type on p.type_id = type.cvterm_id
where type.name like 'annotation_ex%'
UNION all select r.cvterm_relationship_id::text ||
'_cvterm_rel' as id, t.name as name, base_cv_name from cvterm_relationship r join cvterm t on t.cvterm_id = r.type_id join pombase_feature_cvterm_ext_resolved_terms fc on r.subject_id = fc.cvterm_id where
t.name <> 'is_a' and r.subject_id in (select cvterm_id from cvterm, cv
where cvterm.cv_id = cv.cv_id and cv.name = 'PomBase annotation extension terms'))
as sub group by base_cv_name, name order by base_cv_name, name;
"
echo
echo number of annotations using extensions by cv:
psql $DB -c "select count(feature_cvterm_id), base_cv_name from pombase_feature_cvterm_with_ext_parents group by base_cv_name order by count;"
) > $CURRENT_BUILD_DIR/logs/$log_file.extension_relation_counts
# Report 2: qualifier values counted per base CV, most frequent first.
(
echo counts of qualifiers grouped by CV name
psql $DB -c "select count(fc.feature_cvterm_id), value, base_cv_name from feature_cvtermprop p, pombase_feature_cvterm_ext_resolved_terms fc, cvterm t where type_id = (select cvterm_id from cvterm where name = 'qualifier' and cv_id = (select cv_id from cv where name = 'feature_cvtermprop_type')) and p.feature_cvterm_id = fc.feature_cvterm_id and fc.cvterm_id = t.cvterm_id group by value, base_cv_name order by count desc;"
) > $CURRENT_BUILD_DIR/logs/$log_file.qualifier_counts_by_cv
# Report 3: every term of the 'PomBase family or domain' CV with the
# comma-separated uniquenames of the features annotated with it.
(
echo all protein family term and annotated genes
psql $DB -c "select t.name, db.name || ':' || x.accession as termid, array_to_string(array_agg(f.uniquename), ',') as gene_uniquenames from feature f join feature_cvterm fc on fc.feature_id = f.feature_id join cvterm t on t.cvterm_id = fc.cvterm_id join dbxref x on x.dbxref_id = t.dbxref_id join db on x.db_id = db.db_id join cv on t.cv_id = cv.cv_id where cv.name = 'PomBase family or domain' group by t.name, termid order by t.name, termid;"
) > $CURRENT_BUILD_DIR/logs/$log_file.protein_family_term_annotation
# Report 4: alleles whose allele_type property is 'other', as unaligned
# comma-separated output with description and Canto session(s).
(
echo 'Alleles with type "other"'
psql $DB -F ',' -A -c "select f.name, f.uniquename, (select value from featureprop p where
p.feature_id = f.feature_id and p.type_id in (select cvterm_id from cvterm
where name = 'description')) as description, ARRAY(select value from featureprop p where
p.feature_id = f.feature_id and p.type_id in (select cvterm_id from cvterm
where name = 'canto_session')) as session from feature f where type_id in (select
cvterm_id from cvterm where name = 'allele') and feature_id in (select
feature_id from featureprop p where p.type_id in (select cvterm_id from cvterm
where name = 'allele_type') and p.value = 'other');"
) > $CURRENT_BUILD_DIR/logs/$log_file.alleles_of_type_other
# Annotation-count report, written to the .annotation_counts_by_cv log in the
# build directory.
#
# Every query is built on the same two-part UNION that assigns each
# feature_cvterm row (fc_id) to a CV name:
#   - annotations whose term is NOT in 'PomBase annotation extension terms'
#     count under the term's own CV;
#   - annotations whose term IS an extension term count under the CV of the
#     term's is_a parent.
# The queries then group that set by CV, by evidence code and CV (three sort
# orders), give the overall total, and finally repeat the per-CV count and the
# total for annotations that carry a canto_session property.
(
echo counts of all annotation by type:
psql $DB -c "select count(distinct fc_id), cv_name from (select distinct
fc.feature_cvterm_id as fc_id, cv.name as cv_name from cvterm t,
feature_cvterm fc, cv where fc.cvterm_id = t.cvterm_id and cv.cv_id = t.cv_id
and cv.name <> 'PomBase annotation extension terms' UNION select distinct
fc.feature_cvterm_id as fc_id, parent_cv.name as cv_name from cvterm t,
feature_cvterm fc, cv term_cv, cvterm_relationship rel, cvterm parent_term, cv
parent_cv, cvterm rel_type where fc.cvterm_id = t.cvterm_id and term_cv.cv_id
= t.cv_id and t.cvterm_id = subject_id and parent_term.cvterm_id = object_id
and parent_term.cv_id = parent_cv.cv_id and term_cv.name = 'PomBase annotation extension terms' and rel.type_id = rel_type.cvterm_id and rel_type.name =
'is_a') as sub group by cv_name order by count;"
echo
echo annotation counts by evidence code and cv type, sorted by cv name:
psql $DB -c "with sub as (select distinct
fc.feature_cvterm_id as fc_id, cv.name as cv_name from cvterm t,
feature_cvterm fc, cv where fc.cvterm_id = t.cvterm_id and cv.cv_id = t.cv_id
and cv.name <> 'PomBase annotation extension terms' UNION select distinct
fc.feature_cvterm_id as fc_id, parent_cv.name as cv_name from cvterm t,
feature_cvterm fc, cv term_cv, cvterm_relationship rel, cvterm parent_term, cv
parent_cv, cvterm rel_type where fc.cvterm_id = t.cvterm_id and term_cv.cv_id
= t.cv_id and t.cvterm_id = subject_id and parent_term.cvterm_id = object_id
and parent_term.cv_id = parent_cv.cv_id and term_cv.name =
'PomBase annotation extension terms' and rel.type_id =
rel_type.cvterm_id and rel_type.name = 'is_a')
select p.value as ev_code, cv_name, count(fc_id) from sub join
feature_cvtermprop p on sub.fc_id = p.feature_cvterm_id where type_id
= (select cvterm_id from cvterm t join cv on t.cv_id = cv.cv_id where
cv.name = 'feature_cvtermprop_type' and t.name = 'evidence') group by
p.value, cv_name order by cv_name;"
echo
echo annotation counts by evidence code and cv type, sorted by count:
psql $DB -c "with sub as (select distinct
fc.feature_cvterm_id as fc_id, cv.name as cv_name from cvterm t,
feature_cvterm fc, cv where fc.cvterm_id = t.cvterm_id and cv.cv_id = t.cv_id
and cv.name <> 'PomBase annotation extension terms' UNION select distinct
fc.feature_cvterm_id as fc_id, parent_cv.name as cv_name from cvterm t,
feature_cvterm fc, cv term_cv, cvterm_relationship rel, cvterm parent_term, cv
parent_cv, cvterm rel_type where fc.cvterm_id = t.cvterm_id and term_cv.cv_id
= t.cv_id and t.cvterm_id = subject_id and parent_term.cvterm_id = object_id
and parent_term.cv_id = parent_cv.cv_id and term_cv.name =
'PomBase annotation extension terms' and rel.type_id =
rel_type.cvterm_id and rel_type.name = 'is_a')
select p.value as ev_code, cv_name, count(fc_id) from sub join
feature_cvtermprop p on sub.fc_id = p.feature_cvterm_id where type_id
= (select cvterm_id from cvterm t join cv on t.cv_id = cv.cv_id where
cv.name = 'feature_cvtermprop_type' and t.name = 'evidence') group by
p.value, cv_name order by count;"
echo
echo annotation counts by evidence code and cv type, sorted by cv evidence code:
psql $DB -c "with sub as (select distinct
fc.feature_cvterm_id as fc_id, cv.name as cv_name from cvterm t,
feature_cvterm fc, cv where fc.cvterm_id = t.cvterm_id and cv.cv_id = t.cv_id
and cv.name <> 'PomBase annotation extension terms' UNION select distinct
fc.feature_cvterm_id as fc_id, parent_cv.name as cv_name from cvterm t,
feature_cvterm fc, cv term_cv, cvterm_relationship rel, cvterm parent_term, cv
parent_cv, cvterm rel_type where fc.cvterm_id = t.cvterm_id and term_cv.cv_id
= t.cv_id and t.cvterm_id = subject_id and parent_term.cvterm_id = object_id
and parent_term.cv_id = parent_cv.cv_id and term_cv.name =
'PomBase annotation extension terms' and rel.type_id =
rel_type.cvterm_id and rel_type.name = 'is_a')
select p.value as ev_code, cv_name, count(fc_id) from sub join
feature_cvtermprop p on sub.fc_id = p.feature_cvterm_id where type_id
= (select cvterm_id from cvterm t join cv on t.cv_id = cv.cv_id where
cv.name = 'feature_cvtermprop_type' and t.name = 'evidence') group by
p.value, cv_name order by p.value;"
echo
echo total:
psql $DB -c "select count(distinct fc_id) from (select distinct
fc.feature_cvterm_id as fc_id, cv.name as cv_name from cvterm t,
feature_cvterm fc, cv where fc.cvterm_id = t.cvterm_id and cv.cv_id = t.cv_id
and cv.name <> 'PomBase annotation extension terms' UNION select distinct
fc.feature_cvterm_id as fc_id, parent_cv.name as cv_name from cvterm t,
feature_cvterm fc, cv term_cv, cvterm_relationship rel, cvterm parent_term, cv
parent_cv, cvterm rel_type where fc.cvterm_id = t.cvterm_id and term_cv.cv_id
= t.cv_id and t.cvterm_id = subject_id and parent_term.cvterm_id = object_id
and parent_term.cv_id = parent_cv.cv_id and term_cv.name = 'PomBase annotation extension terms' and rel.type_id = rel_type.cvterm_id and rel_type.name =
'is_a') as sub;"
echo
echo counts of annotation from Canto, by type:
# Same UNION as above, restricted to annotations with a canto_session property.
# The CV name 'PomBase annotation extension terms' must stay on ONE line inside
# this string: a line break inside the quotes becomes part of the SQL literal,
# the <> comparison then never matches the real CV name, and extension-term
# annotations are no longer excluded from the first half of the UNION.
sub_query="(select
distinct fc.feature_cvterm_id as fc_id, cv.name as cv_name from
cvterm t, feature_cvterm fc, cv where fc.cvterm_id = t.cvterm_id and
cv.cv_id = t.cv_id and cv.name <> 'PomBase annotation extension terms'
and fc.feature_cvterm_id in (select feature_cvterm_id from
feature_cvtermprop where type_id in (select cvterm_id from cvterm
where name = 'canto_session')) UNION select distinct fc.feature_cvterm_id
as fc_id, parent_cv.name as cv_name from cvterm t, feature_cvterm fc,
cv term_cv, cvterm_relationship rel, cvterm parent_term, cv
parent_cv, cvterm rel_type where fc.cvterm_id = t.cvterm_id and
term_cv.cv_id = t.cv_id and t.cvterm_id = subject_id and
parent_term.cvterm_id = object_id and parent_term.cv_id =
parent_cv.cv_id and term_cv.name = 'PomBase annotation extension terms'
and rel.type_id = rel_type.cvterm_id and rel_type.name =
'is_a' and fc.feature_cvterm_id in (select feature_cvterm_id from
feature_cvtermprop where type_id in (select cvterm_id from cvterm
where name = 'canto_session'))) as sub"
psql $DB -c "select count(distinct fc_id), cv_name from $sub_query group by cv_name order by count;"
psql $DB -c "select count(distinct fc_id) as total from $sub_query;"
) > $CURRENT_BUILD_DIR/logs/$log_file.annotation_counts_by_cv
# Generate the website data files into the build directory, compress them,
# write the final database dump and point "latest_build" at this build.
echo creating files for the website:
# NOTE(review): the connection URL uses "localhost" rather than $HOST - confirm
# the database is always local to the machine running this script.
$POMCUR/bin/pombase-chado-json -c $MAIN_CONFIG \
--doc-config-file $JBASE_HOME/pombase-website/src/app/config/doc-config.json \
-p "postgres://$USER:$PASSWORD@localhost/$DB" \
-d $CURRENT_BUILD_DIR/ --go-eco-mapping=$SOURCES/gaf-eco-mapping.txt \
-i $JAPONICUS_SOURCES/japonicus_domain_results.json \
--pdb-data-file $SOURCES/pombe-embl/external_data/protein_structure/systematic_id_to_pdbe_mapping_japonicus.tsv \
2>&1 | tee $LOG_DIR/$log_file.web-json-write
# Compress every FASTA file in place (-f overwrites an existing .gz).
find $CURRENT_BUILD_DIR/fasta -name '*.fa' | xargs gzip -9f
cp $LOG_DIR/$log_file.web-json-write $CURRENT_BUILD_DIR/logs/
# $DB with a trailing "-v<digit>" removed.  Not referenced again in this script.
DB_BASE_NAME=`echo $DB | sed 's/-v[0-9]$//'`
# Compress the SQLite file and remove the uncompressed original (--rm).
zstd -9q --rm $CURRENT_BUILD_DIR/api_maps.sqlite3
cp $LOG_DIR/*.txt $CURRENT_BUILD_DIR/logs/
# Make all tables in the public schema readable by every database role.
psql $DB -c 'grant select on all tables in schema public to public;'
DUMP_FILE=$CURRENT_BUILD_DIR/$DB.chado_dump.gz
echo dumping to $DUMP_FILE
pg_dump $DB | gzip -2 > $DUMP_FILE
# Re-point the latest_build symlink at this build.
rm -f $DUMPS_DIR/latest_build
ln -s $CURRENT_BUILD_DIR $DUMPS_DIR/latest_build
# Build the website container image from the latest build, then roll it out
# to the "japonicus-1" Docker service.  The build runs in a subshell so the
# directory changes do not affect the rest of the script.
(cd $JBASE_HOME/container_build
(cd japonicus-config && git pull)
cp -f japonicus-config/japonicus_site_config.json main_config.json
nice -10 $JAPONICUS_BUILD/build_container.sh $DATE_VERSION $DUMPS_DIR/latest_build)
# NOTE(review): presumably this is the tag produced by build_container.sh - confirm.
IMAGE_NAME=japonicus/web:$DATE_VERSION-prod
# temporarily add another replica so we have no downtime when we update
docker service update --replicas 2 japonicus-1
sleep 60
docker service update --update-delay 0s --image=$IMAGE_NAME japonicus-1
sleep 60
# Back to a single replica once the new image is in place.
docker service update --replicas 1 japonicus-1
# Publish the build: point "nightly_update" at it and copy the data files into
# the public www data directory.
# The guard on CHADO_CHECKS_STATUS is disabled, so publishing currently happens
# whether or not the consistency checks passed.  If it is re-enabled, the test
# needs spaces around "=" (and quoting): as written, [ $CHADO_CHECKS_STATUS=passed ]
# is a single non-empty word and therefore always true.
#if [ $CHADO_CHECKS_STATUS=passed ]
#then
rm -f $DUMPS_DIR/nightly_update
ln -s $CURRENT_BUILD_DIR $DUMPS_DIR/nightly_update
WWW_JAPONICUS_DIR=/var/www/www-japonicusdb
WWW_DATA_DIR=$WWW_JAPONICUS_DIR/data
# Names and identifiers
cp $JAPONICUS_CURATION/systematic_id_uniprot_mapping.tsv $WWW_DATA_DIR/names_and_identifiers/JaponicusDB2UniProt.tsv
cp $CURRENT_BUILD_DIR/misc/gene_IDs_names.tsv $WWW_DATA_DIR/names_and_identifiers/
cp $CURRENT_BUILD_DIR/misc/gene_IDs_names_products.tsv $WWW_DATA_DIR/names_and_identifiers/
cp $CURRENT_BUILD_DIR/misc/sysID2product.tsv $WWW_DATA_DIR/names_and_identifiers/
cp $CURRENT_BUILD_DIR/misc/sysID2product.rna.tsv $WWW_DATA_DIR/names_and_identifiers/
# Gene ontology annotation files (compressed on the way)
cp $CURRENT_BUILD_DIR/misc/Complex_annotation.tsv $WWW_DATA_DIR/annotations/Gene_ontology/GO_complexes/Complex_annotation.tsv
gzip -9 < $CURRENT_BUILD_DIR/misc/gene_product_annotation_data_taxonid_4897.tsv > $WWW_DATA_DIR/annotations/Gene_ontology/japonicusdb.gpad.gz
gzip -9 < $CURRENT_BUILD_DIR/misc/gene_product_information_taxonid_4897.tsv > $WWW_DATA_DIR/annotations/Gene_ontology/japonicusdb.gpi.gz
gzip -9 < $CURRENT_BUILD_DIR/misc/go_style_gaf.tsv > $WWW_DATA_DIR/annotations/Gene_ontology/gene_association_2-2.japonicusdb.gz
gzip -9 < $CURRENT_BUILD_DIR/misc/pombase_style_gaf.tsv > $WWW_DATA_DIR/annotations/Gene_ontology/gene_association_2-1.japonicusdb.gz
# Protein data
cp $CURRENT_BUILD_DIR/misc/transmembrane_domain_coords_and_seqs.tsv $WWW_DATA_DIR/Protein_data/
cp $CURRENT_BUILD_DIR/misc/aa_composition.tsv $WWW_DATA_DIR/Protein_data/
cp $CURRENT_BUILD_DIR/misc/PeptideStats.tsv $WWW_DATA_DIR/Protein_data/
cp $CURRENT_BUILD_DIR/misc/ProteinFeatures.tsv $WWW_DATA_DIR/Protein_data/Protein_Features.tsv
# Feature and genome sequences
cp $CURRENT_BUILD_DIR/fasta/feature_sequences/cds+introns+utrs.fa.gz $WWW_DATA_DIR/genome_sequence_and_features/feature_sequences/cds+introns+utrs.fa.gz
cp $CURRENT_BUILD_DIR/fasta/feature_sequences/cds+introns.fa.gz $WWW_DATA_DIR/genome_sequence_and_features/feature_sequences/cds+introns.fa.gz
cp $CURRENT_BUILD_DIR/fasta/feature_sequences/cds.fa.gz $WWW_DATA_DIR/genome_sequence_and_features/feature_sequences/cds.fa.gz
cp $CURRENT_BUILD_DIR/fasta/feature_sequences/introns_within_cds.fa.gz $WWW_DATA_DIR/genome_sequence_and_features/feature_sequences/introns_within_cds.fa.gz
cp $CURRENT_BUILD_DIR/fasta/feature_sequences/five_prime_utrs.fa.gz $WWW_DATA_DIR/genome_sequence_and_features/feature_sequences/UTR/5UTR.fa.gz
cp $CURRENT_BUILD_DIR/fasta/feature_sequences/three_prime_utrs.fa.gz $WWW_DATA_DIR/genome_sequence_and_features/feature_sequences/UTR/3UTR.fa.gz
cp $CURRENT_BUILD_DIR/fasta/feature_sequences/peptide.fa.gz $WWW_DATA_DIR/genome_sequence_and_features/feature_sequences/peptide.fa.gz
cp $CURRENT_BUILD_DIR/fasta/chromosomes/*.gz $WWW_DATA_DIR/genome_sequence_and_features/genome_sequence/
# GFF3 files are compressed one by one; the subshell keeps the cd local.
(cd $CURRENT_BUILD_DIR/gff
for f in *.gff3
do
gzip -9 < $f > $WWW_DATA_DIR/genome_sequence_and_features/gff3/$f.gz
done)
# Orthologs, modifications and phenotype annotation exported earlier
cp $CURRENT_BUILD_DIR/$DB.japonicus-human-orthologs.txt.gz $WWW_DATA_DIR/orthologs/japonicus-human-orthologs.txt.gz
cp $CURRENT_BUILD_DIR/$DB.japonicus-pombe-orthologs.txt.gz $WWW_DATA_DIR/orthologs/japonicus-pombe-orthologs.txt.gz
cp $CURRENT_BUILD_DIR/$DB.japonicus-cerevisiae-orthologs.txt.gz $WWW_DATA_DIR/orthologs/japonicus-cerevisiae-orthologs.txt.gz
cp $CURRENT_BUILD_DIR/$DB.modifications.gz $WWW_DATA_DIR/annotations/modifications/japonicusdb-chado.modifications.gz
cp $CURRENT_BUILD_DIR/$DB.phaf.gz $WWW_DATA_DIR/annotations/Phenotype_annotations/phenotype_annotations.japonicusdb.phaf.gz
#fi
# Rewrite Canto's Chado connection config so that it points at the database
# built by this run.  The here-document delimiter is unquoted, so $DB is
# expanded when the file is written.
# NOTE(review): the two "japonicus" entries after the DSN are presumably the
# database user name and password, hard-coded here - confirm.
cat > $POMCUR/apps/japonicus/canto_chado.yaml <<EOF
Model::ChadoModel:
connect_info:
- "dbi:Pg:dbname=$DB;host=localhost"
- japonicus
- japonicus
schema_class: Canto::ChadoDB
EOF
echo finished building: $DB
date