Skip to content

Commit

Permalink
#56 - Fix occasional UTA duplicated exons
Browse files Browse the repository at this point in the history
  • Loading branch information
davmlaw committed Nov 14, 2023
1 parent 01625f8 commit e64ef23
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 23 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

### Changed

- #56 - Fix occasional UTA duplicated exons
- #57 - Correctly retrieve genomic positions and handle indels in GFF (thanks ltnetcase for reporting)
- #60 - Fix for missing protein IDs due to Genbank / GenBank (thanks holtgrewe)
- #64 - Split code/data versions. json.gz are now labelled according to data schema version (thanks holtgrewe)
Expand Down
21 changes: 9 additions & 12 deletions generate_transcript_data/uta_20210129_grch37.sql
Original file line number Diff line number Diff line change
@@ -1,18 +1,15 @@
\copy (SELECT transcript.ac, string_agg(distinct transcript.hgnc, ',') as hgnc, string_agg(distinct origin.url, ',') as origin_url,
string_agg(distinct es.alt_ac::varchar, ',') as contig,
string_agg(distinct es.alt_strand::varchar, ',') as strand,
\copy (SELECT transcript.ac, string_agg(distinct transcript.hgnc, ',') as hgnc, 'http://www.ncbi.nlm.nih.gov/refseq/' as origin_url,
string_agg(distinct aln_v.alt_ac::varchar, ',') as contig,
string_agg(distinct aln_v.alt_strand::varchar, ',') as strand,
transcript.cds_start_i,
transcript.cds_end_i,
string_agg(exon.start_i::varchar, ',' order by exon.ord) as exon_starts,
string_agg(exon.end_i::varchar, ',' order by exon.ord) as exon_ends,
string_agg(exon_aln.cigar, ',' order by exon.ord) as cigars,
string_agg(aln_v.alt_start_i::varchar, ',' order by aln_v.alt_exon_id) as exon_starts,
string_agg(aln_v.alt_end_i::varchar, ',' order by aln_v.alt_exon_id) as exon_ends,
string_agg(aln_v.cigar, ',' order by aln_v.alt_exon_id) as cigars,
string_agg(distinct aa.pro_ac, ',' order by aa.pro_ac) as protein
from uta_20210129.transcript transcript
inner join uta_20210129.exon_set es on (transcript.ac = es.tx_ac AND alt_aln_method = 'splign')
inner join uta_20210129.origin origin on (transcript.origin_id = origin.origin_id)
inner join uta_20210129.exon as exon on (es.exon_set_id = exon.exon_set_id)
inner join uta_20210129.exon_aln exon_aln on (exon_aln.alt_exon_id = exon.exon_id)
inner join uta_20210129.tx_exon_aln_v aln_v on (transcript.ac = aln_v.tx_ac AND alt_aln_method = 'splign')
left outer join uta_20210129.associated_accessions aa on (transcript.ac = aa.tx_ac)
WHERE es.alt_ac in
('NC_000001.10', 'NC_000002.11', 'NC_000003.11', 'NC_000004.11', 'NC_000005.9', 'NC_000006.11', 'NC_000007.13', 'NC_000008.10', 'NC_000009.11', 'NC_000010.10', 'NC_000011.9', 'NC_000012.11', 'NC_000013.10', 'NC_000014.8', 'NC_000015.9', 'NC_000016.9', 'NC_000017.10', 'NC_000018.9', 'NC_000019.9', 'NC_000020.10', 'NC_000021.8', 'NC_000022.10', 'NC_000023.10', 'NC_000024.9') and origin.origin_id not in (10, 11)
WHERE aln_v.alt_ac in
('NC_000001.10', 'NC_000002.11', 'NC_000003.11', 'NC_000004.11', 'NC_000005.9', 'NC_000006.11', 'NC_000007.13', 'NC_000008.10', 'NC_000009.11', 'NC_000010.10', 'NC_000011.9', 'NC_000012.11', 'NC_000013.10', 'NC_000014.8', 'NC_000015.9', 'NC_000016.9', 'NC_000017.10', 'NC_000018.9', 'NC_000019.9', 'NC_000020.10', 'NC_000021.8', 'NC_000022.10', 'NC_000023.10', 'NC_000024.9')
group by transcript.ac) TO 'uta_20210129_grch37.csv' CSV HEADER;
19 changes: 8 additions & 11 deletions generate_transcript_data/uta_20210129_grch38.sql
Original file line number Diff line number Diff line change
@@ -1,18 +1,15 @@
\copy (SELECT transcript.ac, string_agg(distinct transcript.hgnc, ',') as hgnc, string_agg(distinct origin.url, ',') as origin_url,
string_agg(distinct es.alt_ac::varchar, ',') as contig,
string_agg(distinct es.alt_strand::varchar, ',') as strand,
\copy (SELECT transcript.ac, string_agg(distinct transcript.hgnc, ',') as hgnc, 'http://www.ncbi.nlm.nih.gov/refseq/' as origin_url,
string_agg(distinct aln_v.alt_ac::varchar, ',') as contig,
string_agg(distinct aln_v.alt_strand::varchar, ',') as strand,
transcript.cds_start_i,
transcript.cds_end_i,
string_agg(exon.start_i::varchar, ',' order by exon.ord) as exon_starts,
string_agg(exon.end_i::varchar, ',' order by exon.ord) as exon_ends,
string_agg(exon_aln.cigar, ',' order by exon.ord) as cigars,
string_agg(aln_v.alt_start_i::varchar, ',' order by aln_v.alt_exon_id) as exon_starts,
string_agg(aln_v.alt_end_i::varchar, ',' order by aln_v.alt_exon_id) as exon_ends,
string_agg(aln_v.cigar, ',' order by aln_v.alt_exon_id) as cigars,
string_agg(distinct aa.pro_ac, ',' order by aa.pro_ac) as protein
from uta_20210129.transcript transcript
inner join uta_20210129.exon_set es on (transcript.ac = es.tx_ac AND alt_aln_method = 'splign')
inner join uta_20210129.origin origin on (transcript.origin_id = origin.origin_id)
inner join uta_20210129.exon as exon on (es.exon_set_id = exon.exon_set_id)
inner join uta_20210129.exon_aln exon_aln on (exon_aln.alt_exon_id = exon.exon_id)
inner join uta_20210129.tx_exon_aln_v aln_v on (transcript.ac = aln_v.tx_ac AND alt_aln_method = 'splign')
left outer join uta_20210129.associated_accessions aa on (transcript.ac = aa.tx_ac)
WHERE es.alt_ac in
WHERE aln_v.alt_ac in
('NC_000001.11', 'NC_000002.12', 'NC_000003.12', 'NC_000004.12', 'NC_000005.10', 'NC_000006.12', 'NC_000007.14', 'NC_000008.11', 'NC_000009.12', 'NC_000010.11', 'NC_000011.10', 'NC_000012.12', 'NC_000013.11', 'NC_000014.9', 'NC_000015.10', 'NC_000016.10', 'NC_000017.11', 'NC_000018.10', 'NC_000019.10', 'NC_000020.11', 'NC_000021.9', 'NC_000022.11', 'NC_000023.11', 'NC_000024.10')
group by transcript.ac) TO 'uta_20210129_grch38.csv' CSV HEADER;

0 comments on commit e64ef23

Please sign in to comment.