diff --git a/src/align/include/align_parameters.hpp b/src/align/include/align_parameters.hpp index 790989af..b07dc2c2 100644 --- a/src/align/include/align_parameters.hpp +++ b/src/align/include/align_parameters.hpp @@ -62,6 +62,7 @@ struct Parameters { bool sam_format; //Emit the output in SAM format (PAF default) bool no_seq_in_sam; //Do not fill the SEQ field in SAM format bool multithread_fasta_input; //Multithreaded fasta input + uint64_t target_padding; //Additional padding around target sequence #ifdef WFA_PNG_TSV_TIMING // plotting diff --git a/src/align/include/align_types.hpp b/src/align/include/align_types.hpp index 5abc5883..92b871a6 100644 --- a/src/align/include/align_types.hpp +++ b/src/align/include/align_types.hpp @@ -26,6 +26,11 @@ namespace align skch::offset_t rEndPos; //mapping boundary end offset on ref skch::strand_t strand; //mapping strand float mashmap_estimated_identity; + + // Chain metadata + int32_t chain_id{-1}; // Unique ID for this chain (-1 if not part of chain) + int32_t chain_length{1}; // Total segments in chain (1 if not part of chain) + int32_t chain_pos{1}; // Position in chain, 1-based (1 if not part of chain) }; typedef std::unordered_map refSequenceMap_t; diff --git a/src/align/include/computeAlignments.hpp b/src/align/include/computeAlignments.hpp index d56e14a9..f3be2d1a 100644 --- a/src/align/include/computeAlignments.hpp +++ b/src/align/include/computeAlignments.hpp @@ -139,6 +139,7 @@ typedef atomic_queue::AtomicQueuecomputeAlignments(); @@ -149,7 +150,7 @@ typedef atomic_queue::AtomicQueue 14) { + const vector chain_vec = skch::CommonFunc::split(tokens[14], ':'); + if (chain_vec.size() == 3 && chain_vec[0] == "chain" && chain_vec[1] == "i") { + // Split the id.pos.len format + const vector chain_parts = skch::CommonFunc::split(chain_vec[2], '.'); + if (chain_parts.size() == 3) { + chain_id = std::stoi(chain_parts[0]); + chain_pos = std::stoi(chain_parts[1]); + chain_length = std::stoi(chain_parts[2]); + } + } + } + // Save words into currentRecord { currentRecord.qId = tokens[0]; @@ -176,8 +194,39 @@ typedef atomic_queue::AtomicQueue 0) { + if (rStartPos >= target_padding) { + rStartPos -= target_padding; + } else { + rStartPos = 0; + } + if (rEndPos + target_padding <= ref_len) { + rEndPos += target_padding; + } else { + rEndPos = ref_len; + } + } + + // Validate coordinates against reference length + if (rStartPos >= ref_len || rEndPos > ref_len) { + std::cerr << "[parse-debug] ERROR: Coordinates exceed reference length!" << std::endl; + throw std::runtime_error("[wfmash::align::parseMashmapRow] Error! Coordinates exceed reference length: " + + std::to_string(rStartPos) + "-" + std::to_string(rEndPos) + + " (ref_len=" + std::to_string(ref_len) + ")"); + } + + currentRecord.rStartPos = rStartPos; + currentRecord.rEndPos = rEndPos; currentRecord.mashmap_estimated_identity = mm_id; } } @@ -193,7 +242,7 @@ seq_record_t* createSeqRecord(const MappingBoundaryRow& currentRecord, // Get the query sequence length const int64_t query_size = faidx_seq_len(query_faidx, currentRecord.qId.c_str()); - // Compute padding + // Compute padding for sequence extraction const uint64_t head_padding = currentRecord.rStartPos >= param.wflign_max_len_minor ? param.wflign_max_len_minor : currentRecord.rStartPos; const uint64_t tail_padding = ref_size - currentRecord.rEndPos >= param.wflign_max_len_minor @@ -273,7 +322,10 @@ std::string processAlignment(seq_record_t* rec) { param.no_seq_in_sam, param.min_identity, param.wflign_max_len_minor, - rec->currentRecord.mashmap_estimated_identity); + rec->currentRecord.mashmap_estimated_identity, + rec->currentRecord.chain_id, + rec->currentRecord.chain_length, + rec->currentRecord.chain_pos); return output.str(); } @@ -310,7 +362,7 @@ void processor_thread(std::atomic& total_alignments_queued, std::string* line_ptr = nullptr; if (line_queue.try_pop(line_ptr)) { MappingBoundaryRow currentRecord; - parseMashmapRow(*line_ptr, currentRecord); + parseMashmapRow(*line_ptr, currentRecord, param.target_padding); // Process the record and create seq_record_t seq_record_t* rec = createSeqRecord(currentRecord, *line_ptr, local_ref_faidx, local_query_faidx); @@ -408,9 +460,48 @@ void worker_thread(uint64_t tid, if (seq_queue.try_pop(rec)) { is_working.store(true); std::string alignment_output = processAlignment(rec); - - // Push the alignment output to the paf_queue - paf_queue.push(new std::string(std::move(alignment_output))); + + // Parse the alignment output to find CIGAR string and coordinates + std::stringstream ss(alignment_output); + std::string line; + while (std::getline(ss, line)) { + if (line.empty()) continue; + + std::vector fields; + std::stringstream field_ss(line); + std::string field; + while (field_ss >> field) { + fields.push_back(field); + } + + // Find the CIGAR string field (should be after cg:Z:) + auto cigar_it = std::find_if(fields.begin(), fields.end(), + [](const std::string& s) { return s.substr(0, 5) == "cg:Z:"; }); + + if (cigar_it != fields.end()) { + std::string cigar = cigar_it->substr(5); // Remove cg:Z: prefix + uint64_t ref_start = std::stoull(fields[7]); + uint64_t ref_end = std::stoull(fields[8]); + + + // Just pass through the CIGAR string and coordinates unchanged + // The trimming is now handled in wflign namespace + + // Reconstruct the line + std::string new_line; + for (const auto& f : fields) { + if (!new_line.empty()) new_line += '\t'; + new_line += f; + } + new_line += '\n'; + + // Push the modified alignment output to the paf_queue + paf_queue.push(new std::string(std::move(new_line))); + } else { + // If no CIGAR string found, output the line unchanged + paf_queue.push(new std::string(line + '\n')); + } + } // Update progress meter and processed alignment length uint64_t alignment_length = rec->currentRecord.qEndPos - rec->currentRecord.qStartPos; @@ -514,7 +605,7 @@ void computeAlignments() { while(std::getline(mappingListStream, mappingRecordLine)) { if (!mappingRecordLine.empty()) { - parseMashmapRow(mappingRecordLine, currentRecord); + parseMashmapRow(mappingRecordLine, currentRecord, param.target_padding); total_alignment_length += currentRecord.qEndPos - currentRecord.qStartPos; } } diff --git a/src/common/wflign/CMakeLists.txt b/src/common/wflign/CMakeLists.txt index 40bd247a..8ab9ac76 100644 --- a/src/common/wflign/CMakeLists.txt +++ b/src/common/wflign/CMakeLists.txt @@ -61,6 +61,7 @@ set(wflign_SOURCE src/wflign.cpp src/wflign_alignment.cpp src/wflign_patch.cpp + src/wflign_swizzle.cpp src/rkmh.cpp src/murmur3.cpp ) diff --git a/src/common/wflign/src/alignment_printer.hpp b/src/common/wflign/src/alignment_printer.hpp index a7bc378d..4df83767 100644 --- a/src/common/wflign/src/alignment_printer.hpp +++ b/src/common/wflign/src/alignment_printer.hpp @@ -21,6 +21,7 @@ void write_tag_and_md_string( void write_alignment_sam( std::ostream &out, const alignment_t& patch_aln, + const std::string& cigar_str, const std::string& query_name, const uint64_t& query_total_length, const uint64_t& query_offset, @@ -41,6 +42,7 @@ void write_alignment_sam( bool write_alignment_paf( std::ostream& out, const alignment_t& aln, + const std::string& cigar_str, const std::string& query_name, const uint64_t& query_total_length, const uint64_t& query_offset, diff --git a/src/common/wflign/src/wflign.cpp b/src/common/wflign/src/wflign.cpp index 777a45e4..629d6d9f 100644 --- a/src/common/wflign/src/wflign.cpp +++ b/src/common/wflign/src/wflign.cpp @@ -4,6 +4,7 @@ #include "wflign.hpp" #include "wflign_patch.hpp" +#include "wflign_swizzle.hpp" // Namespaces @@ -34,7 +35,10 @@ void do_biwfa_alignment( const bool no_seq_in_sam, const float min_identity, const uint64_t wflign_max_len_minor, - const float mashmap_estimated_identity) { + const float mashmap_estimated_identity, + const int32_t chain_id, + const int32_t chain_length, + const int32_t chain_pos) { // Create WFA aligner with the provided penalties wfa::WFAlignerGapAffine2Pieces wf_aligner( @@ -49,6 +53,7 @@ void do_biwfa_alignment( wf_aligner.setHeuristicNone(); + // Perform the alignment const int status = wf_aligner.alignEnd2End(target, (int)target_length, query, (int)query_length); @@ -64,12 +69,39 @@ void do_biwfa_alignment( // Copy alignment CIGAR wflign_edit_cigar_copy(wf_aligner, &aln.edit_cigar); + + // Convert WFA CIGAR to string format for potential swizzling + std::string cigar_str = wfa_edit_cigar_to_string(aln.edit_cigar); + // Try swizzling the CIGAR at both ends with debug enabled + std::string swizzled = try_swap_start_pattern(cigar_str, query, target, 0, 0); + if (swizzled != cigar_str) { + cigar_str = swizzled; + } else { + } + + swizzled = try_swap_end_pattern(cigar_str, query, target, 0, 0); + if (swizzled != cigar_str) { + cigar_str = swizzled; + } else { + } + + // If the CIGAR changed, update coordinates and alignment + //if (cigar_str != wfa_edit_cigar_to_string(aln.edit_cigar)) { + // Update coordinates based on swizzled CIGAR + //auto new_coords = alignment_end_coords(cigar_str, query_offset, target_offset); + + // Convert back to WFA format + //wfa_string_to_edit_cigar(cigar_str, &aln.edit_cigar); + //} + + //wfa_string_to_edit_cigar(cigar_str, &aln.edit_cigar); // Write alignment if (paf_format_else_sam) { write_alignment_paf( out, aln, + cigar_str, query_name, query_total_length, query_offset, @@ -86,6 +118,7 @@ void do_biwfa_alignment( write_alignment_sam( out, aln, + cigar_str, query_name, query_total_length, query_offset, @@ -451,7 +484,8 @@ void WFlign::wflign_affine_wavefront( query_name, query, query_total_length, query_offset, query_length, query_is_rev, target_name, target, target_total_length, target_offset, target_length, *out, wfa_convex_penalties, emit_md_tag, paf_format_else_sam, no_seq_in_sam, - min_identity, wflign_max_len_minor, mashmap_estimated_identity); + min_identity, wflign_max_len_minor, mashmap_estimated_identity, + -1, 1, 1); // Not part of a chain when using direct biWFA return; } @@ -1134,6 +1168,7 @@ void WFlign::wflign_affine_wavefront( write_alignment_paf( *out, **x, + "", query_name, query_total_length, query_offset, diff --git a/src/common/wflign/src/wflign.hpp b/src/common/wflign/src/wflign.hpp index bf6dd3f4..7767dd68 100644 --- a/src/common/wflign/src/wflign.hpp +++ b/src/common/wflign/src/wflign.hpp @@ -53,7 +53,10 @@ namespace wflign { const bool no_seq_in_sam, const float min_identity, const uint64_t wflign_max_len_minor, - const float mashmap_estimated_identity); + const float mashmap_estimated_identity, + const int32_t chain_id, + const int32_t chain_length, + const int32_t chain_pos); class WFlign { public: diff --git a/src/common/wflign/src/wflign_alignment.cpp b/src/common/wflign/src/wflign_alignment.cpp index b7bcf891..78aae9bc 100644 --- a/src/common/wflign/src/wflign_alignment.cpp +++ b/src/common/wflign/src/wflign_alignment.cpp @@ -513,6 +513,16 @@ char* wfa_alignment_to_cigar( uint64_t& inserted_bp, uint64_t& deletions, uint64_t& deleted_bp) { + // Initialize all counters to 0 + target_aligned_length = 0; + query_aligned_length = 0; + matches = 0; + mismatches = 0; + insertions = 0; + inserted_bp = 0; + deletions = 0; + deleted_bp = 0; + // the edit cigar contains a character string of ops // here we compress them into the standard cigar representation @@ -529,6 +539,7 @@ char* wfa_alignment_to_cigar( (edit_cigar->cigar_ops[i] != lastMove && lastMove != 0)) { // calculate matches, mismatches, insertions, deletions switch (lastMove) { + case '=': case 'M': matches += numOfSameMoves; query_aligned_length += numOfSameMoves; diff --git a/src/common/wflign/src/wflign_cigar_utils.hpp b/src/common/wflign/src/wflign_cigar_utils.hpp new file mode 100644 index 00000000..7b303a49 --- /dev/null +++ b/src/common/wflign/src/wflign_cigar_utils.hpp @@ -0,0 +1,55 @@ +#ifndef WFLIGN_CIGAR_UTILS_HPP +#define WFLIGN_CIGAR_UTILS_HPP + +#include +#include +#include "edit_cigar.hpp" + +namespace wflign { + +// Convert WFA edit_cigar_t to SAM-style CIGAR string +inline std::string wfa_edit_cigar_to_string(const edit_cigar_t& edit_cigar) { + std::stringstream cigar; + int count = 0; + char last_op = '\0'; + + for (int i = edit_cigar.begin_offset; i < edit_cigar.end_offset; ++i) { + const char op = edit_cigar.operations[i]; + if (op == last_op) { + ++count; + } else { + if (count > 0) { + cigar << count << last_op; + } + count = 1; + last_op = op; + } + } + + if (count > 0) { + cigar << count << last_op; + } + + return cigar.str(); +} + +// Convert SAM-style CIGAR string back to WFA edit_cigar_t +inline void wfa_string_to_edit_cigar(const std::string& cigar_str, edit_cigar_t* edit_cigar) { + edit_cigar_clear(edit_cigar); + + int num = 0; + for (char c : cigar_str) { + if (std::isdigit(c)) { + num = num * 10 + (c - '0'); + } else { + for (int i = 0; i < num; ++i) { + edit_cigar_add_operation(edit_cigar, c); + } + num = 0; + } + } +} + +} // namespace wflign + +#endif // WFLIGN_CIGAR_UTILS_HPP diff --git a/src/common/wflign/src/wflign_patch.cpp b/src/common/wflign/src/wflign_patch.cpp index 3d0da6aa..ed5afb8d 100644 --- a/src/common/wflign/src/wflign_patch.cpp +++ b/src/common/wflign/src/wflign_patch.cpp @@ -9,6 +9,141 @@ namespace wflign { + std::tuple trim_deletions( + const std::string& cigar, + uint64_t ref_start, + uint64_t ref_end, + uint64_t query_start, + uint64_t query_end) { + + std::string trimmed_cigar; + uint64_t new_ref_start = ref_start; + uint64_t new_ref_end = ref_end; + uint64_t new_query_start = query_start; + uint64_t new_query_end = query_end; + + // Parse CIGAR string + std::vector> cigar_ops; + size_t i = 0; + while (i < cigar.size()) { + int count = 0; + while (i < cigar.size() && std::isdigit(static_cast(cigar[i]))) { + count = count * 10 + (cigar[i] - '0'); + i++; + } + if (i < cigar.size()) { + cigar_ops.push_back({count, cigar[i]}); + i++; + } + } + + // Find leading deletions and matches + size_t leading_dels = 0; + while (leading_dels < cigar_ops.size() && cigar_ops[leading_dels].second == 'D') { + new_ref_start += cigar_ops[leading_dels].first; + leading_dels++; + } + + // Find trailing deletions and matches + size_t trailing_dels = 0; + while (trailing_dels < cigar_ops.size() - leading_dels && + cigar_ops[cigar_ops.size() - 1 - trailing_dels].second == 'D') { + new_ref_end -= cigar_ops[cigar_ops.size() - 1 - trailing_dels].first; + trailing_dels++; + } + + // Build new CIGAR string and count consumed bases + uint64_t ref_consumed = 0; + uint64_t query_consumed = 0; + for (size_t j = leading_dels; j < cigar_ops.size() - trailing_dels; j++) { + trimmed_cigar += std::to_string(cigar_ops[j].first) + cigar_ops[j].second; + + // Count bases consumed by this operation + switch(cigar_ops[j].second) { + case 'M': + case 'X': + case '=': + ref_consumed += cigar_ops[j].first; + query_consumed += cigar_ops[j].first; + break; + case 'D': + case 'N': + ref_consumed += cigar_ops[j].first; + break; + case 'I': + query_consumed += cigar_ops[j].first; + break; + // S/H operations don't affect coordinates + } + } + + // Adjust coordinates based on consumed bases + new_ref_end = new_ref_start + ref_consumed; + new_query_end = new_query_start + query_consumed; + + return std::make_tuple(trimmed_cigar, new_ref_start, new_ref_end, new_query_start, new_query_end); + } + + // Process a compressed CIGAR string to get alignment metrics + void process_compressed_cigar( + const std::string& cigar_str, + uint64_t& matches, + uint64_t& mismatches, + uint64_t& insertions, + uint64_t& inserted_bp, + uint64_t& deletions, + uint64_t& deleted_bp, + uint64_t& refAlignedLength, + uint64_t& qAlignedLength) { + + matches = mismatches = insertions = inserted_bp = deletions = deleted_bp = refAlignedLength = qAlignedLength = 0; + + + // For a compressed CIGAR string like "50000=", we should get perfect identity + if (cigar_str.find_first_not_of("0123456789") == cigar_str.length() - 1 && cigar_str.back() == '=') { + matches = std::stoi(cigar_str.substr(0, cigar_str.length() - 1)); + refAlignedLength = matches; + qAlignedLength = matches; + return; + } + + size_t pos = 0; + while (pos < cigar_str.length()) { + // Parse the length + size_t next_pos = pos; + while (next_pos < cigar_str.length() && isdigit(cigar_str[next_pos])) { + next_pos++; + } + int length = std::stoi(cigar_str.substr(pos, next_pos - pos)); + char op = cigar_str[next_pos]; + + switch (op) { + case 'M': + case '=': + matches += length; + refAlignedLength += length; + qAlignedLength += length; + break; + case 'X': + mismatches += length; + refAlignedLength += length; + qAlignedLength += length; + break; + case 'I': + insertions++; // Count runs for gap-compressed + inserted_bp += length; // Count individual bases for block + qAlignedLength += length; + break; + case 'D': + deletions++; // Count runs for gap-compressed + deleted_bp += length; // Count individual bases for block + refAlignedLength += length; + break; + } + pos = next_pos + 1; + } + } + void encodeOneStep(const char *filename, std::vector &image, unsigned width, unsigned height) { //Encode the image unsigned error = lodepng::encode(filename, image, width, height); @@ -535,24 +670,39 @@ AlignmentBounds find_alignment_bounds(const alignment_t& aln, const int& erode_k } void trim_alignment(alignment_t& aln) { - // Trim head + // Count leading indels int head_trim_q = 0, head_trim_t = 0; - while (aln.edit_cigar.begin_offset < aln.edit_cigar.end_offset) { - char op = aln.edit_cigar.cigar_ops[aln.edit_cigar.begin_offset]; + int head_trim_ops = 0; + while (aln.edit_cigar.begin_offset + head_trim_ops < aln.edit_cigar.end_offset) { + char op = aln.edit_cigar.cigar_ops[aln.edit_cigar.begin_offset + head_trim_ops]; if (op != 'I' && op != 'D') break; if (op == 'I') head_trim_q++; if (op == 'D') head_trim_t++; - aln.edit_cigar.begin_offset++; + head_trim_ops++; } - // Trim tail + // Count trailing indels int tail_trim_q = 0, tail_trim_t = 0; - while (aln.edit_cigar.end_offset > aln.edit_cigar.begin_offset) { - char op = aln.edit_cigar.cigar_ops[aln.edit_cigar.end_offset - 1]; + int tail_trim_ops = 0; + while (aln.edit_cigar.end_offset - tail_trim_ops > aln.edit_cigar.begin_offset + head_trim_ops) { + char op = aln.edit_cigar.cigar_ops[aln.edit_cigar.end_offset - 1 - tail_trim_ops]; if (op != 'I' && op != 'D') break; if (op == 'I') tail_trim_q++; if (op == 'D') tail_trim_t++; - aln.edit_cigar.end_offset--; + tail_trim_ops++; + } + + // If we found indels to trim, create new trimmed CIGAR + if (head_trim_ops > 0 || tail_trim_ops > 0) { + int new_len = aln.edit_cigar.end_offset - aln.edit_cigar.begin_offset - head_trim_ops - tail_trim_ops; + char* new_cigar = (char*)malloc(new_len * sizeof(char)); + memcpy(new_cigar, + aln.edit_cigar.cigar_ops + aln.edit_cigar.begin_offset + head_trim_ops, + new_len * sizeof(char)); + free(aln.edit_cigar.cigar_ops); + aln.edit_cigar.cigar_ops = new_cigar; + aln.edit_cigar.begin_offset = 0; + aln.edit_cigar.end_offset = new_len; } // Adjust coordinates @@ -1802,16 +1952,37 @@ query_start : query_end) total_target_aligned_length, total_query_aligned_length, matches, mismatches, insertions, inserted_bp, deletions, deleted_bp); + // Calculate metrics from the compressed CIGAR string + uint64_t cigar_matches = 0; + uint64_t cigar_mismatches = 0; + uint64_t cigar_insertions = 0; + uint64_t cigar_inserted_bp = 0; + uint64_t cigar_deletions = 0; + uint64_t cigar_deleted_bp = 0; + uint64_t cigar_refAlignedLength = 0; + uint64_t cigar_qAlignedLength = 0; + + process_compressed_cigar( + std::string(cigarv), + cigar_matches, + cigar_mismatches, + cigar_insertions, + cigar_inserted_bp, + cigar_deletions, + cigar_deleted_bp, + cigar_refAlignedLength, + cigar_qAlignedLength); + const double gap_compressed_identity = - (double)matches / - (double)(matches + mismatches + insertions + deletions); + (double)cigar_matches / + (double)(cigar_matches + cigar_mismatches + cigar_insertions + cigar_deletions); if (gap_compressed_identity >= min_identity) { - const uint64_t edit_distance = mismatches + inserted_bp + deleted_bp; + const uint64_t edit_distance = cigar_mismatches + cigar_inserted_bp + cigar_deleted_bp; // identity over the full block const double block_identity = - (double)matches / (double)(matches + edit_distance); + (double)cigar_matches / (double)(cigar_matches + edit_distance); auto write_tag_and_md_string = [&](std::ostream &out, const char *c, const int target_start) { @@ -1908,8 +2079,8 @@ query_start : query_end) << "\t" << (query_is_rev ? "-" : "+") << "\t" << target_name << "\t" << target_total_length << "\t" << target_offset - target_pointer_shift + target_start << "\t" - << target_offset + target_end << "\t" << matches << "\t" - << matches + mismatches + inserted_bp + deleted_bp + << target_offset + target_end << "\t" << cigar_matches << "\t" + << cigar_matches + cigar_mismatches + cigar_inserted_bp + cigar_deleted_bp << "\t" << std::round(float2phred(1.0 - block_identity)) //<< "\t" << "as:i:" << total_score @@ -2032,7 +2203,7 @@ query_start : query_end) // Write the patch alignments for (auto& patch_aln : multi_patch_alns) { write_alignment_sam( - out, patch_aln, query_name, query_total_length, + out, patch_aln, "", query_name, query_total_length, query_offset, query_length, query_is_rev, target_name, target_total_length, target_offset, target_length, min_identity, mashmap_estimated_identity, @@ -2053,6 +2224,7 @@ query_start : query_end) bool wrote = write_alignment_paf( out, patch_aln, + "", query_name, query_total_length, query_offset, @@ -2163,6 +2335,7 @@ void write_tag_and_md_string( void write_alignment_sam( std::ostream &out, const alignment_t& patch_aln, + const std::string& cigar_str, const std::string& query_name, const uint64_t& query_total_length, const uint64_t& query_offset, @@ -2180,6 +2353,8 @@ void write_alignment_sam( const char* target, const int64_t& target_pointer_shift) { + if (cigar_str == "") { std::cerr << "[wflign_patch] unsupported codepath" << std::endl; exit(1); } + uint64_t patch_matches = 0; uint64_t patch_mismatches = 0; uint64_t patch_insertions = 0; @@ -2189,10 +2364,35 @@ void write_alignment_sam( uint64_t patch_refAlignedLength = 0; uint64_t patch_qAlignedLength = 0; - char* patch_cigar = wfa_alignment_to_cigar( - &patch_aln.edit_cigar, patch_refAlignedLength, patch_qAlignedLength, - patch_matches, patch_mismatches, patch_insertions, patch_inserted_bp, - patch_deletions, patch_deleted_bp); + process_compressed_cigar( + cigar_str, + patch_matches, + patch_mismatches, + patch_insertions, + patch_inserted_bp, + patch_deletions, + patch_deleted_bp, + patch_refAlignedLength, + patch_qAlignedLength); + + // Trim deletions and get new coordinates + auto [trimmed_cigar, new_ref_start, new_ref_end, new_query_start, new_query_end] = + trim_deletions(cigar_str, target_offset + patch_aln.i, target_offset + patch_aln.i + patch_refAlignedLength, + query_offset + patch_aln.j, query_offset + patch_aln.j + patch_qAlignedLength); + + // Recompute metrics with trimmed CIGAR + process_compressed_cigar( + trimmed_cigar, + patch_matches, + patch_mismatches, + patch_insertions, + patch_inserted_bp, + patch_deletions, + patch_deleted_bp, + patch_refAlignedLength, + patch_qAlignedLength); + + char* patch_cigar = strdup(trimmed_cigar.c_str()); double patch_gap_compressed_identity = (double)patch_matches / (double)(patch_matches + patch_mismatches + patch_insertions + patch_deletions); @@ -2256,6 +2456,7 @@ void write_alignment_sam( bool write_alignment_paf( std::ostream& out, const alignment_t& aln, + const std::string& cigar_str, const std::string& query_name, const uint64_t& query_total_length, const uint64_t& query_offset, // query offset on the forward strand @@ -2270,6 +2471,7 @@ bool write_alignment_paf( const bool& with_endline, const bool& is_rev_patch) { bool ret = false; // return true if we wrote the alignment + if (cigar_str == "") { std::cerr << "[wflign_patch] unsupported codepath" << std::endl; exit(1); } if (aln.ok) { uint64_t matches = 0; @@ -2281,11 +2483,36 @@ bool write_alignment_paf( uint64_t refAlignedLength = 0; uint64_t qAlignedLength = 0; - char *cigar = wfa_alignment_to_cigar( - &aln.edit_cigar, refAlignedLength, qAlignedLength, matches, - mismatches, insertions, inserted_bp, deletions, deleted_bp); - - size_t alignmentRefPos = aln.i; + process_compressed_cigar( + cigar_str, + matches, + mismatches, + insertions, + inserted_bp, + deletions, + deleted_bp, + refAlignedLength, + qAlignedLength); + + // Trim deletions and get new coordinates + auto [trimmed_cigar, new_ref_start, new_ref_end, new_query_start, new_query_end] = + trim_deletions(cigar_str, target_offset + aln.i, target_offset + aln.i + refAlignedLength, + query_offset + aln.j, query_offset + aln.j + qAlignedLength); + + // Recompute metrics with trimmed CIGAR + process_compressed_cigar( + trimmed_cigar, + matches, + mismatches, + insertions, + inserted_bp, + deletions, + deleted_bp, + refAlignedLength, + qAlignedLength); + + char* cigar = strdup(trimmed_cigar.c_str()); + size_t alignmentRefPos = new_ref_start - target_offset; double gap_compressed_identity = (double)matches / (double)(matches + mismatches + insertions + deletions); @@ -2302,6 +2529,7 @@ bool write_alignment_paf( q_start = query_offset + aln.j; q_end = query_offset + aln.j + qAlignedLength; } + out << query_name << "\t" << query_total_length << "\t" << q_start << "\t" << q_end << "\t" << (aln.is_rev ^ query_is_rev ? "-" : "+") << "\t" << target_name << "\t" diff --git a/src/common/wflign/src/wflign_swizzle.cpp b/src/common/wflign/src/wflign_swizzle.cpp new file mode 100644 index 00000000..4129e69f --- /dev/null +++ b/src/common/wflign/src/wflign_swizzle.cpp @@ -0,0 +1,426 @@ +#include "wflign_swizzle.hpp" +#include + +namespace wflign { + +// Implementation copied from cigar_swap.cpp and wrapped in namespace +static std::string merge_cigar_ops(const std::string &cigar) { + std::string merged; + int current_count = 0; + char current_op = '\0'; + + size_t i = 0; + while (i < cigar.size()) { + int val = 0; + while (i < cigar.size() && std::isdigit(static_cast(cigar[i]))) { + val = val * 10 + (cigar[i] - '0'); + i++; + } + if (i >= cigar.size()) break; + char op = cigar[i++]; + if (op == current_op) { + current_count += val; + } else { + if (current_op != '\0') { + merged += std::to_string(current_count); + merged += current_op; + } + current_op = op; + current_count = val; + } + } + if (current_op != '\0') { + merged += std::to_string(current_count); + merged += current_op; + } + return merged; +} + +static bool sequences_match( + const std::string &query_seq, + const std::string &target_seq, + int64_t query_start, + int64_t target_start, + int N, + bool debug) { + if (query_start < 0 || target_start < 0) { + return false; + } + if (query_start + N > (int64_t)query_seq.size()) { + return false; + } + if (target_start + N > (int64_t)target_seq.size()) { + return false; + } + for (int i = 0; i < N; i++) { + if (query_seq[query_start + i] != target_seq[target_start + i]) { + return false; + } + } + return true; +} + +static bool verify_cigar_alignment( + const std::string &cigar, + const std::string &query_seq, + const std::string &target_seq, + int64_t query_start, + int64_t target_start, + bool debug) { + if (true) { + } + + int64_t qPos = query_start; + int64_t tPos = target_start; + + size_t i = 0; + while (i < cigar.size()) { + int val = 0; + while (i < cigar.size() && std::isdigit(static_cast(cigar[i]))) { + val = val * 10 + (cigar[i] - '0'); + i++; + } + if (i >= cigar.size()) break; + char op = cigar[i++]; + + if (op == '=') { + if (qPos < 0 || tPos < 0 || + qPos + val > (int64_t)query_seq.size() || + tPos + val > (int64_t)target_seq.size()) { + return false; + } + for (int k = 0; k < val; k++) { + if (query_seq[qPos + k] != target_seq[tPos + k]) { + return false; + } + } + qPos += val; + tPos += val; + } else if (op == 'D') { + if (tPos + val > (int64_t)target_seq.size()) { + return false; + } + tPos += val; + } else { + return false; + } + } + return true; +} + +static bool extract_first_two_ops( + const std::string &cigar, + int &count1, char &op1, + int &count2, char &op2, + size_t &second_op_start, + size_t &second_op_end) { + size_t i = 0; + { + int val = 0; + while (i < cigar.size() && std::isdigit(static_cast(cigar[i]))) { + val = val * 10 + (cigar[i] - '0'); + i++; + } + if (i >= cigar.size()) return false; + op1 = cigar[i++]; + count1 = val; + } + second_op_start = i; + { + int val = 0; + while (i < cigar.size() && std::isdigit(static_cast(cigar[i]))) { + val = val * 10 + (cigar[i] - '0'); + i++; + } + if (i >= cigar.size()) return false; + op2 = cigar[i++]; + count2 = val; + second_op_end = i; + } + return true; +} + +static bool extract_last_two_ops( + const std::string &cigar, + int &count2, char &op2, + int &count1, char &op1, + size_t &second_last_op_start, + size_t &last_op_start) { + if (cigar.empty()) return false; + + size_t pos = cigar.size(); + while (pos > 0 && std::isdigit(static_cast(cigar[pos - 1]))) { + pos--; + } + if (pos == 0) return false; + op2 = cigar[pos - 1]; + { + size_t digit_end = pos - 1; + int val = 0; + int factor = 1; + size_t j = digit_end; + while (j > 0 && std::isdigit(static_cast(cigar[j - 1]))) { + val += (cigar[j - 1] - '0') * factor; + factor *= 10; + j--; + } + count2 = val; + last_op_start = j; + } + + if (last_op_start == 0) return false; + { + size_t pos2 = last_op_start; + while (pos2 > 0 && std::isdigit(static_cast(cigar[pos2 - 1]))) { + pos2--; + } + if (pos2 == 0) return false; + op1 = cigar[pos2 - 1]; + { + size_t digit_end = pos2 - 1; + int val = 0; + int factor = 1; + size_t j = digit_end; + while (j > 0 && std::isdigit(static_cast(cigar[j - 1]))) { + val += (cigar[j - 1] - '0') * factor; + factor *= 10; + j--; + } + count1 = val; + second_last_op_start = j; + } + } + return true; +} + +std::pair alignment_end_coords( + const std::string &cigar, + int64_t query_start, + int64_t target_start) { + int64_t qPos = query_start; + int64_t tPos = target_start; + size_t i = 0; + while (i < cigar.size()) { + int val = 0; + while (i < cigar.size() && std::isdigit(static_cast(cigar[i]))) { + val = val * 10 + (cigar[i] - '0'); + i++; + } + if (i >= cigar.size()) break; + char op = cigar[i++]; + if (op == '=') { + qPos += val; + tPos += val; + } else if (op == 'D') { + tPos += val; + } + } + return {qPos, tPos}; +} + +std::string try_swap_start_pattern( + const std::string &cigar, + const std::string &query_seq, + const std::string &target_seq, + int64_t query_start, + int64_t target_start) { + + /* + if (!verify_cigar_alignment(cigar, query_seq, target_seq, query_start, target_start, true)) { + return cigar; + } + */ + + int N, Dlen; + char op1, op2; + size_t second_op_start, second_op_end; + if (!extract_first_two_ops(cigar, N, op1, Dlen, op2, second_op_start, second_op_end)) { + return cigar; + } + + + if (op1 == '=' && op2 == 'D') { + // Always enable debug logging + const bool debug = true; + + // Note: target_seq is already offset by the padding, so we use 0-based coords + if (sequences_match(query_seq, target_seq, query_start, 0, N, debug)) { + // When we move a deletion from after the match to before it, + // we need to adjust target_start since the deletion now comes first + int64_t new_target_start = target_start - Dlen; + + + std::string remainder = cigar.substr(second_op_end); + std::string swapped = std::to_string(Dlen) + "D" + + std::to_string(N) + "=" + + remainder; + swapped = merge_cigar_ops(swapped); + + // Verify the entire new alignment with adjusted target_start + /* + if (!verify_cigar_alignment(swapped, query_seq, target_seq, query_start, new_target_start, debug)) { + if (debug) { + } + return cigar; + } + */ + return swapped; + } + } + return cigar; +} + +std::string try_swap_end_pattern( + const std::string &cigar, + const std::string &query_seq, + const std::string &target_seq, + int64_t query_start, + int64_t target_start) { + //if (!verify_cigar_alignment(cigar, query_seq, target_seq, query_start, target_start)) { + // return cigar; + //} + + int count2, count1; + char op2, op1; + size_t second_last_op_start, last_op_start; + if (!extract_last_two_ops(cigar, count2, op2, count1, op1, second_last_op_start, last_op_start)) { + return cigar; + } + + if (op1 == 'D' && op2 == '=') { + int N = count2; + int Dlen = count1; + auto endCoords = alignment_end_coords(cigar, query_start, target_start); + int64_t endQ = endCoords.first; + int64_t endT = endCoords.second; + + if (sequences_match(query_seq, target_seq, endQ - N, endT - N - Dlen, N)) { + std::string before = cigar.substr(0, second_last_op_start); + std::string swapped = before + + std::to_string(N) + "=" + + std::to_string(Dlen) + "D"; + swapped = merge_cigar_ops(swapped); + if (!verify_cigar_alignment(swapped, query_seq, target_seq, query_start, target_start)) { + return cigar; + } + return swapped; + } + } + return cigar; +} + +static std::string drop_leading_trailing_deletions_if_all_eq( + const std::string &cigar) { + std::vector> ops; + { + size_t i = 0; + while (i < cigar.size()) { + int val = 0; + while (i < cigar.size() && std::isdigit(static_cast(cigar[i]))) { + val = val * 10 + (cigar[i] - '0'); + i++; + } + if (i >= cigar.size()) break; + char op = cigar[i++]; + ops.push_back(std::make_pair(val, (int)op)); + } + } + if (ops.empty()) return cigar; + + for (auto &p : ops) { + char op = (char)p.second; + if (op != 'D' && op != '=') { + return cigar; + } + } + + bool found_eq = false; + for (size_t i = 0; i < ops.size(); i++) { + char op = (char)ops[i].second; + if (op == '=') { + found_eq = true; + break; + } + } + if (!found_eq) { + return cigar; + } + + char firstOp = (char)ops[0].second; + char lastOp = (char)ops[ops.size()-1].second; + + if (firstOp != 'D' || lastOp != 'D') { + return cigar; + } + + ops.erase(ops.begin()); + ops.pop_back(); + + std::string out; + for (size_t i = 0; i < ops.size(); i++) { + out += std::to_string(ops[i].first); + out += (char)ops[i].second; + } + + out = merge_cigar_ops(out); + return out; +} + +// Convert edit_cigar_t to string CIGAR format +std::string wfa_edit_cigar_to_string(const wflign_cigar_t& edit_cigar) { + std::string cigar; + int count = 0; + char last_op = '\0'; + for (int i = edit_cigar.begin_offset; i < edit_cigar.end_offset; ++i) { + char op = edit_cigar.cigar_ops[i]; + if (op == 'M') { + op = '='; // Convert any M to = since these are exact matches + } + if (op == last_op) { + count++; + } else { + if (last_op != '\0') { + cigar += std::to_string(count) + last_op; + } + last_op = op; + count = 1; + } + } + if (last_op != '\0') { + cigar += std::to_string(count) + last_op; + } + return cigar; +} + +// Convert string CIGAR format to edit_cigar_t +void wfa_string_to_edit_cigar(const std::string& cigar_str, wflign_cigar_t* edit_cigar) { + // Clear existing cigar + edit_cigar->begin_offset = 0; + edit_cigar->end_offset = 0; + + std::vector ops; + size_t i = 0; + while (i < cigar_str.size()) { + // Parse count + int count = 0; + while (i < cigar_str.size() && std::isdigit(static_cast(cigar_str[i]))) { + count = count * 10 + (cigar_str[i] - '0'); + i++; + } + if (i >= cigar_str.size()) break; + + // Get operation and convert M to = since we know these are exact matches + char op = cigar_str[i++]; + if (op == 'M') { + op = '='; // Convert M to = since we're dealing with exact matches + } + + // Add operation count times + for (int j = 0; j < count; j++) { + edit_cigar->cigar_ops[edit_cigar->end_offset++] = op; + } + } +} + +} // namespace wflign + diff --git a/src/common/wflign/src/wflign_swizzle.hpp b/src/common/wflign/src/wflign_swizzle.hpp new file mode 100644 index 00000000..7aaf0357 --- /dev/null +++ b/src/common/wflign/src/wflign_swizzle.hpp @@ -0,0 +1,69 @@ +#pragma once + +#include +#include +#include +#include +#include "../deps/WFA2-lib/bindings/cpp/WFAligner.hpp" +#include "wflign_alignment.hpp" + +namespace wflign { + +// Forward declare CIGAR conversion functions +std::string wfa_edit_cigar_to_string(const wflign_cigar_t& edit_cigar); +void wfa_string_to_edit_cigar(const std::string& cigar_str, wflign_cigar_t* edit_cigar); + +// Try to swap start pattern: "N= DlenD" => "DlenD N=" +std::string try_swap_start_pattern( + const std::string &cigar, + const std::string &query_seq, + const std::string &target_seq, + int64_t query_start, + int64_t target_start); + +// Try to swap end pattern: "DlenD N=" => "N= DlenD" +std::string try_swap_end_pattern( + const std::string &cigar, + const std::string &target_seq, + const std::string &query_seq, + int64_t query_start, + int64_t target_start); + +// Drop leading/trailing deletions if the rest is purely '=' +static std::string drop_leading_trailing_deletions_if_all_eq( + const std::string &cigar); + +// Helper functions +static std::string merge_cigar_ops(const std::string &cigar); +static bool sequences_match( + const std::string &query_seq, + const std::string &target_seq, + int64_t query_start, + int64_t target_start, + int N, + bool debug = false); +static bool extract_first_two_ops( + const std::string &cigar, + int &count1, char &op1, + int &count2, char &op2, + size_t &second_op_start, + size_t &second_op_end); +static bool extract_last_two_ops( + const std::string &cigar, + int &count2, char &op2, + int &count1, char &op1, + size_t &second_last_op_start, + size_t &last_op_start); +std::pair alignment_end_coords( + const std::string &cigar, + int64_t query_start, + int64_t target_start); +static bool verify_cigar_alignment( + const std::string &cigar, + const std::string &query_seq, + const std::string &target_seq, + int64_t query_start, + int64_t target_start, + bool debug = false); + +} // namespace wflign diff --git a/src/interface/parse_args.hpp b/src/interface/parse_args.hpp index f9e206b8..27d4a3fa 100644 --- a/src/interface/parse_args.hpp +++ b/src/interface/parse_args.hpp @@ -124,6 +124,7 @@ void parse_args(int argc, args::Group alignment_opts(options_group, "Alignment:"); args::ValueFlag input_mapping(alignment_opts, "FILE", "input PAF file for alignment", {'i', "align-paf"}); + args::ValueFlag target_padding(alignment_opts, "INT", "padding around target sequence [0]", {'E', "target-padding"}); args::ValueFlag wfa_params(alignment_opts, "vals", "scoring: mismatch, gap1(o,e), gap2(o,e) [6,6,2,26,1]", {'g', "wfa-params"}); @@ -478,6 +479,17 @@ void parse_args(int argc, align_parameters.wflign_min_inv_patch_len = 23; align_parameters.wflign_max_patching_score = 0; // will trigger estimation based on gap penalties and sequence length + if (target_padding) { + const int64_t p = handy_parameter(args::get(target_padding)); + if (p < 0) { + std::cerr << "[wfmash] ERROR: target padding must be >= 0" << std::endl; + exit(1); + } + align_parameters.target_padding = p; + } else { + align_parameters.target_padding = 100; + } + if (thread_count) { map_parameters.threads = args::get(thread_count); align_parameters.threads = args::get(thread_count); diff --git a/src/map/include/base_types.hpp b/src/map/include/base_types.hpp index 0a3bab44..6116b9e2 100644 --- a/src/map/include/base_types.hpp +++ b/src/map/include/base_types.hpp @@ -177,6 +177,9 @@ namespace skch bool selfMapFilter; // set to true if a long-to-short mapping in all-vs-all mode (we report short as the query) double chainPairScore; // best score for potential chain pair int64_t chainPairId; // best partner mapping for potential chain pair + int32_t chain_id{-1}; //unique ID for this chain (-1 if not part of chain) + int32_t chain_length{1}; //total segments in chain (1 if not part of chain) + int32_t chain_pos{1}; //position in chain, 1-based (1 if not part of chain) offset_t qlen() { //length of this mapping on query axis return queryEndPos - queryStartPos + 1; diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp index c7528e72..7549a9f6 100644 --- a/src/map/include/computeMap.hpp +++ b/src/map/include/computeMap.hpp @@ -1154,6 +1154,17 @@ namespace skch std::sort(l2Mappings.begin(), l2Mappings.end(), [](const auto& a, const auto& b) { return std::tie(a.refSeqId, a.refStartPos) < std::tie(b.refSeqId, b.refStartPos); }); + // Add chain information + // All mappings in this batch form a chain + int32_t chain_id = maxChainIdSeen.fetch_add(1, std::memory_order_relaxed); + int32_t chain_length = l2Mappings.size(); + int32_t chain_pos = 1; + for (auto& mapping : l2Mappings) { + mapping.chain_id = chain_id; + mapping.chain_length = chain_length; + mapping.chain_pos = chain_pos++; + } + #ifdef ENABLE_TIME_PROFILE_L1_L2 { std::chrono::duration timeSpentL2 = skch::Time::now() - t1; @@ -2363,6 +2374,41 @@ namespace skch void reportReadMappings(MappingResultsVector_t &readMappings, const std::string &queryName, std::ostream &outstrm) { + // Sort mappings by chain ID and query position + std::sort(readMappings.begin(), readMappings.end(), + [](const MappingResult &a, const MappingResult &b) { + return std::tie(a.splitMappingId, a.queryStartPos) + < std::tie(b.splitMappingId, b.queryStartPos); + }); + + // Assign chain positions within each chain + int current_chain = -1; + int chain_pos = 0; + int chain_length = 0; + + // First pass - count chain lengths + for (size_t i = 0; i < readMappings.size(); ++i) { + if (readMappings[i].splitMappingId != current_chain) { + current_chain = readMappings[i].splitMappingId; + chain_length = 1; + // Count forward to find chain length + for (size_t j = i + 1; j < readMappings.size(); ++j) { + if (readMappings[j].splitMappingId == current_chain) { + chain_length++; + } else { + break; + } + } + // Assign length to all mappings in this chain + readMappings[i].chain_length = chain_length; + chain_pos = 1; + } else { + readMappings[i].chain_length = chain_length; + chain_pos++; + } + readMappings[i].chain_pos = chain_pos; + } + //Print the results for(auto &e : readMappings) { @@ -2390,7 +2436,7 @@ namespace skch { outstrm << sep << "jc:f:" << float(e.conservedSketches) / e.sketchSize; } else { - outstrm << sep << "chain:i:" << e.splitMappingId; + outstrm << sep << "chain:i:" << e.splitMappingId << "." << e.chain_pos << "." << e.chain_length; } } else {