Skip to content

Commit af09464

Browse files
committed
r1263: ~5-10% performance improvement
Via larger batches and more short-read heuristics. Identical alignment on 2 million reads. Short DNA-seq read alignment may be improved in corner cases.
1 parent d43f356 commit af09464

5 files changed

Lines changed: 36 additions & 26 deletions

File tree

align.c

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -603,7 +603,7 @@ static inline void mm_get_junc(const mm_idx_t *mi, int32_t ctg, int32_t st, int3
603603

604604
static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, uint8_t *qseq0[2], mm_reg1_t *r, mm_reg1_t *r2, int n_a, mm128_t *a, ksw_extz_t *ez, int splice_flag)
605605
{
606-
int is_sr = !!(opt->flag & MM_F_SR), is_splice = !!(opt->flag & MM_F_SPLICE);
606+
int is_sr = !!(opt->flag & MM_F_SR), is_splice = !!(opt->flag & MM_F_SPLICE), is_sr_rna = (!!(opt->flag & MM_F_SR_RNA) && is_splice);
607607
int32_t rid = a[r->as].x<<1>>33, rev = a[r->as].x>>63, as1, cnt1;
608608
uint8_t *tseq, *qseq, *junc;
609609
int32_t i, l, bw, bw_long, dropped = 0, ksw_flag = 0, rs0, re0, qs0, qe0;
@@ -773,14 +773,18 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
773773
mm_idx_getseq(mi, rid, rs, re, tseq);
774774
}
775775
mm_get_junc(mi, rid, rs, re, !!(ksw_flag&KSW_EZ_SPLICE_REV), junc);
776-
if (is_sr) { // perform ungapped alignment
776+
if (is_sr || (is_sr_rna && qe - qs == re - rs)) { // perform ungapped alignment
777+
int32_t max_gapped_score = (qe - qs - 2) * opt->a - 2 * (opt->q + opt->e);
777778
assert(qe - qs == re - rs);
778779
ksw_reset_extz(ez);
779780
for (j = 0, ez->score = 0; j < qe - qs; ++j) {
780-
if (qseq[j] >= 4 || tseq[j] >= 4) ez->score += opt->e2;
781+
if (qseq[j] >= 4 || tseq[j] >= 4) ez->score += opt->sc_ambi > 0? -opt->sc_ambi : opt->sc_ambi;
781782
else ez->score += qseq[j] == tseq[j]? opt->a : -opt->b;
782783
}
783-
ez->cigar = ksw_push_cigar(km, &ez->n_cigar, &ez->m_cigar, ez->cigar, MM_CIGAR_MATCH, qe - qs);
784+
if (ez->score > max_gapped_score)
785+
ez->cigar = ksw_push_cigar(km, &ez->n_cigar, &ez->m_cigar, ez->cigar, MM_CIGAR_MATCH, qe - qs);
786+
else
787+
mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, junc, mat, bw1, -1, opt->zdrop, ksw_flag|KSW_EZ_APPROX_MAX, ez);
784788
} else { // perform normal gapped alignment
785789
mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, junc, mat, bw1, -1, opt->zdrop, ksw_flag|KSW_EZ_APPROX_MAX, ez); // first pass: with approximate Z-drop
786790
}

main.c

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ static ko_longopt_t long_options[] = {
3535
{ "splice", ko_no_argument, 310 },
3636
{ "cost-non-gt-ag", ko_required_argument, 'C' },
3737
{ "no-long-join", ko_no_argument, 312 },
38-
{ "sr", ko_no_argument, 313 },
38+
{ "sr", ko_optional_argument, 313 },
3939
{ "frag", ko_required_argument, 314 },
4040
{ "secondary", ko_required_argument, 315 },
4141
{ "cs", ko_optional_argument, 316 },
@@ -218,7 +218,6 @@ int main(int argc, char *argv[])
218218
else if (c == 309) mm_dbg_flag |= MM_DBG_PRINT_QNAME | MM_DBG_PRINT_ALN_SEQ, n_threads = 1; // --print-aln-seq
219219
else if (c == 310) opt.flag |= MM_F_SPLICE; // --splice
220220
else if (c == 312) opt.flag |= MM_F_NO_LJOIN; // --no-long-join
221-
else if (c == 313) opt.flag |= MM_F_SR; // --sr
222221
else if (c == 317) opt.end_bonus = atoi(o.arg); // --end-bonus
223222
else if (c == 318) opt.flag |= MM_F_INDEPEND_SEG; // --no-pairing (deprecated)
224223
else if (c == 320) ipt.flag |= MM_I_NO_SEQ; // --idx-no-seq
@@ -257,6 +256,17 @@ int main(int argc, char *argv[])
257256
else if (c == 501) mm_dbg_flag |= MM_DBG_SEED_FREQ; // --dbg-seed-occ
258257
else if (c == 330) {
259258
fprintf(stderr, "[WARNING] \033[1;31m --lj-min-ratio has been deprecated.\033[0m\n");
259+
} else if (c == 313) { // --sr
260+
if (o.arg == 0 || strcmp(o.arg, "dna") == 0) {
261+
opt.flag |= MM_F_SR;
262+
} else if (strcmp(o.arg, "rna") == 0) {
263+
opt.flag |= MM_F_SR_RNA;
264+
} else if (strcmp(o.arg, "no") == 0) {
265+
opt.flag &= ~(uint64_t)(MM_F_SR|MM_F_SR_RNA);
266+
} else if (mm_verbose >= 2) {
267+
opt.flag |= MM_F_SR;
268+
fprintf(stderr, "[WARNING]\033[1;31m --sr only takes 'dna' or 'rna'. Invalid values are assumed to be 'dna'.\033[0m\n");
269+
}
260270
} else if (c == 314) { // --frag
261271
yes_or_no(&opt, MM_F_FRAG_MODE, o.longidx, o.arg, 1);
262272
} else if (c == 315) { // --secondary

minimap.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
#include <stdio.h>
66
#include <sys/types.h>
77

8-
#define MM_VERSION "2.28-r1261-dirty"
8+
#define MM_VERSION "2.28-r1263-dirty"
99

1010
#define MM_F_NO_DIAG (0x001LL) // no exact diagonal hit
1111
#define MM_F_NO_DUAL (0x002LL) // skip pairs where query name is lexicographically larger than target name
@@ -46,6 +46,7 @@
4646
#define MM_F_SECONDARY_SEQ (0x1000000000LL) //output SEQ field for seqondary alignments using hard clipping
4747
#define MM_F_OUT_DS (0x2000000000LL)
4848
#define MM_F_WEAK_PAIRING (0x4000000000LL)
49+
#define MM_F_SR_RNA (0x8000000000LL)
4950

5051
#define MM_I_HPC 0x1
5152
#define MM_I_NO_SEQ 0x2

minimap2.1

Lines changed: 12 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -297,11 +297,13 @@ maximum alignment gap is mostly controlled by
297297
.B --splice
298298
Enable the splice alignment mode.
299299
.TP
300-
.B --sr
301-
Enable short-read alignment heuristics. In the short-read mode, minimap2
302-
applies a second round of chaining with a higher minimizer occurrence threshold
303-
if no good chain is found. In addition, minimap2 attempts to patch gaps between
304-
seeds with ungapped alignment.
300+
.BR --sr [= no | dna | rna ]
301+
Enable short-read alignment heuristics [no]. If this option is used with no argument,
302+
.RB ` dna '
303+
is set. In the DNA short-read mode, minimap2 applies a second round of chaining
304+
with a higher minimizer occurrence threshold if no good chain is found. In
305+
addition, minimap2 attempts to patch gaps between seeds with ungapped
306+
alignment.
305307
.TP
306308
.BI --split-prefix \ STR
307309
Prefix to create temporary files. Typically used for a multi-part index.
@@ -520,20 +522,13 @@ Copy input FASTA/Q comments to output.
520522
.B -c
521523
Generate CIGAR. In PAF, the CIGAR is written to the `cg' custom tag.
522524
.TP
523-
.BI --cs[= STR ]
525+
.BR --cs [= short | long ]
524526
Output the
525527
.B cs
526528
tag.
527-
.I STR
528-
can be either
529-
.I short
530-
or
531-
.IR long .
532-
If no
533-
.I STR
534-
is given,
535-
.I short
536-
is assumed. [none]
529+
If no argument is given,
530+
.RB ` short '
531+
is set. [none]
537532
.TP
538533
.B --MD
539534
Output the MD tag (see the SAM spec).
@@ -689,7 +684,7 @@ Spliced alignment for accurate long RNA-seq reads such as PacBio iso-seq
689684
.B splice:sr
690685
Spliced alignment for short RNA-seq reads
691686
.RB ( -xsplice:hq
692-
.B --frag=yes -m25 -s40 -2K50m --heap-sort=yes --pairing=weak
687+
.B --frag=yes -m25 -s40 -2K100m --heap-sort=yes --pairing=weak --sr=rna
693688
.BR --secondary=no ).
694689
.TP
695690
.B sr

options.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -179,13 +179,13 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
179179
if (strcmp(preset, "splice:hq") == 0) {
180180
mo->noncan = 5, mo->b = 4, mo->q = 6, mo->q2 = 24;
181181
} else if (strcmp(preset, "splice:sr") == 0) {
182-
mo->flag |= MM_F_NO_PRINT_2ND | MM_F_2_IO_THREADS | MM_F_HEAP_SORT | MM_F_FRAG_MODE | MM_F_WEAK_PAIRING;
182+
mo->flag |= MM_F_NO_PRINT_2ND | MM_F_2_IO_THREADS | MM_F_HEAP_SORT | MM_F_FRAG_MODE | MM_F_WEAK_PAIRING | MM_F_SR_RNA;
183183
mo->noncan = 5, mo->b = 4, mo->q = 6, mo->q2 = 24;
184184
mo->min_chain_score = 25;
185185
mo->min_dp_max = 40;
186186
mo->pe_ori = 0<<1|1; // FR
187187
mo->best_n = 10;
188-
mo->mini_batch_size = 50000000;
188+
mo->mini_batch_size = 100000000;
189189
}
190190
} else return -1;
191191
return 0;

0 commit comments

Comments
 (0)