From 2842e484b7fdee3705c5cd337cab0799eca62954 Mon Sep 17 00:00:00 2001 From: Marcel Levstek <62072754+marcellevstek@users.noreply.github.com> Date: Thu, 5 Dec 2024 09:02:25 +0100 Subject: [PATCH 1/2] Add BAM output option to ``vc-gatk4-hc`` --- docs/CHANGELOG.rst | 1 + .../processes/variant_calling/gatk4_hc.py | 36 +++++++++++++++++- .../output/56GSID_10k.rna-seq.gatkHC.bam | Bin 0 -> 6643 bytes .../output/56GSID_10k.rna-seq.gatkHC.bam.bai | Bin 0 -> 96 bytes .../tests/processes/test_variant_calling.py | 8 ++++ 5 files changed, 44 insertions(+), 1 deletion(-) create mode 100644 resolwe_bio/tests/files/haplotypecaller/output/56GSID_10k.rna-seq.gatkHC.bam create mode 100644 resolwe_bio/tests/files/haplotypecaller/output/56GSID_10k.rna-seq.gatkHC.bam.bai diff --git a/docs/CHANGELOG.rst b/docs/CHANGELOG.rst index 9f3bf8022..725cc539d 100644 --- a/docs/CHANGELOG.rst +++ b/docs/CHANGELOG.rst @@ -32,6 +32,7 @@ Added - Add filtering ``Variant`` by ``VariantExperiment`` and ``VariantCall`` - Use generic permission filters when filtering variants by related models - Return only distinct ``VariantExperiment`` objects +- Add ``--bam-output`` input argument to ``vc-gatk4-hc`` =================== diff --git a/resolwe_bio/processes/variant_calling/gatk4_hc.py b/resolwe_bio/processes/variant_calling/gatk4_hc.py index aec948f4f..d362baa92 100644 --- a/resolwe_bio/processes/variant_calling/gatk4_hc.py +++ b/resolwe_bio/processes/variant_calling/gatk4_hc.py @@ -36,7 +36,7 @@ class GatkHaplotypeCaller(Process): name = "GATK4 (HaplotypeCaller)" category = "GATK" process_type = "data:variants:vcf:gatk:hc" - version = "1.5.0" + version = "1.6.0" scheduling_class = SchedulingClass.BATCH entity = {"type": "sample"} requirements = { @@ -113,6 +113,27 @@ class Advanced: default=12, description="Set the maximum Java heap size (in GB).", ) + bam_out = BooleanField( + label="Output a BAM containing assembled haplotypes.", + default=False, + description="Output a BAM containing assembled haplotypes. " + "Really for debugging purposes only. " + "Note that the output here does not include uninformative reads so that not every input read is emitted to the bam. " + "Turning on this mode may result in serious performance cost for the caller.", + ) + bam_writer_type = StringField( + label="BAM writer type", + default="CALLED_HAPLOTYPES", + description="The type of BAM output. " + "This determines whether HC will write out all of the haplotypes it considered (top 128 max) " + "or just the ones that were selected as alleles and assigned to samples.", + choices=[ + ("CALLED_HAPLOTYPES", "Assigned haplotypes"), + ("ALL_POSSIBLE_HAPLOTYPES", "All considered haplotypes"), + ("NO_HAPLOTYPES", "Do not output haplotypes"), + ], + disabled="!bam_out", + ) advanced = GroupField(Advanced, label="Advanced options") @@ -121,6 +142,8 @@ class Output: vcf = FileField(label="VCF file") tbi = FileField(label="Tabix index") + bam = FileField(label="Alignment file", required=False) + bai = FileField(label="BAM file index", required=False) species = StringField(label="Species") build = StringField(label="Build") @@ -168,6 +191,11 @@ def run(self, inputs, outputs): if inputs.advanced.interval_padding: args.extend(["--interval-padding", inputs.advanced.interval_padding]) + if inputs.advanced.bam_out: + output_bam = name + ".gatkHC.bam" + args.extend(["--bam-output", output_bam]) + args.extend(["--bam-writer-type", inputs.advanced.bam_writer_type]) + return_code, stdout, stderr = Cmd["gatk"]["HaplotypeCaller"][args] & TEE( retcode=None ) @@ -188,3 +216,9 @@ def run(self, inputs, outputs): outputs.tbi = variants_index outputs.species = inputs.alignment.output.species outputs.build = inputs.alignment.output.build + if inputs.advanced.bam_out: + output_bai = Path(name + ".gatkHC.bai") + renamed_bai = output_bai.with_suffix(".bam.bai") + output_bai.rename(renamed_bai) + outputs.bam = output_bam + outputs.bai = str(renamed_bai) diff --git a/resolwe_bio/tests/files/haplotypecaller/output/56GSID_10k.rna-seq.gatkHC.bam b/resolwe_bio/tests/files/haplotypecaller/output/56GSID_10k.rna-seq.gatkHC.bam new file mode 100644 index 0000000000000000000000000000000000000000..302bea27481bfc990b1eac3262949087597daa13 GIT binary patch literal 6643 zcmZ8`Wl$Rm&~0!iP~5G!1qmLkEl?ba1=nE3tx#Ghu7Oert}X6v!QG)y+=3G*5}++b z3-8|h=FPnC+aEjoYxnHVIXgSEEOGeQfPd`~7oebs30R~%wa{{9SMdw_H86pgdv&dHZr%ht*v8mx*lj>>jX!YPoh^v3(IfnXi&uA7H>s-T!@|^;k=l)b zOH0V5`uo3=cLx|X5Tk>QRnIYqi~BF)*80Y|uBD^-t1KbQwNCNYw(Z$A28Nm%r@bMU z)+Lm4rF(6onOSbf`vJG+wY#-y?T=_wHrlUpDbNEw3|k9WZ|{+8Z9``*t~$8+#=ZI6 z(sCT2<@LU;W2t@9x#{TC`D1HW*(-KE7t z^7wQyT)Q#O%Am(tfNpp;yyW9&G4%JgapEgo#{;j@b$7|Nsg0kl)E;N+W=_87W@!>w z?ya8>c-{Qu^yD4G)V>jV;p)ij((jIHm#V;Nb#)Ov@S#?%rsW(fz-V*fw0d|z5P9Ub zw02r~CW-F3&)u=?y8GH@bbKP-akjM9wSOmrJiO{y{kaB{+%WF?RkwB`x9+qV@Uhg) zN%Ao@#wc@XJr_RppLVrV3 zTnJ9ALZH;G$D3Wqs?FmC>Eo%>Ty(NM)H$n!wQJGkW9`kY_PdV(gzra)Ec-9J%C0&ucARoE z7dQI&oNE6KH42z_MK`nuV?6d)dNg08*Gg<6gU2|e@s7Fez-#oPyME5 z2|JUonMSUAUYsvRRa!LHlDE%uW?3pMcikZ@*WKM|(l>xZ|vw)f_2^gaqnpJ?OAXa3_g|#O1`=t8n-0Cs?xZImGR(2%qd2k^e>rk5vL7GRWE_5)6s@HYHtoCwY?)|6D9h!8#>Zj<-{Bo4|f^!hBM%upl zvav^wNza0Rb0EWehpa{N`f_3_!)7A4SYxippAP$M%koxK@R6xEaYTV3!2F^oY3=cAeN46%mR0;eisIHbp3}*`RPggR ziP|L}L7m?o>RIv#$e+?YiNl*cy5Hv~{56ElsJtMRld_#Lz~xua?;WU`!F$j5N_V&J z_V;<|V^p2U@0DlcG^s=;Q9*lOD=eoINzD^+%u`9o%wOAXNt5_Wy}$$n`jc(#ixxI< z)C-&(Yk%eB1YE1R3$*DFSB`}b4NMTUak7(ROYgLim$TX}C(B)fM(rV~oRBmsh@lghv#P&tqZ9e2aD)(mEwS?G~tt~HG_p+myoFOOdDpu4(?~%_5 z8eoX=tIa%E%$!EqOYt3HN^RhdgU#1^5+c%(swRCh$+nmdY6>qJ?`+4YmoXaZxOg;A zu`^<1u+B9`37aB=E?+>Qf~*^XAS%1gZ@&&jl_?-&OX+W-#za(vN7MSmHYyZF`!PmaFY8sh#HEICiY2% zmh8CmvvRg}uafK4bBeim?jeqj1nUWjyc722o!zl}+#bafxCT*nxSNO@A_IEPFVAYC z6V3g~JinsggoVN+q&~Tn_)>2_VvApBUPytdj`7M0g zm#BY2axvN=(L?se>M-K6v&QOUGuM}J+(0~n3_*wUc0I249KBc@LJOPP{l%mq1&pXd zHD+H-m-rlZDmzP)+Vus=SO>=BJll#BDGS5Q#|vc-zMTj-yt*yI824_1d(LndQL1bi za!X)!AF@bV33|l*rN#r}<7JII+;ggQG|1ubg=p)?=uHW>85FL|`TpKgkSZ#%Q%oVR z=7ddy5efZQU>r}A60}}L_8f+hjo}YUbA7y}AWZWwD4#0*T+=8G#$1h>bju7)Z-L)N zsOK*tskDs;Xt)UCf`+D{m&W3js43ZlpbztLmQ0~c-eoEVd$HbS*?Xf9doCx`m!uWu z%6m`s-Z{4k&IDU63jd=FH3F*$s-fC?%n60h{uU4jw&gaZzsl3jRz-4>=%6;F;rxsj zqmHz)+Y#{(1)D>rELoKG!00bLH_;MAXzKf@jqRkZ4%%w6swN1MM5G2M z7Kf_;H1PZ+gy0%Uvp*E9Z{`J$JZV%fU6#gm-G{)^^u5b%ycL6?ZN&-d<1>rDsUPWs z40I^GyGBdgc;S5I<30)1g)^}poO_0E|J;B11ly%eAx+PcCV3ewOxKbjWYZZ9-=}|- zGvpw7owj71E}vVjY{$wPzWzO7T{j8GG!cA%Us^`yF_P$uAs;SC-k@C!A}ZY@N1_(f zo--UfIs&@C$TJ*|Zxz$!H`9?-kDjt;vRo(QVA-e9C=|-+z9@))o#u()q2Z{F+7AQT z;(zBJvvFJ|$a)#R+|VSxMvG;}o=@(?gvM4h$ej{#EV&5qcs=k<1c-dgTc0q0^Q^un z==T7g8bCB?3zDajhrtVg3=Vv79(X+$HSOe>S+b8XC>_}k&=KXZ8n8i^%lzs0m6Nik z{P@(Iu0T{l&x?NRp9uiR6Q?dGKvt+Vkek0??cDsV_(-wXUcmC}r7%Tm9tY#iT{G(# zO7p8?SVYSFAd&CJxWltuET{Mw4T1B~P5g8#)otG5(uU)rQNfW#;`Eu>l;N1j%p(2G zMcvVw+>Dccs>L?0p>&{mlWl;l(k7h6Ry*(#<~YQ%^~B{8AHJi;SAyrIg@Y=4=4BBX zF){23as@O{*nMv#xY%BRzKb#O8WuitnKOGMuz|m@e*-*fxWJye_lPP<&kO!XkD=*X zwHRB3LN;d23R{jssG*r18}TA6ua?0+s`Ad|vnwOQU;9D94?hat@=QPPx+f}jF1{36 zohEUwMQS0Radzj)G5GwDn*2mdYekQOo(u5%Fn~~b;ptl*b{1SWnaHdPNirbgUw6;KQAijCca`# zS`vPxcEM)n$Iz%{2!GoBnhk6JyDB^9RB`ca3TAl)?RrCweK|>_Ry8HbX1qbPWzn#{ zr?a)Kylwa%J~LbDSB}ww#8EO6le*Laf@E`K2OZsL{4g~qC40Fv-jdeq=+7W{?2)Ih zAupvKXN-tJpQx2w_cpf#e5|{`9$<`ZLP-v06QEdbAP{M3zxqCCJ?EuH$eW6o@Q_-r zP+SiJg@bU#m?`W{m|vzoD(&xuPXVjJ+yw5Kn6|0Ul@Oec+U}_N>{7GJoT6$7EPV#oI4}-4kuNoXdaivVPx58YAScQRG03L!RWdVw$W08V%Ln0z^I6hAA5RDt8XyN^ z%d2jnkJH$SJTWb@hC|lz2Pz{hU7>=e;r=TCUFE}e%henwY(Mt!MWRA>^QTn*q3#D# z%zD3>>HrnxMs`9c>s-^1HXQQwGmoi5F^8L^RB34qo|7<+?1W$^$UO||pPL5~(zO~4 z?y;s1jTt2Mw~M$xtThf9;rd&U6l0u|+Uct9#Z&_i$5}_XjNnnWhX_vGNGomIw_`ZO z>M1%<8-Ff`^APX=aWIsMB8-4y8uDc)cblod7T*cKQ@c-`3jxoI>ntEttTXts=*QpJ zBX=9(?6buiZl*U@x|}870uSS_HJSt?CmdS>Zj{7KipW#`>*HvU*Ja2Z-`v*ZN~|IM z^`Nd7C-q7NmxRt49z)p(BNYyvr=S2=eE+VUnVMi7oFx-ao)s?M_#Q#7-JN=#5*B|z zV@FW>6G}Io=ou!cq1HB3p}82zVLZsMJn4(fs=gem4(NiWIGL2%Bh5wC2N`f&^HAP$h++!?#4|v{zIH5*rr60I`7Nj43Wm1`wi4g&!|AT0k zE9ENZqcK0%_YPJ~5mP2`{x9=&owZEdwPYK^Ok3O#gENRtd6B3|u!WFF zl+tc45^GSjrHwTKSIYE?uif;!ospN-YmR!6EYxAxKBqbT5Sy~9HTM>O-uYB&2pb8# zqI~jQ2?-$Xzq&>Cm}UvIx-!@j=-N^6C+?5-UaP>o)O^XnxEubN3WZH<=DJDe&Ts06 zG1g$8*MrR4{7+8Uv{i#YxViRLM5WiN+*1*`%$vN^4w>#3LrZ1SuoLzE-_&*X{bmW} zR1^&ND$OWkb*$N<2v!fe-V?fC21;AP0LC!jlUT;2R2(y~ER~w;{_n7rEQb!IC5KLw zRMG9QbkzL5XjJw8KVkjh#lc2DS7OIgso^F!5nPgai?Xg|4I3`Q!_elCJ*smh*^2J` zCv*f+vJP?RMfCrpLx#PA>C-q@DDR(Q*zMU|zeQdJXtisp{Q0@6~(iEz*D11!)EGrZ|rxMR0K5 zuIQa~(Pv@Vn;N{1iV&nDbckVAp1B!B58`c~(1bR0r0zfKn|*x0aQp_beCUbeKU3|5!=Lod7o8))J_|FV{TN`EgR(ZDA`*)Kj;3LTVdC=4+6 zJ$jovkDpoXZ|f-v*XfX#!l}!+CjD4>!gEKVl-Z+3&-O$&%Iv9}17(P_L<(Kh4?QGb zAG)~j5GKusD?}$a$%V58qkk0bviR$2L)A~2Zn0v38u?Ln3*@d)Z!c~eY{|(iR3FDx z0q^JN8*u7Skr{$@oA(6TRN5lkjZz?Ssv+?WFlTz-a{HC@K9i8gwu~D5-A8B@8ztjL zo*Uz7IC6EgvCyHnX_CkVSn_V*^()Ey&vAg-`1S+o1>X_B@?w)J21}{zZ}nCEa^a~k zcGutna`e0KaEIU+ficA8rwSf88(wRG?%xOv`s*KjGQ_q0ilbmAGIXQ30T~X*5|cG1 zt!7Y6C&aIC3-3T z?!_*MfKN7*W6m> zKNOgeS^iiV^Ppd9N^k97td!fLmYGCjgDvJs^^*tqyyW(hE!5UNigH&tY-J>s3btS- zQGpt@E0i_wj|-v_vuF*}=21!uhWL;-9lL18HZkZ{26$LCGK5lc@gg;^lln~fmg*~i z6)CY>E~&^|tmouGB7LdqvlsKER?TCeGz2pjBpQsf1d?HT63g+b5-Uz|n((VHAEM+L zVk4fQ#wT%ZRK#`7ySya*YwY#{Z#)@Db3EGT8g9HbUu#SF+8CgXO8zjUul!+naN6#M z05?rMp%eBbdws~jD|PDItq=C141x7NrBO>^y1BY1$_(2$GV;y3Wr8M#5XlM??W6q* z&#``|XCRYsPsK3v#L(tvti4SASaJT1Bpkb(Kouy@OY-;L`GD|H@wv#8vL{n8ZNB*$ zDW|>+Y)qw3KNJGBQpi0*o?Mu7Mbw}Ef!9>$6ddE5o9UfAKEM5G{dS7h%uZI6|4HmW zfOPCB%q9sQeTDjO^D~IO3B@@4^^p&kkIG=CCU|jM3O!dZIOJ~}fY)bfS(Z)Q-A9G5 z!uV|O0BW=hIuL<={vx)8n>q3dsMnHYeJ}!EQRCqh^aZ+c$Y7x_ddny?yW%HC!VuWZ za(=T}HPsZ-X*_u;h0W&OM#JAZUikRTx9C8nZdDVg@3l;6cS%15S<_cCzv~KT^h}v| zz@7mux8+$oZaldE80_n)m1bXw#Sn?D4Yv~-*28ISPw{$3p~)-?GEz8U6w;oaaY}5? z@0e36wuhNUPYxJ3Ty^DHzp)HVBO2+LT2+k_dD`GEK)AtrAbjCwwVbpd8<+q-2)*#e zTrrs-nTiFg`NGVK(+}UQl*0yTi*FfsCkNqgSDT+I_Au4%h%2ODdJU`UgF6oOIl*!(B$jS&>^n>P@L01zU|6%2BwCjag_;uUL8 zdwxR~!eW6LgQT1|t1CA#bO%MczOjd++7Wqq&%W@Ag80j{#pXB@N=WR6URd(1;H-k` zcFbd_Is+Dd<@bwt(O0|8zRcO;6V0W$kxCj-4-XtznP6rJy3?Bh2hkANzD+o@!n|If z3-P}Bc@!Eb_1F}wb9hzB62YOensgVy*wr+t;L>xw_T}-VA=>|{cyh%Fflc7c5S@}1 zW1H3)KrbzAcuAfj$?u9?M{PR`e$zXl3J7`Z95s3%PJfnQE91{@jy356V9CYtKQ#~2 z#d}StXG7T00KwDV?{ z<0OSA`@DOj?c725xvp<*Y?c+_0>-|AFQ!TE0eVoVrsxf^yyL%&VlW~iX z!%#OZ))mH_YGyZO-uAZ4F^xvWf|Pc`9{+blyu&pTrt+@E8`(-+xT zXIl4oA9H@;;n<`lJ%6LqYd7A*d&9l|pv5+mbgd;dZ;jrzy-(htn+@I900e^@uoyVk z2PdE5#-H?xbePGTJs&j$G%y_6mSId>r`({PJAc5F_&o*h%6r$q%oYL{DK~bp`!pl& zyo0_jGGKKQHLf91tq=O!e_K85;mJK?eYx%*b9D11u`9_vvDFH%j}api%Rs(?@1i-i zq5wNlZMAn4Lqu#3^F|4xcNEN4B>%L;PbCbcA&~*gXJ_DFu|8svT2eS1<<|)tO?iQr zd<#_XZ%-SaNfMEQ_GaYuzV2r$1(DLzCZupi?irLS@RYL^Pr&0B@1j&atA=|A>+mC2 z3hH#i-)h#WKBVPf3R51c-wDK=?}lAd2F0PHvu1+`A5wO21%lr8)5sk>JFKD2y*|)w zgek0%7QI`V0Vqvng`uT~tpgM}XjxYSy_oDkba?|aOb(QWjS=q^bQ`63D8D$3*Ih4_ zS;gHl1<&m3a%tFysMzQ~V^0oxgoA6LGo$$%$=#Wsak#y7u2c$dGhW4@$)_Y#%$A6_ zG&c)DOoP|2`liDLC8An`6gi{}KOv+ek~3wALm7l~lUZ%{8OX3iAVnd`)A4omQKQb( zMquZpr%)tinya6}#M-zXOCzS`RqmH!!vG_rm8A^kigYU|?VZVoxCk1`xZ41xNq^(`5+lH3=e*E}jpOhLgy8VS)f#I|bDM literal 0 HcmV?d00001 diff --git a/resolwe_bio/tests/processes/test_variant_calling.py b/resolwe_bio/tests/processes/test_variant_calling.py index 408bcdf6b..241774a79 100644 --- a/resolwe_bio/tests/processes/test_variant_calling.py +++ b/resolwe_bio/tests/processes/test_variant_calling.py @@ -193,6 +193,8 @@ def test_gatk_haplotypecaller(self): "soft_clipped": True, "java_gc_threads": 3, "max_heap_size": 7, + "bam_out": True, + "bam_writer_type": "CALLED_HAPLOTYPES", }, }, ) @@ -205,6 +207,12 @@ def test_gatk_haplotypecaller(self): file_filter=filter_vcf_variable, compression="gzip", ) + self.assertFile( + gatk_rnaseq, "bam", output_folder / "56GSID_10k.rna-seq.gatkHC.bam" + ) + self.assertFile( + gatk_rnaseq, "bai", output_folder / "56GSID_10k.rna-seq.gatkHC.bam.bai" + ) @tag_process("snpeff", "snpeff-single") def test_snpeff(self): From 2510fb16b3da7d9e7952f81430a8fdce649f06b3 Mon Sep 17 00:00:00 2001 From: Marcel Levstek <62072754+marcellevstek@users.noreply.github.com> Date: Tue, 10 Dec 2024 13:53:06 +0100 Subject: [PATCH 2/2] Add ``--max-mnp-distance`` input argument to ``vc-gatk4-hc`` --- docs/CHANGELOG.rst | 1 + resolwe_bio/processes/variant_calling/gatk4_hc.py | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/docs/CHANGELOG.rst b/docs/CHANGELOG.rst index 725cc539d..5638fdc8e 100644 --- a/docs/CHANGELOG.rst +++ b/docs/CHANGELOG.rst @@ -33,6 +33,7 @@ Added - Use generic permission filters when filtering variants by related models - Return only distinct ``VariantExperiment`` objects - Add ``--bam-output`` input argument to ``vc-gatk4-hc`` +- Add ``--max-mnp-distance`` input argument to ``vc-gatk4-hc`` =================== diff --git a/resolwe_bio/processes/variant_calling/gatk4_hc.py b/resolwe_bio/processes/variant_calling/gatk4_hc.py index d362baa92..0bb905f21 100644 --- a/resolwe_bio/processes/variant_calling/gatk4_hc.py +++ b/resolwe_bio/processes/variant_calling/gatk4_hc.py @@ -134,6 +134,12 @@ class Advanced: ], disabled="!bam_out", ) + max_mnp_distance = IntegerField( + label="Max MNP distance", + default=0, + description="Two or more phased substitutions separated by this distance or less are merged into MNPs. " + "Set 0 to disable.", + ) advanced = GroupField(Advanced, label="Advanced options") @@ -180,6 +186,8 @@ def run(self, inputs, outputs): inputs.stand_call_conf, "--tmp-dir", TMPDIR, + "--max-mnp-distance", + inputs.advanced.max_mnp_distance, ] if inputs.advanced.soft_clipped: