From 37645ef2b79b2baf2c13857bd6713a5bfbb2e2ae Mon Sep 17 00:00:00 2001 From: Ben Jeffery Date: Mon, 3 Feb 2025 10:29:17 +0000 Subject: [PATCH 01/10] Large scale inference docs --- docs/_static/ancestor_grouping.png | Bin 0 -> 31844 bytes docs/_toc.yml | 1 + docs/api.rst | 26 +++ docs/inference.md | 2 +- docs/large_scale.md | 136 ++++++++++++++++ tsinfer/formats.py | 2 +- tsinfer/inference.py | 248 ++++++++++++++++++++++++++++- 7 files changed, 405 insertions(+), 10 deletions(-) create mode 100644 docs/_static/ancestor_grouping.png create mode 100644 docs/large_scale.md diff --git a/docs/_static/ancestor_grouping.png b/docs/_static/ancestor_grouping.png new file mode 100644 index 0000000000000000000000000000000000000000..a79c75b5a3bde6aaaad800d8d5a983fbd52dac34 GIT binary patch literal 31844 zcmeFZby!tf*FL-vK^jpIrBjgZ?vn2A?vmQ1GzdsYN_VFM(yf9Z-6bhVw={fn<2g@# z?{$5@_qx8{f6wu_Wv?~oTyu^&#y#$F&!u6C@-NX*@K7KS2)dM{m@))%4+??64I?3f zD@YcZXb{NL8BbMB7iA+TiKCN)xs|OMiHnD$8Ht&@l{p0BK2x4z<%G}uDfrd|r{(_l zdQFV4Z|VZ;u5RRG6V0^LhAZs9ky)`MBhM|N2{j(w-a_Zw&Wc}nCCOCJ*qmIgW_S@a zGu>W18#LKJ_watLeROcW%edD>_=AyrFl_z!qJ49V`^x?HI+%FSXurgp=Uw^fgQH1I z&)57fFP(BG-MvR3$DVBpV+&N$2PeYrnTE{;KW*xMd!QUOXDlrE2Kx&ZhT67aT-M!k z=G381N-<|($F4KypF(19!es(+{kz{gjg@w8O|EQB?W6QSUtOGSy!~ZebUqvS`de>z zDdTj?-cWtzbnk(W-#-%8Khnir%kE3tgST5ope-7>jZ zBi_+wSL{h$zh6EIUwNK1l;6)>vx)B;tWvXHcVwe_l7s!p;PqjX2omBWh>QRVJ24ySzm|k@uq)Dg}FO$O6Tc)FWK( z@fsAqr@RsiIHurvWw>-c9+;$E&nu8cEUlY^~rcVCD|39|Mz{%M18NV2`}P68;k+Iv)_w;%;srU zYg%(<)pDsmQR3%i&Alb1^g2F~gwUDA?usV6S)b^~82K)TFA6iv{U@JJn~WuIRajv)&?>Q-h!VwO2AfmIeFMR5ovzoZHrSn#$~k zhdfz+Z;Bv{Q#D4LNU+^>sI>_;wOuSKWZrf?*5ZfQ%bIuC^MAMhuK#UfQ{WL@>F)O? zDF)F6`t|i`{4v6&t~SK1zyTGYb~)HZwQ*bd=R^Rjv^WNZq5ekk{o?~A_bd* zF1Kv1rYYwHlfOcf9B)nW7W6u=S%p+hZkDUY6xMw@ z40-t-9z~Urir6BFr;^dc?#Lou-Zy3b@wxGfifFtxtFW_%<-V6+MweP`8=*{RugZ>| z=H)c;UTnsAUhiehI}kg5kSk2DetsZeCM)7vF)obO$JL8REMRiqm*LQ@TYFAA(q-=H zCW0eXILcW{b*`CwTfbtyhZDvSOGSocFO5fvL0=t$>OEt}Vz%z3;);hl_< z!ceG@Oq8B4cYl0pdgOOpIZ z`q^KC&vx=hM$jI%G%0E!E(JRm2SgH4`yU-__dVko}C zX^NjM#XIfQ1HVYm;McQvkX1xdJ9=}EEX64cE1oo(Z@ri1lcbC*9QARZ;m1APiV;2@ zT`q61$zPOQh*BN;O!)4jQxigzSdbx#B2B7gK8rxim$s~%jcknv*As%f&hXiBL}LZ& z1V2I z%Nozw0b)X`yQAXh#9{J>=S<^)SG6j`-|p4Fl3+Ex+4x4@JKt zA%3D8@w#*T#>YlesU*hrSUFfPSNUsRkG1Sm#X9&A;b|y^^rSvG?r?FXF+}f$f0kgM zs~@e%!#OTBr{!cM`k>;A8$@hwyuU9~0)J?t>hV6sW<{K-N*??VE1}K;2PUI$3ccZ; zaH}kZmhuYeeR|CXB241KO4rEZF;}Oi758KWQNR26VaZnpc0#g9=%YueEf;9v3fDXo z+@LF>@R*4`kWI?tEt_;A=eM`FU@WgL5%ohyXi zL#L=eFJ%o6SN%FfsLgiHZW0odrXCSF_Te@pE(*2IBooVHTr2aCu(vR4BxcMOF*YYP zqc$&Sv6sUZ;(B)vE$^Ns^7@eevBwndTr(6-JTDfntj7 zl6rJRE~2NI5{1O{lVr$6LKI}8qT?7r+mvW92vx@ZJTtzKE-Yu(A8;HCU*ACvb zo-|l~kWYG6y7~b(;&FTYZf}4KN{zFVsYVYr>dWjysUOlWK8I8Uekx3Ix_`>OB!3(PEi2D=5iCYk@3B+%*2j}oP_VBkxWd{!=ELj_&39? z!^3ALzWuR!^XMj;GZ8H>ipb$Rb2RVTGhejPL%6V{i+8ch{6VjxT{g*izaGSQzZ>az zNjf>C_EUtOgy*N=J9SxWJ5D1ti7#`!D#Up=Yln|ANs!oH$)>m%zZzq7EoCa2nt)@< zBlC%xIh>>v3O;>eI&qKelQJ)PzAQzIHj}2@v6HaM($R-bwMnEjVkbC<@!K^WQ(fdCGsMN zsn5@+!u4SbkaupPgh*#pmC*OehY*A=TlY-yL`YhzSS79Mk{6Rc&PsB2Ms?#EN_<`vTydcTMgH*CWWU1);%GueBc(KYK`W2n% zd(u#T#m5ERgwv`@4IH4yJo-pm6!EP~E)OsH$8WT7uCKivBk%5sBC0%Nn(w?nMr8fK z<+-2qI?F&s$>;dxCyM3bGHhaQZi-F*s)j8qZH+Nylq1osO4U7GO^i66m_H(n`5%#B zy42y>-y5~OxQCyKD!BB6`8DSzwjb@U`$UgE(PJgCbUt{qGg1OI!C3UXx3NMWvb`Fc z7u5C4`oWWkQ5Ii%rP){QkP{K}fX%Ft?09op3KYz@Ob<|&@FkJhk~OtHAU}aG4oE|Y zMs%v(&S}AKP;cf4xsS}vH~>Evrp*?X;o;Xpc3N)x2uW31*Vqk-rRfugo4Q)0crZQ_ zX85_g*jR|krVY;h+vwTA5n*(a@eK4OH&muBGjnIej{`YAEyk>P#^0#F#Jmf8J1TNA zQ)a6vlb6o?YpQ4>kj~6a@VirdW@m!oTV< zP33_jhlW<`^D_+csi;wHtYu!|#C^LWgxlp(ozxbJ?ZBNzBR_1uOIi^#eG)$P{*Wwi?6}B6q{xZJ zVhqJq)VRQ(+>`T=+LAfoT@J+gl?*(=eeC!O7VFO72oiBWCHwdd+eh9MV?A_jk1pOw z(Cb(`AI|%ZDW$&5Ko#}J<3ajJ_o{t^x{)IbC&ipJ$4`PKlOmg9K+PxhYaawxrDN3C zt}F{Tuxy_EAp?n82!~53T_@g(R;xgGO}HZ)HC%|KVfzT))AIG67cn(N`5ad!7j4@o zR~Dal^uPC$7Y8E#Qc)tFm_%$u2{P-+(L@TE`)MDq!i9>ym9NWVJKpS?xBDFy>R+!CQD*gRSve>DvW*~%N z^kX_>m6>QWqnN|Y(+5(>CB#7@9^Zo&UE?^PmfLQ^bEGnKa&RgTgX1#?tDff7b}53VT2s_!@MEPsa~vd?*_B<_&tufNjn#mP3YK3_%L%qZ^MCBQUSjQQn7 zfXeba(LU}C+%!wGChpw8m-dln(;{-)$Jp&SaZfrNzZS84Aaib3xKKj7C;CXX+Vrpr zZbfeeWoG;>5=Ukh$uINIKR9kgN~VJ%CYQ%rB(nSiQFG{6d$oV5SE$H)09H z(LFJc(|!BtJ_H4uE!@UKvl(91d12E$2h-VbTS>hjmRTF&;zaI(4qH~iw6)&F62YUV z=NZPmCo7VQE|-XC+;jb0@EwE_Vo$<^g`ROr=_G|f+h^O!*(u`Y4+w<=n$+-fv~;|i zao63_vL2_aDZMLVH^gawnwHi7^Fu3E6Mb|jBLduhzuP<0Lwqhq%byM1RC6J?$zy{2 zy?GmRXur_|cm3&+8E%Sl=`lBa=@@G=`6JaLRFP+fH$GucRau!uX*Z57g=@dZ*2%K6 zWwJMpT){Fnk_mT@`kdt;Cq5RUUPci0^7O5de-Y;GHU?YyJxgIYx!4%;%MJ~{t7DAC zMfWV0XfeDh?i#|H!PS_KAXg#cRnx2sx2mwgEmbzpL&F}S#ZRab4N|->gJV|P$(bf` z4h?VAVk~B=wIhGycs^Ru-*_<396;VT#z^Ui))AoHXA-8!Jl&Y;_dQ;ty>V2UPu4xz zrHycwOBnYXrWLZ-rPvQ+fri<+amvzH81WGMF@0;KE$`^FeT7RGyg4npx&LHU% z{`jk>UAp(}vuB}?vC46!K2!E+y{SljfGAeDKOb_F>;KG;zJ$^0)B;lQ8t?I}_T|Bw z8WW7yiBu_`6L?qrQ8uCe@jLo*_O=H=ftV5P7dCt4ONyJhXA=(ca3?`=^KD z#h*2|R_xIu9$*O*A`#eC^7oanwYsHZd7U*3ZR~o=4+Ia~*N+VwaR2wNEMP#;T9=dg36ZK({1v)vm?_9`Q+^7`}U) zgCH1m+Nxb=P=)2XO@W&6Q=jZOo|Hm3qL%U`pH+Q7Pv{nhRH*H9PKzhKS*kyNFs;Cb zTUzDT&3wfE)W*;l>k69?;TWSdl_I0JfnT0p9=Y0h{yc-Wd4>i5O*{Lp7H=rcL9~>{ zDuPyu*khe2b-dD*=HVN?)zo)anyKN1k=p2AH^vg)c)GYuj}_d1V0CgY$CLs2Ir$8$ zF*?0fZL(N+cbEiR-@{nc=>l^^=fPe>4s92awZ}~&4I-<&D@e(3$MCMQ^r#dJ;;#-) z-qAfsUS}4TdW%&rYwoZY<=*aCJ~DR5;=cA>xUmkdFJkvte5>dzQ@Wr0M76Fl;oWOd zO>0WD9s;XI)rST;wC{Jc)JNtEHc93&^bWX1KNW4uaKuEvJpG>i7XLbhf1D^6LqLdL zNC(0zNiI>#I;u?`LA?JhR{f16K^m1CdbmXHiB1u{L2ZkqaV$sUsmpo$&-!$i5*MzS zbEFl8c%{$3F81fdag)@gd9QEEzR62DL^wqVA!%20Z3Q6T6Cua6mmrIMrmj z_xJhds^eN{v~Z-@vvG?sc6!OZ-gEIIQ)dRG1^igEJeq#S%vPP4XuA2nP1o2p^6R@) z3^kpa&DWN1syceq;(Ge(Io?npE*DN9h zuSH%;?@LdFkSf6_RV>yWb2Vk*ujdR_K;TRZqJqS^qn3o87Dly-Ghp<89JhuDB&sn` zmN@9$dmt z(5$_mp*C*>&vC_6Q5B=_h{Kc2pp)tf*V)y{*3gz$5aMe;Tf$p*eXK8VhDP$_Tk&kv zh_||cD=X7DevN<4vW?PigA`dyKoCxWw2NR3$M-n+i9$Scs}mi+?50G6*IZOqXQa() zIg$_wsY}Y?9Evcvdtsjv=eV(wAZJSRZ;9x`2)yP60uMd&2-KOf<25$!W2S4Gk4Did z&F2-N?YusWe)5c3WI&Jw{pnM-6xF7NbG7z#Bb$Z;J{es1OXne?vrmrS0`z~?!=SBHn?w1;tgo# zX-(kBBiKq*R8dM)^zTy+aJG@=^NwG#OA$L*K_#Cify|A1IVPJ&>mkmoF&gY_ks=&z zS0;}QVm2#DWnBV<$$&t?2(<^e7J7ub2*UdhKI=IuyS#g}Cxb$7+_LXZygu*jz9%45 zl$31l8VR3bpIR)Y``uLIOFOOi8w4U!QYZNcSqE1{D=2fB08RHFmgPR)8PvE`D z4(M2AQ{4mIJx4jX_8y07DXE7IhIPv}k{pmv;zTcS;p#qdH=mL%Tnp>^Y@8jLz+o<~ zLY_euRgxlqF7nkR;pmA)xvp8i*C8&V^F=Sqq!l{3zc`(601|>)ykbb?(;c1{Pn2DV z_7;hqt{*jEJ0-ODeDFhI>!~2V!hAhA(W@ZuF4e&`biK?)OfF=IYdJ=+emV6DkIL`% zS^bF4jWpr=bBdOUk(`%8miYDc_Ex<_jt%f)8EDh-dfyaede;3|Dp*v!Jpo}X;5sf~=p%9Ni>gI$hE&Qa9N(n`|P z$xOvlUe(0Y#)R9HOi%!Y&z%PhU}xrHMB;8|YwyhC&QEqXE)Vz_cA1fk7`fBiJCnnP_%nu>nX`$Lm7|N5gFOjsOe13lR~LRV zGH{>d@A=s|%E|p>czfr+tN`r6=x*f5$jrdRXlKXxuRWYy#NEIke@*Cr?BT2mPVX3% z&72)volMNc-OTJ=$p3W+Q%Z52*BG>tlj9L{FmZ)VPfCoR z4EB5;QwI|(Q=YpYIoP<^Ik;H3>Di3f%;?#eOw8zwxy@PWjm+81nM{nBO;|Y0{$(gB zduJCTdlNI*P+)KdD=-c>lexJW3zr!^H}n{A4T)O#i&1XlvwR4m$9Y$y(XFy8rWr zs+FCYii;7jnvJYoPj68#$;wLY+HE1Vt`?dV15b) zxa$Gd!XxTrX5`}Fr0U>c%TETI5(%v5oqLn;{n->rD`(Kc1LpC6`MiqRt3TiVgn+Hp z-7ON5JGbRAGWqimXCpVWKjsAe{@gOLGy)1Ci0{7~>hJ4T|HEc+va@oVGO@DJvlz3o z)3X_~n9&R9ns6E2E#n`fJ3E-WK#iQtge|~I!D@g5-K~a%`p%Ru z{=GHS(hO!NCKh&jCT4mTHdSVJ9`HXa8!ZzP4-*p^<6jGAghln=E9PVTf7yiZuED>+ z0Owjf||5fAvde{GN=0f>r!DD6*SP&GHGYCaG2%r=~G?sZO2Dyd(%4#W$16Pn8CAFL( z5Y)%8-*AwWw5Q+XZoHAJa< zaL7TlWJybs4=L^$he=@J&{%#cCDY`?3wp#@=47chmZ@DLp+xuP)2C%AewreyhtWt* z_k$e86|rPH(UD_fOkexbO;@d^Q1E)SW<R7;V$?Ps)q-kOBmoIh!=cx5`qsT@D3t`WDX7j@pF9e_r-s<`HznO(&nEX|D(;H z!S62qv(0~Yyqmy(bo|dY|D4r-&itPp|Jmlxa`5Pao);NtX(dlo=#@@QX(AxJ3x$J3 zKE}q*%gQ1N4ho{?;?g7~CH**U8@2Z&wf}h+6G2j}s=j_jcsNQzVq!^iv#yz$8C#wd z4h{|tHMQdH%{9)WM_lhh9W6R%YHipphe+`5K_JA~#Ke{6{V6Z#>E*1ftb%wYKYaKQ zyLyHL9K}1Z-+XjE)3;(Eo%g|ap6cjfS@I9GPau%iZe-{UIUiq|1TgfS1r%hYq?ANN z0$5mB?iu^oAwGy5-`Zd53Xh6fdk?nAoBr37?M#DnA&=YkQ>HE1!E|JQRRQ#W?Am!0 zvG3KpKco2$m+Nrifqf(v7e*-IezhFb+1W{51c9_q24EoI6xns%ZS*;^=Qj9ZX&`#Y z8SM98JLEy`)^`tL_%0k}<10SBPqF6B4vh$}`xI@jRZWKu^5fg~0cRT+#pzpQtr zx_bzgTi>HyzUukA5%QM?I_E#41qWFFYJi8dFFQ__C7UU9C{lwt?qWc(uxb#N2GVB< zo6O)KRcv#?p`%ZNcu$RB&E|KOiEN{JiUM5Drqx8@cjmsF zOucidUX$D1XpeTc3yD#~ZG}zhk;oBK9^&G@B!@r@rRsS-zUv*aUYVyaudDzs&j*1~ zYdtQlrKJ_bTRFQwMUM*cqYRM$JvS|KRqB4QRoSK<@g!p-(zLD&8Cw+$5!su;t3y5h zF^Csx?|!#ztNNoSeMyXyMM}BYxVTiQ{RLkla5Yk?$jL zzkgqZMJ`cPRW-ysm}Ky}c)V0&>-uzIawS@@;$&;GiI=H*@Ph!Ae43)B=F=t%IEdp1 zob*c?k9EXucPIlU4@at#v-5bi=!5Q>{e8#a(9l%RZLhtS!qSSyUS@i3ZmsonXokM~ z5`vVnxOgy+>t5xc)gN<$Z{XKJR-r=E> zTnd{Mm?_cHz!nGJox_X8;Ct?pk&}a+{F!*NF;iPQQLeK-&0$szU@8L@Isu)}&G zCC#27L$02wvC3UsGz3F($uRXfO^pq{#VA}|-g!->3~ zLBYXR?S62*-@m)amKGH$nwu9OViIk$+x6OOJ#K{V$-}%#y#DB8*GxosczjaQdVPyY zH^$G^RqE%@CDM7J>YkpBU!#dD{ox;24`m6b@IouXBO=(1I`4O;GiGOjY1-7Uyl;w+ z#|{n&p`oWQ|M8C$w&tW&WDKz$KWQDhGYq<-q_RP}h1V^jXh+|FEs3L&o>T3Y*ea}X(fPsEzH z(QtyAAi^-yPgI>N)|OQ+zUO7dCt86;a6h!H3=;~1VLLIHHADms(w;-o`GdJLor(z? zyu1GgDK&6exl7y3_Go^7Q_lKc|q0fcHQK9>d_p&+%pb5&2!1 z3`6MxpFRdKa|;7FC@h3@8Q0~~876nX1}=kK`wj9$AY3p~>C!y-zF88V;)?Y;p> zqS+63`8t@;#3dv~9s+}ES|T)on?xYGB*A_|I@8Z5%$?ICG9tnZCum$fZ3!SvsQ!rR%a<=~VB^7ER1_MpUWtZCE)f_% zd7Vqf^pKHMX>q`Lan+N3X8}{xLy-fYl{k#6ua|l}aFS)LtmQ2?< z5Jv(FM~Cpr3x-%58ymx(NrwtOX2;r{dkZsQ85;C6R%iMHMMXG>pW0GX0LtHA#FwJ1 zweBoSj0jt={lYIUQQ$RC{-h3j31RSMB0DRq|dYqbFx`uwi1&CWuVTC8c%nTGI}Pj<649!KVp1l+NX>-~2QS1awd_l_bV zG|02xM!VI!K_U__ANbC+EeE}RbNvk8vuDr3qoXy%#DZeu;%c3KN*7-4HLMf$C$n~& zY~9f-A|qZUYfNbNkH@=Y6S=EZ0yL)|SSC>s1bhYQ5 z>#n@d?`aEms}Tyh_e{dEYz{y8e2#}-YF8Qh7bs*_IJBNiI5Z!G{P5hO2cWj`IqY$G zc=*WoWY+n~3R~^xueJU>MpSb8o1?`mVe!8aIC9=hTU)yXWCmJx_F8MgfPnavl#=aj zTd9#WZs!{3b@}W4E=kxUvGI&)EkB(lzNYWRACNuiohQ9uOwNMN~f;yY6 z+S0)vEnaGX8Uh|!xB>XxOtVMzWDV`oK#<$c%-ya;sO=`Q(+R9Gu&^jADJj_-wCUJ4 zNK6MOjS57-qA4E4cHVTqN0UNyY zLGgNLg<4wx*?=WZt^NyMkEv--~j9%Tm2T7+I$6&5TpX(*y91-yW|HP|Nd#AwLxck#NBS$AS5+7)Y)2E7LAOI#3v@I zuQ!*fmn%z4hp)K(+#Nbt>9$$w494TMPy4++otw;RK_e(=ASETG{n1pamMr4yQy!Nh zm>atMQY_bPP>y&)A4_33-&iZmA05OS_a_8sO3i4SE$g-cf!^XLusK})#xRZU)%f-; zT7G{1#!lVBgbZWLYSI4n*-EtQv{@1s42Yqn+YPLAyA4MOfc9&DvQ1ta%rq$HS{d}c zaSlurul+-L7Coa(cIk-L2h_$ts-qwy4--P?vSVXo3m7BhN?*fvVbQZAK{Pu^Yu0(a z#9FHq9gj`YX0k#pi9ruAP9YdFJ^F8~XuY}#wp5xl*sMnrq6xZ{5N z`uYNES2qj>P`agub03xTct@YcRy6JMNR7n5P$3uv_=?sQ({HZg;lT~7(A+C~Gn&-( zc!r*Sj4obB3F3WxXNN{d-04rX`Gcb|uXg#0i@RsAcJch~U=J1APo|@J!Ply+NGi49 zd)u-$kX|a6${`i|=+PrK*G<)Q5SRr?fPMeLoQSUZZ`N8<^m>h|$jHdr4NmzNCtHPJ zo>pJNaW?0h>IANjRYv@SahOQ-B&`?pmoNt0cQ!?V>asfC# z`rC|*Wyf5Rj^R>q&o@5J+K>L;nPoTac_=laVc?TbmFcYv;%LnowR>*75|n-EJgy_V z^G#N0!2bCkkAh5gUf z4aEx0Gv+i>_Ord~LVs zk7hLV<@&~oJG5s_;?CX)u#5Vuk22O`gG%bJrl(B%rlELVUV{M zjk`s<;9JoFP>^;pjn&#{z@o__!=qrOwW^A#)^1k4qod=ldb*AZ4MOn7ci9*bk5yGt zT8jOtq-gYqXar_7(NwnlP7%2HMc%!0x!a^buk&dvhE((zpvgaszpdaKOF?Qf(%--4VhEkVrlu5s!ZI|gV z|18(3+XP^kL}52quLO#hKR^rMIU7cj^5giGC_7a9)IpIlwAc|?YcWXFax}ne4X}v9 z>SS}g@#bPnjxjWYmlc`o4H_E6Pliy?zRi#tJ2a7AXEPXAzlh^{DU{e=(?0t}|A&$+ zGL!vUHUJ@1ndsu~?U1L!|sSXDdOx8YOe< zJJ!RqPFA`QAwwR=b|T%*XOTH0xzPe89!G0dH&^E<{92A=Qlg<4<6sCRXoczYGa6eq zK?sS^u(L)#Sb@t@Fs^zk5g}oDAk}Py<9Z32`LMl8;QR-O9|RLB`Vu3HUfz1LBK`32 zute?_*l`^ASZ8{vma?KS4p^qpGk}5mDLgS>xvWO=q*8cXOVrCWNkPE~0(k3W(qNMR zbS{^7uX*&?=_$$=3R)Bz7>B-J))h)Ab&zgM?_2VZIsqoU=+9Ds8chR~Yz(bu#)SNy zs=&Rn0oQOH}qgAk}CZ&g>?)s!9>Iuupas{0*;e&T&iT!x`nD^Z!7E2tkUs z7pu-L2hFRk)(6rBUWJ59gOmXZF%>1HcK{p5a>To$Wb)Prf&wLjxE` zVP~g^nyM<-JQO(Fquu6oVIa$p`1#B-3*(NWVJvtbMcUUdA;Qu)&-Hy{P}KbV`SZ@! zk2}kF*ira_wIS*}j9q&Z8z4mSxE!-Usvav;Anwl;TsSt_t6#xDC*V@V=dc~RxxRE= zY=`S!06+W*+h^Z%SXG~nser$pJ(Z7(zTe~;{`T$I;m^MDW)Ei^T-;KHOaVMzw=(TI zd%ZmqP&OCTExgtN3%41u$=d1(y1MeV`O9gm3oeI>U-}(05gi--pyoWWbtWf zWeKzzJoCQ$i^7!8pO=6wZv^~syg&|LWzc6*54y42v^UWij0zJV$_?6nYz)%5^?5!V z{|KDVl`Fe4r+MEMEP$4oS?MvA{Mgh7lhXM%%VDzo!a|!L%^uS7@)JNSV>kYcWH}3SoCz?a@v;*{&Y6hQR=n zQ2@ndaR)4+&}&u+fslr#a02}Ulp7bz5sWVV?2}`cJkaf_r4;uLb#rsxni5ch0R?g# zL?<>O;m%nqtA%`SqRKz{KaC^;!V7kGwdv{U*t~#%dxYFhf54F5pjDUO>o;5N6@y;m z*tx|M{=Z5rhu;5z{OjA{+SuBbaX9>-le#$QCU%{*tK&v_vD>)CaI!O7*HL*|nE6`S z%BpzucsTdvM1GxXLcU}su!rQU z08I2oQqAulaGZ7iqtggKdnSj^)&O`J3I>Kcu>9yTn0_r_liwEeg*Ud>5r$6c7L?!fIUzJ^ti^X?+GkHC@FSu zfwEbXk&;q8J3AW)X{B|i_TwdL00dvn$tx%%QA838>Z9T^jMcwdsR4PnKaE@E{QO*6 zT6zQ!BH2h2jZDBPEkP*4OazOU!VFJoBO@cvp9vZ;t^=d0iRjB~f&i{TmBEI+;s)$Xg_Br&{OdyuJPweL#t&Z0EXWD$*w1Lo? z#SE3gH5YkrX7LBie`Vh0q05Gcg<0)?!*X%WJ8#I4uJ%wiomUm@f!{g z*K|v*XLwa$5O9%5d9J>D>2mx_sWY9|F$7c^-btLq&UIF!&%3h?Fa@uwK+TuZEF0Cp z+R@0W3Oi=0N&yX$;^Xs>A3y&UfXWbK^b}S>kAqd?t*3K4KL@PjP9kxv4MxSao+#70 z07bT0_4F|a7FvotPrmADTfjYXVFa&UdoY9l@WbS znNd)_PAe6h#3a#$x)@pW)|iW$rtw0Ag!cDO(;3l?uYlqU^5?FywRMTIiVC|~FSgI^ z^(neGB0sd?lT-vuP6v{gQ!C4xgOPk0cF$u=kZ`Pi4Q0pXRkmJ8!$jN%#NJsl3=Kh! z>(YtXs`np^RslsSmJJjmAmZc{_zuKsH%=FP+cq4#?X}{{sEe2aU~-Ni^vZqwBdKNInyU!|9St01=z_`@h?PKLEvTtstMo~adoDL(S2te(FvX@PhBS7&dyfBn zHBMSV0nqnYH3Xcy^M=Hugn;m8tmf1}Zy4k|dQ0TK2q&eitD9y%m{I;hrQqSNZhv?{ zcou8QFj&^D>#v7)LlBT4M+@XeYph0HR(r6dlub=d1CvDSS&aZ!o=D3E`{Fy%>ND<2 z{EPeOc_gxH0$0vL5D<|^!I=cx*as7}NuU|>&jmSJ1auUGf3^f19{Emj#oY%lj;?%B zD+Xs@9}8f++jg=XQ6mhW#`Q`B*TDM&D8wY{opAw!>y3!Mn-28j;iw9rE%7liS)eMf zw96S9QiNqH%DxyIx5GxQR#baQ|k;4i|Fp^XLKf4>+zH7!2*6+ z8d}=X00h+IX^TvlG(G+s59kinz{_B2vLB-RZbQkIheu7Q6fDIztY^?l&4B!zTaC0d zKmiL3zXU*>bUzXR`Sq@RY9|DIwZ>&bnUI$o$wd?s6LxX~nOssVkeID=$b?Z#AaaR< zbBBi4=Ra|!VB22;%J)>@M6lu?0;{6s0BV8A$Q%Paq5JsN%2%szv6KZ#UChve*FpQV zz+9SJaS8&%z%fhFXGdDRpfCn=S6)cL1{o4sPk)Ke#$=N=W1?XN&K*%OF~Q!a;2_{^ zZ-YWJAp{-2pzZd0FKQO-C{V^cH;W77V`GgyxwjM9{Q&|%-d}XOw5F$Q;xN+w14Thf zOqt|ZP*LE|k6>)ogsCndTLG#d04s1VPxmH4H4G#qN+`A}fh}Qa;w6fuEyVBE{pwYv z`?FT3iw`hR1th=|urcgs&x*krMkFDxIw0TH3$6T11GR!;WqS=a4Q(+Yh>)|2qd{mZ z(AjFF$i-g1-0FBhY&*h&1bt4xHq08Yl<)`uaw}y&WXCSZo4(=xyfyu z?^-vCj1zYH!I9c;B|8u5@5&i(&N;`xuaWrRsKM>)3n~&+20`IK0Piw#@bI@k8G(RQ z6Dh^(w(WS%f(+FU$l!(eDHNAOqYNiW^cr1A=gv#1e;@;!X*GSisHxC{Z`E?!ohx3r z`CUgtN2e+&`NsVF`|O3QHO>O!Hm_5ApYxwdup3`q1SkETvt{6V{6ZF}6Ll`<^Ex5J zJrKykP^m`$Xxpv8V(N2H3^@bkOj$V=h^_2~?GQZ>L_8~RSvP&Q0bO;y+^M6bqmu=> z)D5q^%e)$>VEu;y;4lIX(rVoIWG3qg07Z?<-#diU*}zX=d9>J()8s1}i~wNr?fV4; z;0bCIuFQEvJcb35ce_pHI3GXvG^4u1zf)@WgJtU@0Dsg10{U23STM%hn?P%$JFz)l z3evj$1$M#9Xrgh~NxjKDLtx71`S~^A0O*qcNER&S{41k+<^RzUrBy2^0=L#u9VTci zn~g6T99mA)TTUi)~)7W7u&|E>l1+Ruxo# z;OtTwP~}OWyvVi|R+YU%ySig-Z7ldrSrPsyLIw%~N< z1{JP$i)Sq?g>=_|GvL3^DzD4b>*kyy7#e?(UH`6Yvl}{G|DK!+<3ARI8L_J6S)Fv9 z=~C*rs<5{M7Ek0uFbv{SQ%f-jdARH+Cu}q_L6(KRMga;1`B4)SlUi_8@keq4N4muZ zZLgzNPHqmOeTy6xT2B~mfw(pdu-v7OzK+ex2O0Xq_{4e!C{^>H?8^o0MktEmx)VG^ z?&x&R+@UV6uA^}6!Nzw0j*~;#qN!JSFwqkdx&E_5KK4&;QM7p%r1c+qdc7&;!dS7& z$<@&yrw8}(oJXuMcrV7VwyccKwQPg09v%D1lL9z+L~2%66%9St1PTCiBA^CN;q!2c zOKJlLCa`nc(NN;oMc`0CgSN>1bRLQf0@MSh0sgR^tM}9F2gnF;g4=!%Nc14-14^d^ zn$3J^-WLni=`TKce3xvr#Kx!Tt^)BPNr z3!tE*s{!-|NVS;;{Xb{IB{*7aUqR?{CY3P)o!CzZyi?(FRd33+=$?Hv5NFNJ8dzu5 zV?Vvr71~!+NcdbKV>@=fN}UuKr9&7{9Y)8i`s>5fwj_P;;W-m;imCJ1>H(h8frIT& z4h!;tVD)|ZENM40#wT`v(MDSO%BdFbqs~O!&3>pL*E|qwgJAF8P|76!`r)<5-a-JJ z1w>`hus{6owqAqTiRvtn)qE&AVICKP4*ty_=xbY1XvfztQa%@w zncl{jvAj^}1z$$~^E@^m&!2_dd&tdQZMS-tKA3!qDy7S`-E(RgCppTXo*Ljgl0_#L z&~5QN0Yb2{nG@n7mHA+z+fJ;$&$x%DmDTu3Wt&^;UIb%FMKnHD%enbZU)^+GH*uXW z+z;P2{${BW# z_*R!fw=m2@lyF+*+Z<+U8Uyw61#gskzkL|I98;-1?ZMh!w9@B$%7_F_414ni902<= zn}j514vvsVk9jX--rC+Y$fNrg?sn$qJo#aIWIog01xmtAg66544@N3lTF;gS)NFUJ zNHW-@QUo(49+WBT>)-O#BJe_A(8OxjEVcxAsj8k`(|CFbfAMm~t}wN**aA#UMI8PI zRcCq~GDZP9gf?9td&#Hm`A$;qOPSkq%spJ)YW{JxqPVz~)BeRB1e~^;lgd)L*^g!s zu!BLEz{L5(J(tL;JzD;21M4?C1e~L_;VHia0~Z4yv;L2wBDy*o7dMxYI&Hnnkt&7f zLJ1^zhI3rbM{@5KU9W%7+GOsv!_ACEY=iO>5N>5afYsF13)0dET7J*iY>wokf@%iT zlXK3XHkv?e`dY$R^DM+=Ip}iYXQECX?{1ODHBG3h)$AwhnF_t;+nXA~w(|luW3lTa zdR_?$EVSctv%{V$F;MiMlzE@`{g)b$vFK2Y>Du`C~z2rR4f6fp)F9+3FL^u4*bdCb&A-3eL zYnOZY9JibcQ{8HOEmam9$}=OVd?#Zno8a9KfvbJd>&u1P3GjAJxlvaLyQfDGukV%- zL~(w1{$voaZ)eQ;NX{8Xg&hR$@ujYTI=u#)-yi$(t}l0lD-O4D5D>=KdF>WQ@dB-t zl=?H-Q#oZ&pGdA}4q8MrhBSo|eQa|$oin-lfQ%gJHPE!1Gf&R~N+&|W*Saw6&!)1a z;Dbr`#M#Swc)yD|&r<%k<2XGx=Yu6%<=Wh#T&080Us+d5yM4NrxjH6Ft+naL(NCM{ z;xhE}sZUZV*IYU$PKu+dPOZu&^7{Vz{5$I!akZ7o&YrX0?6LFUxh?BIB$8+P=NI;^ z=jM7|JGC&Nb#ZYi_cd3dw|Qhy1o0lK(5u~xZQ1^~X;Do^`Nn zhTi&Vt?#?UBpG30bW~N*P<1_U3|FFPKA{JdvYD8=ZH`QOeQ@!%7p}e>>6ftZNvI&> z+Rpe#RGV4Wp{crv+mH(_HASNp0vtbA`{rSNO6^(P`EKG#qF;y@@xOP=uW#n&8Z(=! zs%|DNo(>qik31i-wXE72&Rv^t{P9qFigG)J<3kCd@1@R<$HXGw6gLq^HD(90HO5SB z+0uIbX=BiLKTCDi9$Nt+*=}~*S17Qp@~rca61+mPuh1Rk@%!8c#cjSxW@+h%uY@15 zugik-IJQfnoXKVR@sKFG4KK7+fn+Y*NT1*e9}mv5>?Kk0W#6wk=lM&|*Ed>?$I4b) z>KkVVGwZX>T8*oZ2I~y|pU%z$s;RBp_px9F5k;kQ6a+*8l`1Wuf}%8~*MKO!D!m0k z!EYl1{_<4NJ%lJS#{p~@(S0LF$NDrd|-z@ix^ z>eW~iExa|uF3UP%+!?mfyY^K=ajco=K*P%n%^2$H8G_kCa=`LCIS)OZ2zEXZbJHDC z5x3t)0Qn6Way)vS-P}xj+#Z;jhrif`YnNW#-nUObL18FxW}&{_L^o;)uUkhg>8UDn zZ#>nzlq=HDP^lauzq9-OdwUsgKo2pYXDIBtn_#U0+LIO;{sXxVa_xtU*9u6wYb}9=G z4^J6BCtI0Abi#`|)IE&mDilMn**+nWm&Bl>I(cwlVW3+qYVz^B#gT=$3m1wRpojeM zk@9o??9h{YIU_FamuS_H)=Snm7iHpIteZ0(2+=nXduPNc)g!MUspq@;J!dGBCk)!~ zOlVBLNm85vrxy8P#f_bf`{o!j_vGm_f*y;lHb=&fX`HxUQ~JE!_5D-(BaF&{)GN{! zK{a@%_QWOyYR_ou!jJ<-P`vw^Jci>Yb%l8@*vPGr=2A53JGyFCH7xg5@{kvGOH&tp zQP{CsA;|7&U$=sF1Z6eJE50<5M!}9p4-{uxrT{om`l#x_A+J863l|C>R;UJ`)aPRv zv=h7t>vbU`TBu|n((aDJd}o8-5D#3eGdw(tuY&*TruCZ3(xhLvWt>XdUd}*dPK^04 zSr%0+hRp;s=rtao_VM7N?+%r+%mYxw-*Qyg{&K(oA<-FCH5!P){Y;i#4Mk+;IQ48v zIoT$}>+)ohaY(7%LMYag*2XMmW^>JG#-!CG zTM|vU%8up5VvPaR4`;ivRhqtw*C2nLJ(x}#XYMkxQS;l_G%#TcrTkopoUyyMGoLe; zt3M#d>z6m)QZbt|+s)&ro|E-)Nw~DxC2&%1sr%ETjVEmQM~z4D3WSqd{bFgUv&m%{ zf2|@u&)~*N9Nm-njw?`#DXXPJIT<59nLyr9umWgm5v7B7p}(^nCt`kSec@sxTjwUR zAnT>)Qh_3+zc2%f=l1GZ8TVQZTL{|IHQQVJl7Bv~Bnc4P>|`@&NOBX_JL+GatsIW6U%@{f*;WxrzbvsGoOZdla?Q)kj)n6xej_ z)tN0yT=-6+Zkd#A>nETo9&O!tY}wQnkN*!UPaVZ`eTVVJ_VNw89wF+PLta@ARg_h3 z_)P#qx)-BSxq2u2x`pMd5m;K9~w>${Ef8cry zQ$N|Xo~)6Jsn0Js7u|J#hxo|adf_a!5cffPG;4?Cx%GJG%DVyy_pCJn4e~UUCiS|{zrUF1%QZdTz(i$P()Qsp)1Xql zo(yesx-27=%WpVA`z8>xoh&xuVL%~mEX;Aem1kLmz%=|gU-%1RT=XRr*gOA@j$L1<%JoX2@DuiV zIZZGHqh;N{?N%dqmHaB!_BCh8A=c;Z6oVwCen@;hDyEw(WqF-~v#lI#V8#cat;qYi z6^EC}hn~sxw29bc$#}f%^j&M|nACVmvuiHs7d5j*UhMGxaWyqzq znzyP!Evfl@JE;AI!)V%$fWb^i|c;@VOdlCk%z;%OorYV9wO+VjwCf=a{wmjlc zpR|{-yPwA6xgJWhGuz>*zPHS+C4aVJ)=)pkKL1g#t9HpkpnImyK@Dfej}tHVCR#%i zW*c&>#X_Rs@VlR}!AH~}LZ~8(cWRTp(~f&H-jH>Lw|_PmrInleSJ7(T;n!q10xoEs zX{2=KJ4p>V27{k7Rg}YCwmrn4&GkdP?l|t^r`XEv*-&Igsm>#x`@h}~6bw>^4XwlC zo~Fl&p$hJ$sa_Bj)r87lxC2|*mH!j0p3wt}Pkj7sCT)BVdSkU^y@QmARI_UOp+7nW z;Y7=C;?GK6@kf7A99h?HU#lXU+d)cSe-vPCdOMK4Br(&sGgSqu&Gmjc2yqVD>tM&k5f_fLg& z)}iRVd`ZH( znJBBAY~+&53u&9**}|7Br4$_WbeN(|<>G3S*R*3vp=Epmnr6)N$NEE_z6l)2F)^To zMGlR$9t(76>Pj!pth#xtYN-N3)RGA;jOX#9(asPdzP5GGz7JVK7ZfoIEf1h>0)hI#u;(05c+R@#0;mqdIPr9Sq+B zD)+v%RUHa6HGg3LsnC2P`~O^xO_;pMaEj@0geo_QHB3@RGtn!mV$GAFtNKhjH=|ki zqT|3s^-zh9;Ldup&XWA@lQ&4aA5;NSb4vKQSpoY^Qi%+;}>N&+J!OGrxSVJQHGj-zB_R&3oG)&I5qF zXQOGAL2jVBO)|Z5E2rWn11nj|4FpKG(*j_4UuAcrRUGac=y`EtI+ZjF+8;qL9rjNgUs z-dV*&l6%OVdEug;ItYbnmcs#F>p0|c(E89h|FGUAqhG6;8bzq4uRS#j#V$*Q9UUCB z?R6_o@}_`rnn{^+(=}qERrT_hqk9V5cik_4*I_tyORVZkeuwllF;Y?!=|||$EUQZL z1!tvbq2J~JeQA}W@auyjM zKnfY{wfxL3l~;yF_pVEienbWCx>*ID?Tr1Xdi&(;-tcfcJHrVMb9Q{`5i&N`WI$0^494Anb8Zb zUy`IB^RvuC&JSnx>>o5@OsI zo#0e0zO-mf^Ah`zyUlG5$gVJ=>2x{8s7yKD=R5ay-q3r2y3p4zwQn)E!%wozkPDBu zeEVc%R`{a(Z+(rE5Zju~KmwI?`=nr>LPxKG2}xBGbII%;ZpXWxK@V@!mLp4Tw|*lm ze8_8=;CgWX-caea;Xj&C3G83{L-uhbY$d$%ty;O!;;GqP+UV~7{e60sNpXSC=Z-hs zry1FLr!)KUmvXj0IXIA-jUD+t^$$bY4*yVjze-6?P z@;4k}mX2so8j(IOy^l@&#g!1Y27{ZtV)x@h+HriI@}|?TAC-v>+6>G^UGwbs<8bT1 zd>(SGrB<%u(^+{@>m_Gc>XD~fH`IF+yjMvbc!jqsE0ci46a*-E%=AT)TSM5svPq`C zPxbsIMQnSqLj3WEoac`N%ce^1&%;G?&^&?VL9WCA=u&Z~pnPC|h30u%$!RAfY27j$Z+DwbL!Y zFM@Q9yDmbj3j%}P4=mwuF7l} zGnqK}BY8yDXMf34e#I@sDfr$ttGd$=dd&da_h?Xp=riXVG4N>bGr7;1ZlOmE1^Tmo zJV)lm@CV5H22;x&Q}`ab=QDB&B2C3M-OQ3&nk(&72d&WsqmsjMZ|mCJ0{Ww`d599d za;Z8_gMqd()J)lWfkjO+hXX*X74)q^T3TCu$Led_9t z>~SjbhNi^A*Ojx6((MB$4D-nywOh(<)E%oxV^8mQoSNN$sgWr$_3g=Qs_T&nm=onD zj*cTkBiUpK?`wS~OoW1qxWU{xpF>_ejWalHzIN^LRQ@5uMGnI=N@Moj8LtQ*{9}S$ z!w|aWqs6ZJzNx0Am4U3%B0j4J_So)5Ym1LZVdRc4j=6X)h&gvnqMauu=XP5q8Al|y z6&)HDQf@fz43lMQ=?;E>h>8l zE&m~#@ZrOH-b~Tln@Gp-YdX9WRV=SHVhA-I+dBp3KhQEKq6uPWK z#o4URf-fl7CyeP$I})`$vXoecQmZ6lX1ftClGf^zO_q)$@vg*Q|9}b+L53*b#cE1O z6Dm+eVk$$zTq@VXevA`ud@!$uUtAXghESRN`ID^* z@pft0<+0qZxu?No@=Ow8&bgQucuA?eC^UxQlq%xyqYw}&9=;8dC9BO&MWkNS&v zH#8-`Oe;5@SiAE#deLI=$P`;>v3s_J{ofSd>(|>zW^O9Io?Z=1y>ZGZNB64TyxCh? z=W5wNI|#*s#F`_RmoYKv!^5WOl2dDspUIW0IOb6%>Z@`yppg<-J^gs_z?7M9_8N6( zW3G%~PLk&hH1wdXxD8B(@ z`7J7kSr_19USe<5L>DG^(+lhC^8-47DdTgowX+*Un@dbUum+*TO`vg?1uc*=oYvhfJqGM6r7O%D~O^Jm*QKAb%BfNi7h%_p>1SyX7K z1IZ|WUyai*F_)?4!0p$ip4V;-NNdFMQ{1fuaqv4tioHxx;zA`&nOoNL`?-&%q;}a0 z#_KU)tIa76i8fjtii4QP6-PSMdfrAFy|$dTAQr9Po%myc3%ic)}XmvlwYtvW4l@NzvHd`kDLFgEz@;qkV41xVu7@<@^x zg*Ew6q5V;BXmU4f4=0}nawscHs7!${-I+!Q^P=;`0Mtf&#{Spp)9{~tAg&l#L8$rJFOSum7u5m=Ri8tLjK<9VL%a6>qC^HU2F(5cZ~hdz z?!Rp*F1>~j5XCJQAMXX={(6*)oX_e*`sT|4A7=&9mEYjpGRtrKhc9MhzxB_ApE)Vo z{tO^*`slS}{@cW%_odb1?jAss%Xf;;LVS8AcNAjF3;+o4=?}--d9~>}{Uxvj&;^wO z9g#;U3jc@#ee64JV`Ka8kB=g$hAgi&xfFz>W+8%c05P_wr1n{rFTFDi2d6CO@6rgSZV8@3vmm z7OAo@FpqbxX}d6Rw9Xlh|DOyqC}1YQEC=FG5XXCf1O@ijh0gF~+&2B6A%Xa3_Ta+s zFVhaFxXlBT(#^;dcrrLd(tp#nPM+s;GCp$Mief0;7rkyZQq6*+BYB%(9sxJ$fBS#@ z+s*_V_v%&3=BBsG&6`3X=D%oAuvUpL_uqOP$su{i&CLz`K8&lq3%ej0OktE>Arhyb%qTx#^-j5SHXXqh*JGU8QnKqij1y!2}qv8LR=;Jdi}CtA(UT+~0sQAs>){Y~X8vH;zI1mwv_J4E&bh zg@BUmPSjik30VZYxO<zPQ3T4tZk_O@3@hxix^U4YAS`n2^W!;hc) z%iYqhfMPZc5OY=F(G6N)U8^}&PGl{RSt8_Iv>Io+^ia(fQzHH?M6;ghI=P1AKylxbofAmPmlJIpHQn`7DBvVT!^sD%I`#iI%zDYt zoPSPDadaSQ)ht|Gj+cx+ef{L&2WW95tZ(1?D?M@x0aKp@B~}^>2Zz05|Ht}Fe69ct zGYK%!hy+O_JG}q;7P}die zS0pf196mlifhz;|4GpK)Y6Act4dv&%c(Fx{O6KvKzmn!L*VhD7-gdsfu)!O|kP#qp zV?%GXpmwLK>hWL*@DL0YRDlti^n`LD%b7DaAd|-VNym17F#}ChC$NK)H0lzKtNR#_ zyzR5FFi!fFWooz_0kinV>lMIeWT&6vz{L;X}QeU;0*$_NGOoA!+rv;Z5=Hz z4ZtsF6b@uL@DdOd%Jm6KNc7Op2QJ}Y@%NVxf$oso>k635tK{URx5V@~^Q)<;sa$jJ zga9%-tVy1}1pvgd^QlI-&TRt&-|W5KYe2EDfehk-`_kyhCQOh>6lK-`tW@UaWI$}r zcjZc9(~f_@2>k>m0X)xI4@B=cz5`U!4bbx3H8L)wv#XVlibKY`8?yty54_`_cGjAUtncSqr%7|v4;q|~cKu;s3N424ZOLx$ao;$Aw z(SjZra%zsD4RH{P=rw-G-+JL`S>P4(U94|`?s5W{?0N!WR~4S22V}4$=*jDL9l?C4 z7fIXF(tx55!~wC=ma{iP=>!{uk{EvADK%jTrbAjeQ$}FodJ^WlFj8dzgqkL;{52~l zXD%p3l8vI9@y2-HpG0CV99)we1{O&I0!?<9_*l2uC*?O_6w#xNEim-6Fp(y=8oqq# z`Shqo7GGG4M-9o-KO{`=OD36gkveGJwL$AopO6p|{Gjx^ytSUQ2KU9bEnYGnG&+$( znPONgaDzE5YMMod#9*v+!Ez}9k};6zd?n(Yfb{g8NssG+3L9N|41qQ?L;YPU-rTQk zNzvrAn0X!)zGq`ioVREjiwYmI>!SB<`bkPl7U~t*HGQ=c6%k3$wW8cJRDL!>oxbz- zl*;M6t1VeV!RlXv3skjFo4q)6{!*5D(yJe*45C^;eLAXp{gHIqC1eM$0=K-O!j>OK z)TU#HA!KF5%ZB(w(X-u#P_!gnw^_A62tXnAB-v8|I~yuWN(gXLDD+-=0M;xGVpt~Q zue6(Qfh)YpyaPQLOHXVAc(yL|b_ z5;Rexf)Psh@1KBx6$}{?7kHH@D=WuG<(HJ4f~@UZTN?v{Ur?}tNU|g^j=*uqx$>Y> zJ_Gv;WvqG4u0RGbgySH3mwEj25Qy&%!ka<_PNOdhamf6MZzH@GQVE?PWcDmd1N)Rv z&|$TGUGYGnS#NoJ%W!aV>S58_Ylp?mtF_m+`lJQ}o3N!I3rm#uIRUe58g$;mVBdmF z&CZ?)yF$&R(9Dy6{P=~*AXN!B)2^%XALPSsRt06e$1`I zpRByRub>8akDs$FG?d`AG7)%l7}n%0EEaYbkc7X4tinDV>?xe(c3LWa)rTCE@mk~p zP;yS1$cmuPPSvFoCr@4h-xr9?H`$fRnGXBHA0L`79WV-HK29#)R)*#75h0o!Z$G`w8w?N5X^o)foeXHD)91Ohr{Y;25h zbtIrZn4t@FQ!C!`Iq65^6B6|9Y8vM2Ul0?E%Qe9M)jh+S**)Mq5Ddb%aQs{hpvZR( z4g00zMBtXi4h|X*z?c(5mHr_pcncI8zu>eVPc|_@$Yn{=6ofUU*WDD^Uyn1Lpa*4SxTvV~@XMoEYkKjky>oIbMQy%{PEn&qEgp_BV5_S2Z7ZNMh{Jg8%gj-+R0-EOrm0@1**%PKBidzyEsHlPa5{|X!9 zRqths3ZTJJWzjIi_r4$D17nMKtF_ShEHu3IK!W2ublYZUXLX0Xsk;^GAnv$+{W`yd zgf1Ah{RA~2k3Ub z#wwC*0^&VB#Na{^^5-IWWKDQhHQJZlv9Nd#jtoYS8QOn8kx$f`bEnmQ1{8|r74MkK z;tTjP9Hu{H373PhdhUVSt()+5>A(F6LOx)d)wzrlnD0Z3jL#Y$JZ<@PU0Hb#j2WMx z7t^%Yn6RK1#nXUJyH=Q9ey0vWP36IZa}lb1_V)H4Aw{~Ws`_BK+-+eL-_;cy6olYW zI=5=VRl#>o17u>)%&|Do>s^736lAhMk6G?+uE==IvO+qGK*%N?gCv_x+L;S1b%G&# zL?B=RKYsG$zL0uE)kqXf1K}ZOTiYPb%nrsc;WdE2t)P14MiN0m8q!!3OgAJ5T>QQ!ps z6W6MdZ_UlY5cqeuNN{7GLgs#K>8_`z6v##+AW8h=(j`V%>22-oEMLEP(a6ox*^z_A zhQ0XgT$Vf7W-Shb>-dQi!7zq}7GM_LQdK?RD8-qle-ZMOE07GqU^*u%dhBkTXyXU9 zH!qWu3F84qRh}Z?^7H)b*PC22t_Tp;-Eece^5$Q6ifRJWCKNxXQWr>pmEmBz0_cHy zv8AP_MdR-FPCc&BHZd6*?{Jq*Y9_&2(%@W1OItY3LczLTXJE0@bacGrF3U|H{|izH zOo-#)o@h@S%^z9{oLgA?<%fMCj zIXQQF_4V|?adAIF2`t$VW8dEhHqSW;8Wel1OsM?Y0=qc3oed|rq2g*c*Kn>sqh3hZ z;S0=QfZSG7s|Rcryeyu9HQqQ#G|D~ZPIDc@c@Ywlk|N*P1c~}C{3JX~vL{X?2JIKj^k3N3&vznswa_Oj*PlmEI(vgZT@ z4)pf+!f0%SH(A+Sv0{r<<>%+OfDrnvwRLyVPf-1B5UdR6kVsw_-kqJDLYFRmg)17P z2w6%4*b0rg%2<>Q6@r2?f=8ipq!e^KW1dxR#CC2*E}7$yLSJTMREH~jMb>U7?Zmlr z+ZSNaK_2`WKELJFciu`brm^*e-xIE|oDb$(Y8{J9%L$l%TP#uk>A%mN_M%V2uM<)9 e85-DLhtT{ncE7#Z;u`Ft5o#)$Hw&&?1phBb_eJFZ literal 0 HcmV?d00001 diff --git a/docs/_toc.yml b/docs/_toc.yml index 8f15ad5a..6ba9d46d 100644 --- a/docs/_toc.yml +++ b/docs/_toc.yml @@ -14,6 +14,7 @@ parts: - caption: Inference chapters: - file: inference + - file: large_scale - caption: Interfaces chapters: - file: api diff --git a/docs/api.rst b/docs/api.rst index e3a5ebb6..0b9f9b8a 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -16,6 +16,11 @@ File formats Sample data +++++++++++ +.. autoclass:: tsinfer.VariantData + :members: + :inherited-members: + + .. autoclass:: tsinfer.SampleData :members: :inherited-members: @@ -60,6 +65,27 @@ Running inference .. autofunction:: tsinfer.post_process +***************** +Batched inference +***************** + +.. autofunction:: tsinfer.match_ancestors_batch_init + +.. autofunction:: tsinfer.match_ancestors_batch_groups + +.. autofunction:: tsinfer.match_ancestors_batch_group_partition + +.. autofunction:: tsinfer.match_ancestors_batch_group_finalise + +.. autofunction:: tsinfer.match_ancestors_batch_finalise + +.. autofunction:: tsinfer.match_samples_batch_init + +.. autofunction:: tsinfer.match_samples_batch_partition + +.. autofunction:: tsinfer.match_samples_batch_finalise + + ***************** Container classes ***************** diff --git a/docs/inference.md b/docs/inference.md index e6ebac21..d6b846fc 100644 --- a/docs/inference.md +++ b/docs/inference.md @@ -300,4 +300,4 @@ The final phase of a `tsinfer` inference consists of a number steps: section 2. Describe the structure of the output tree sequences; how the nodes are mapped, what the time values mean, etc. -::: +::: \ No newline at end of file diff --git a/docs/large_scale.md b/docs/large_scale.md new file mode 100644 index 00000000..ee343d8c --- /dev/null +++ b/docs/large_scale.md @@ -0,0 +1,136 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.12 + jupytext_version: 1.9.1 +kernelspec: + display_name: Python 3 + language: python + name: python3 +--- + +:::{currentmodule} tsinfer +::: + +(sec_large_scale)= + +# Large Scale Inference + +tsinfer scales well and has been successfully used with datasets up to half a +million samples. Here we detail considerations and tips for each step of the +inference process to help you scale up your analysis. A snakemake pipeline +which implements this parallelisation scheme is available at https://github.com/benjeffery/tsinfer-snakemake. + +(sec_large_scale_ancestor_generation)= + +## Data preparation + +For large scale inference the data must be in [VCF Zarr](https://github.com/sgkit-dev/vcf-zarr-spec) +format, read by the {class}`VariantData` class. [bio2zarr](https://github.com/sgkit-dev/bio2zarr) +is recommended for conversion from VCF. [sgkit](https://github.com/sgkit-dev/sgkit) can then +be used to perform initial filtering. + + +## Ancestor generation + +Ancestor generation is generally the fastest step in inference and is not yet +parallelised out-of-core in tsinfer. However it scales well on machines with +many cores and hyperthreading via the `num_threads` argument to +{meth}`generate_ancestors`. The limiting factor is often that the +entire genotype array for the contig being inferred needs to fit in RAM. +This is the high-water mark for memory usage in tsinfer. +Note the `genotype_encoding` argument, setting this to +{class}`tsinfer.GenotypeEncoding.ONE_BIT` reduces the memory footprint of +the genotype array by a factor of 8, for a surprisingly small increase in +runtime. With this encoding, the RAM needed is roughly +`num_sites * num_samples * ploidy / 8 bytes.` + +## Ancestor matching + +Ancestor matching is one of the more time consuming steps of inference. It +proceeds in groups, progressively growing the tree sequence with younger +ancestors. At each stage the parallelism is limited to the number of ancestors +whose possible inheritors are already matched, as all possible inheritors +of a sample must be matched in an earlier group. For a typical human data set +the number of samples per group varies from single digits up to approximately +the number of samples. +The plot below shows the number of ancestors matched in each group for a typical +human data set: + +```{figure} _static/ancestor_grouping.png +:width: 80% +``` + +There are five tsinfer API methods that can be used to parallelise ancestor +matching. + +Initially {meth}`match_ancestors_batch_init` should be called to +set up the batch matching and to determine the groupings of ancestors. +This method writes a file `metadata.json` to the `work_dir` that contains +a JSON encoded dictionary with configuration for later steps, and a key +`ancestor_grouping` which is a list of dictionaries, each containing the +list of ancestors in that group (key:`ancestors`) and a proposed partioning of +those ancestors into sets that can be matched in parallel (key:`partitions`). +The dictionary is also returned by the method. +The partitioning is controlled by the `min_work_per_job` and `max_num_partitions` +arguments. Ancestors are placed in a partition until the sum of their lengths exceeds +`min_work_per_job`, when a new partition is started. However, the number of partitions +is not allowed to exceed `max_num_partitions`. It is suggested to set `max_num_partitions` +to around 3-4x the number of worker nodes available, and `min_work_per_job` to around +2,000,000 for a typical human data set. + +Each group is then matched in turn, either by calling {meth}`match_ancestors_batch_groups` +to match without partitioning, or by calling {meth}`match_ancestors_batch_group_partition` +many times in parallel followed by a single call to {meth}`match_ancestors_batch_group_finalise`. +Each call to {meth}`match_ancestors_batch_groups` or {meth}`match_ancestors_batch_group_finalise` +outputs the tree sequence to `work_dir`, which is then used by the next group. The length of +the `ancestor_grouping` in the metadata dictionary determines the group numbers that these methods +will need to be called for, and the length of the `partitions` list in each group determines +the number of calls to {meth}`match_ancestors_batch_group_partition` that are needed (if any). + +{meth}`match_ancestors_batch_groups` matches groups, without partitioning, from +`group_index_start` (inclusively) to `group_index_end` (exclusively). Combining +many groups into one call reduces the overhead from job submission and start +up times, but note on job failure the process can only be resumed from the +last `group_index_end`. + +To match a single group in parallel, call {meth}`match_ancestors_batch_group_partition` +once for each partition listed in the `ancestor_grouping[group_index]['partitions']` list, +incrementing `partition_index`. This will match the ancestors, placing the match data in +the `working_dir`. Once all are complete a single call to +{meth}`match_ancestors_batch_group_finalise` will then insert the matches and +output the tree sequence to `work_dir`. + +At anypoint the process can be resumed from the last successfully completed call to +{meth}`match_ancestors_batch_groups`. As the tree sequences in `work_dir` checkpoint the +progress. + +Finally after the final group, call {meth}`match_ancestors_batch_finalise` to +combine the groups into a single tree sequence. + +The partitioning in `metadata.json` does not have to be used for every group. As early groups are +not matching to a large tree sequence it is often faster to not partition the first half of the +groups, depending on job set up and queueing time on your cluster. + +Calls to {meth}`match_ancestors_batch_group_partition` will only use a single core, but +{meth}`match_ancestors_batch_groups` will use as many cores as `num_threads` is set to +Therefore this value and cluster resources requested should be scaled with the number of ancestors, +which can be read from the metadata dictionary. + + + +## Sample matching + +Sample matching is far simpler than ancestor matching as it is essentially the same as a single group +of ancestors. There are three API methods that work together to enable distributed sample matching. +{meth}`match_samples_batch_init` should be called to set up the batch matching and to determine the +groupings of samples. Similar to {meth}`match_ancestors_batch_init` is has a `min_work_per_job` and +`max_num_partitions` arguments to control the level of parallelism. The method writes a file +`metadata.json` to the directory `work_dir` that contains a JSON encoded dictionary with +configuration for later steps. This is also returned by the call. The `num_partitions` key in +this dictionary is the number of times {meth}`match_samples_batch_partition` will need +to be called, with each partition index as the `partition_index` argument. These calls can happen +in parallel and write match data to the `work_dir` which is then used by +{meth}`match_samples_batch_finalise` to output the tree sequence. \ No newline at end of file diff --git a/tsinfer/formats.py b/tsinfer/formats.py index c485d792..95e23a44 100644 --- a/tsinfer/formats.py +++ b/tsinfer/formats.py @@ -2308,7 +2308,7 @@ class VariantData(SampleData): the inference process will have ``inferred_ts.num_samples`` equal to double the number returned by ``VariantData.num_samples``. - :param Union(str, zarr.hierarchy.Group) path_or_zarr: The input dataset in + :param Union(str, zarr.Group) path_or_zarr: The input dataset in `VCF Zarr `_ format. This can either a path to the Zarr dataset saved on disk, or the Zarr object itself. diff --git a/tsinfer/inference.py b/tsinfer/inference.py index 9693e0ed..b948d060 100644 --- a/tsinfer/inference.py +++ b/tsinfer/inference.py @@ -592,7 +592,7 @@ def match_ancestors( def match_ancestors_batch_init( - working_dir, + work_dir, sample_data_path, ancestral_state, ancestor_data_path, @@ -613,11 +613,78 @@ def match_ancestors_batch_init( time_units=None, record_provenance=True, ): + """ + match_ancestors_batch_init(work_dir, sample_data_path, ancestral_state, + ancestor_data_path, min_work_per_job, \\*, max_num_partitions=None, + sample_mask=None, site_mask=None, recombination_rate=None, mismatch_ratio=None, + path_compression=True) + + Initialise a batched ancestor matching job. This function is used to + prepare a working directory for running a batched ancestor matching job. The + job is split into groups of ancestors, with each group further split into + partitions of ancestors if necessary. `work_dir` is created and details + are written to `metadata.json` in `work_dir`. The job can then be run + using :meth:`match_ancestors_batch_groups` and + :meth:`match_ancestors_batch_group_partition` then finally + :meth:`match_ancestors_batch_group_finalise`. See + :ref:`large scale inference` for more details about how these + methods work together. See :meth:`match_ancestors` for details on + ancestor matching. + + :param str work_dir: The directory in which to store the working files. + :param str sample_data_path: The input dataset in + `VCF Zarr `_ format. + Path to the Zarr dataset saved on disk. See :class:`VariantData`. + :param Union(array, str) ancestral_state: A numpy array of strings specifying + the ancestral states (alleles) used in inference. This must be the same length + as the number of unmasked sites in the dataset. Alternatively, a single string + can be provided, giving the name of an array in the input dataset which contains + the ancestral states. Unknown ancestral states can be specified using "N". + Any ancestral states which do not match any of the known alleles at that site, + will be tallied, and a warning issued summarizing the unknown ancestral states. + :param str ancestor_data_path: The path to the file containing the ancestors + generated by :meth:`generate_ancestors`. + :param int min_work_per_job: The minimum amount of work (as a count of genotypes) to + allocate to a single parallel job. If the amount of work in a group of ancestors + exceeds this level it will be broken up into parallel partitions, subject to + the constriant of `max_num_partitions`. + :param int max_num_partitions: The maximum number of partitions to split a group of + ancestors into. Useful for limiting the number of jobs in a workflow to + avoid job overhead. Defaults to 1000. + :param Union(array, str) sample_mask: A numpy array of booleans specifying which + samples to mask out (exclude) from the dataset. Alternatively, a string + can be provided, giving the name of an array in the input dataset which contains + the sample mask. If ``None`` (default), all samples are included. + :param Union(array, str) site_mask: A numpy array of booleans specifying which + sites to mask out (exclude) from the dataset. Alternatively, a string + can be provided, giving the name of an array in the input dataset which contains + the site mask. If ``None`` (default), all sites are included. + :param recombination_rate: Either a floating point value giving a constant rate + :math:`\\rho` per unit length of genome, or an :class:`msprime.RateMap` + object. This is used to calculate the probability of recombination between + adjacent sites. If ``None``, all matching conflicts are resolved by + recombination and all inference sites will have a single mutation + (equivalent to mismatch_ratio near zero) + :type recombination_rate: float, msprime.RateMap + :param float mismatch_ratio: The probability of a mismatch relative to the median + probability of recombination between adjacent sites: can only be used if a + recombination rate has been set (default: ``None`` treated as 1 if + ``recombination_rate`` is set). + :param bool path_compression: Whether to merge edges that share identical + paths (essentially taking advantage of shared recombination breakpoints). + :return: A dictionary of the job metadata, as written to `metadata.json` + in `work_dir`. `ancestor_grouping` in this dict contains the grouping + of ancestors into groups and should be used to guide calling + :meth:`match_ancestors_batch_groups` and + :meth:`match_ancestors_batch_group_partition`. + :rtype: dict + """ + if max_num_partitions is None: max_num_partitions = 1000 - working_dir = pathlib.Path(working_dir) - working_dir.mkdir(parents=True, exist_ok=True) + work_dir = pathlib.Path(work_dir) + work_dir.mkdir(parents=True, exist_ok=True) ancestors = formats.AncestorData.load(ancestor_data_path) sample_data = formats.VariantData( @@ -663,7 +730,7 @@ def match_ancestors_batch_init( current_partition_work += ancestor_lengths[ancestor] partitions.append(current_partition) if len(partitions) > 1: - group_dir = working_dir / f"group_{group_index}" + group_dir = work_dir / f"group_{group_index}" group_dir.mkdir() # TODO: Should be a dataclass group = { @@ -690,7 +757,7 @@ def match_ancestors_batch_init( "record_provenance": record_provenance, "ancestor_grouping": ancestor_grouping, } - metadata_path = working_dir / "metadata.json" + metadata_path = work_dir / "metadata.json" metadata_path.write_text(json.dumps(metadata)) return metadata @@ -725,6 +792,28 @@ def initialize_ancestor_matcher(metadata, ancestors_ts=None, **kwargs): def match_ancestors_batch_groups( work_dir, group_index_start, group_index_end, num_threads=0 ): + """ + match_ancestors_batch_groups(work_dir, group_index_start, + group_index_end, num_threads=0) + + Match a set of ancestor groups from `group_index_start`(inclusive) to + `group_index_end`(exclusive) in a batched ancestor matching job. See + :ref:`large scale inference` for more details. + + A tree sequence file for `group_index_start - 1` must exist in `work_dir`, unless + `group_index_start` is 0. After matching the tree sequence for `group_index_end - 1` + is written to `work_dir`. + + :param str work_dir: The working directory for the batch job, as written by + :meth:`match_ancestors_batch_init`. + :param int group_index_start: The first group index to match. + :param int group_index_end: The group index to stop matching at. + :param int num_threads: The number of worker threads to use. If this is <= 1 then + match sequentially. + :return: The tree sequence representing the inferred ancestors for the last group + matched + :rtype: tskit.TreeSequence + """ metadata_path = os.path.join(work_dir, "metadata.json") with open(metadata_path) as f: metadata = json.load(f) @@ -756,6 +845,24 @@ def match_ancestors_batch_groups( def match_ancestors_batch_group_partition(work_dir, group_index, partition_index): + """ + match_ancestors_batch_group_partition(work_dir, group_index, partition_index) + + Match a single partition of ancestors from a group in a batched ancestor matching + job. See :ref:`large scale inference` for more details. The + tree sequence for the group before must exist in `work_dir`. After matching the + results for the partition are written to `work_dir`. Once all partitions for a + group have been matched, the group can be finalised using + :meth:`match_ancestors_batch_group_finalise`. The number of partitions in a + group is recorded in `metadata.json` in the work dir under the + `ancestor_grouping` key. This method uses a single thread. + + :param str work_dir: The working directory for the batch job, as written by + :meth:`match_ancestors_batch_init`. + :param int group_index: The group index that contains the partition to match. + :param int partition_index: The partition index to match. Must be less than the + number of partitions in the batch job metadata for this group. + """ metadata_path = os.path.join(work_dir, "metadata.json") with open(metadata_path) as f: metadata = json.load(f) @@ -781,6 +888,20 @@ def match_ancestors_batch_group_partition(work_dir, group_index, partition_index def match_ancestors_batch_group_finalise(work_dir, group_index): + """ + match_ancestors_batch_group_finalise(work_dir, group_index) + + Finalise a group of partitioned ancestors in a batched ancestor matching job. See + :ref:`large scale inference` for more details. The tree sequence + for the group before must exist in `work_dir`, along with the results for all + partitions in this group. Writes the tree sequence for the group to `work_dir`. + + :param str work_dir: The working directory for the batch job, as written by + :meth:`match_ancestors_batch_init`. + :param int group_index: The group index to finalise. + :return: The tree sequence representing the inferred ancestors for the group + :rtype: tskit.TreeSequence + """ metadata_path = os.path.join(work_dir, "metadata.json") with open(metadata_path) as f: metadata = json.load(f) @@ -805,6 +926,19 @@ def match_ancestors_batch_group_finalise(work_dir, group_index): def match_ancestors_batch_finalise(work_dir): + """ + match_ancestors_batch_finalise(work_dir) + + Finalise a batched ancestor matching job. This method should be called after all + groups have been matched, either by :meth:`match_ancestors_batch_groups` or + :meth:`match_ancestors_batch_group_finalise`. Returns the final ancestors + tree sequence for the batch job. `work_dir` is retained and not deleted. + + :param str work_dir: The working directory for the batch job, as written by + :meth:`match_ancestors_batch_init`. + :return: The tree sequence representing the inferred ancestors for the batch job + :rtype: tskit.TreeSequence + """ metadata_path = os.path.join(work_dir, "metadata.json") with open(metadata_path) as f: metadata = json.load(f) @@ -1023,6 +1157,79 @@ def match_samples_batch_init( record_provenance=True, map_additional_sites=None, ): + """ + match_samples_batch_init(work_dir, sample_data_path, ancestral_state, + ancestor_ts_path, min_work_per_job, \\*, max_num_partitions=None, + sample_mask=None, site_mask=None, recombination_rate=None, mismatch_ratio=None, + path_compression=True, indexes=None, post_process=None, force_sample_times=False) + + Initialise a batched sample matching job. Creates `work_dir` and writes job + details to `metadata.json`. The job can then be run using parallel calls to + :meth:`match_samples_batch_partition` and once those are complete + finally :meth:`match_samples_batch_finalise`. + + The `num_partitions` key in the metadata dict contains the number of partitions + that need to be processed. + + :param str work_dir: The directory in which to store the working files. + :param str sample_data_path: The input dataset in + `VCF Zarr `_ format. + Path to the Zarr dataset saved on disk. See :class:`VariantData`. + :param Union(array, str) ancestral_state: A numpy array of strings specifying + the ancestral states (alleles) used in inference. This must be the same + length as the number of unmasked sites in the dataset. Alternatively, a + single string can be provided, giving the name of an array in the input + dataset which contains the ancestral states. Unknown ancestral states can + be specified using "N". Any ancestral states which do not match any of the + known alleles at that site, will be tallied, and a warning issued + summarizing the unknown ancestral states. + :param str ancestor_ts_path: The path to the tree sequence file containing the + ancestors generated by :meth:`match_ancestors_batch_finalise`, or + :meth:`match_ancestors`. + :param int min_work_per_job: The minimum amount of work (as a count of + genotypes) to allocate to a single parallel job. If the amount of work in + a group of samples exceeds this level it will be broken up into parallel + partitions, subject to the constriant of `max_num_partitions`. + :param int max_num_partitions: The maximum number of partitions to split a + group of samples into. Useful for limiting the number of jobs in a + workflow to avoid job overhead. Defaults to 1000. + :param Union(array, str) sample_mask: A numpy array of booleans specifying + which samples to mask out (exclude) from the dataset. Alternatively, a + string can be provided, giving the name of an array in the input dataset + which contains the sample mask. If ``None`` (default), all samples are + included. + :param Union(array, str) site_mask: A numpy array of booleans specifying which + sites to mask out (exclude) from the dataset. Alternatively, a string can + be provided, giving the name of an array in the input dataset which + contains the site mask. If ``None`` (default), all sites are included. + :param recombination_rate: Either a floating point value giving a constant + rate :math:`\\rho` per unit length of genome, or an + :class:`msprime.RateMap` object. This is used to calculate the + probability of recombination between adjacent sites. If ``None``, all + matching conflicts are resolved by recombination and all inference sites + will have a single mutation (equivalent to mismatch_ratio near zero) + :type recombination_rate: float, msprime.RateMap + :param float mismatch_ratio: The probability of a mismatch relative to the + median probability of recombination between adjacent sites: can only be + used if a recombination rate has been set (default: ``None`` treated as 1 + if ``recombination_rate`` is set). + :param bool path_compression: Whether to merge edges that share identical paths + (essentially taking advantage of shared recombination breakpoints). + :param indexes: The sample indexes to match. If ``None`` (default), all + samples are matched. + :type indexes: arraylike + :param bool post_process: Whether to run the :func:`post_process` method on + the the tree sequence which, among other things, removes ancestral + material that does not end up in the current samples (if not specified, + defaults to ``True``) + :param bool force_sample_times: After matching, should an attempt be made to + adjust the time of "historical samples" (those associated with an + individual having a non-zero time) such that the sample nodes in the tree + sequence appear at the time of the individual with which they are + associated. + :return: A dictionary of the job metadata, as written to `metadata.json` in + `work_dir`. + """ if max_num_partitions is None: max_num_partitions = 1000 @@ -1079,13 +1286,26 @@ def match_samples_batch_init( num_samples_per_partition = 1 wd.num_samples_per_partition = num_samples_per_partition wd.num_partitions = math.ceil(len(sample_indexes) / num_samples_per_partition) - wd_path = work_dir / "wd.json" + wd_path = work_dir / "metadata.json" wd.save(wd_path) return wd def match_samples_batch_partition(work_dir, partition_index): - wd_path = pathlib.Path(work_dir) / "wd.json" + """ + match_samples_batch_partition(work_dir, partition_index) + + Match a single partition of samples in a batched sample matching job. See + :ref:`large scale inference` for more details. Match data + for the partition is written to `work_dir`. Uses a single thread to perform + matching. + + :param str work_dir: The working directory for the batch job, as written by + :meth:`match_samples_batch_init`. + :param int partition_index: The partition index to match. Must be less than + the number of partitions in the batch job metadata key `num_partitions`. + """ + wd_path = pathlib.Path(work_dir) / "metadata.json" wd = SampleBatchWorkDescriptor.load(wd_path) if partition_index >= wd.num_partitions or partition_index < 0: raise ValueError(f"Partition {partition_index} is out of range") @@ -1113,7 +1333,19 @@ def match_samples_batch_partition(work_dir, partition_index): def match_samples_batch_finalise(work_dir): - wd_path = os.path.join(work_dir, "wd.json") + """ + match_samples_batch_finalise(work_dir) + + Finalise a batched sample matching job. This method should be called after all + partitions have been matched by :meth:`match_samples_batch_partition`. Returns + the final tree sequence for the batch job. `work_dir` is retained and not deleted. + + :param str work_dir: The working directory for the batch job, as written by + :meth:`match_samples_batch_init`. + :return: The tree sequence representing the inferred history of the samples. + :rtype: tskit.TreeSequence + """ + wd_path = os.path.join(work_dir, "metadata.json") wd = SampleBatchWorkDescriptor.load(wd_path) variant_data, ancestor_ts, matcher = load_variant_data_and_ancestors_ts(wd) results = [] From d03946128c0427981a3c6f8d65f12029ab177c2c Mon Sep 17 00:00:00 2001 From: Ben Jeffery Date: Thu, 6 Feb 2025 13:47:15 +0000 Subject: [PATCH 02/10] Add example flow --- docs/_static/example_flow.svg | 1 + docs/large_scale.md | 40 +++++++++++++++++++++++++++++++++++ tsinfer/inference.py | 11 ---------- 3 files changed, 41 insertions(+), 11 deletions(-) create mode 100644 docs/_static/example_flow.svg diff --git a/docs/_static/example_flow.svg b/docs/_static/example_flow.svg new file mode 100644 index 00000000..90fdca29 --- /dev/null +++ b/docs/_static/example_flow.svg @@ -0,0 +1 @@ +

batch_init()

batch_groups(0, 5)

batch_group_partition(5, 0)

batch_group_partition(5, 1)

batch_group_finalise(5)

batch_groups(6, 7)

batch_groups(7, 8)

batch_group_partition(8, 0)

batch_group_partition(8, 1)

batch_group_partition(8, 2)

batch_group_finalise(8)

batch_finalise()

\ No newline at end of file diff --git a/docs/large_scale.md b/docs/large_scale.md index ee343d8c..16c76d52 100644 --- a/docs/large_scale.md +++ b/docs/large_scale.md @@ -119,6 +119,46 @@ Calls to {meth}`match_ancestors_batch_group_partition` will only use a single co Therefore this value and cluster resources requested should be scaled with the number of ancestors, which can be read from the metadata dictionary. +As an example of how the API methods can be used together, suppose the metadata dictionary +created by {meth}`match_ancestors_batch_init` contains the following: + +```python +{ + "ancestor_grouping": [ + {"ancestors": [0, ... 9], "partitions": None}, + { + "ancestors": [10, ... 15], + "partitions": [[10, 11, 12], [13, 14, 15]] + }, + {"ancestors": [16, ... 19], "partitions": None}, + {"ancestors": [20, ... 25], "partitions": None}, + {"ancestors": [26, ... 30], "partitions": None}, + { + "ancestors": [31, ... 41], + "partitions": [[31, 32, 33, 34, 35, 36], [37, 38, 39, 40, 41]] + }, + {"ancestors": [42, ... 45], "partitions": None}, + {"ancestors": [46, ... 50], "partitions": None}, + { + "ancestors": [51, ... 65], + "partitions": [ + [51, 52, 53, 54], + [55, 56, 57, 58], + [59, 60, 61, 62, 63, 64, 65] + ] + }, + ] +} +``` +Then the flow could look like the following diagram: (calls on the same horizontal line can be +done in parallel, note that method names are shortened): + +```{figure} _static/example_flow.svg +:width: 80% +``` + +Note that groups 1, 5 and 8 can be partitioned, but only groups 5 and 8 are actually partitioned in this example, as stated above partitioning for groups is optional. Groups 0-4 are matched in one call, groups 6 and 7 are matched in two calls, but +could have been matched in one. By splitting 6 and 7 the flow makes an additional resume point in the case of job failure at the cost of job start up and queueing time. ## Sample matching diff --git a/tsinfer/inference.py b/tsinfer/inference.py index b948d060..4d5e5864 100644 --- a/tsinfer/inference.py +++ b/tsinfer/inference.py @@ -1899,17 +1899,6 @@ def run(self): return self.ancestor_data -@dataclasses.dataclass -class StoredMatchData: - """ - A class to store the results of a matching run to disk, for later use. - """ - - group_id: str - num_sites: int - results: dict - - class Matcher: """ A matching instance, used in both ``tsinfer.match_ancestors`` and From 6c02df19f502be6ff5b9833ceb6f17a8b2818789 Mon Sep 17 00:00:00 2001 From: Ben Jeffery Date: Thu, 6 Feb 2025 13:53:48 +0000 Subject: [PATCH 03/10] Nits --- docs/large_scale.md | 11 ++++++----- tsinfer/inference.py | 4 ++-- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/docs/large_scale.md b/docs/large_scale.md index 16c76d52..eb4dca59 100644 --- a/docs/large_scale.md +++ b/docs/large_scale.md @@ -45,7 +45,8 @@ Note the `genotype_encoding` argument, setting this to {class}`tsinfer.GenotypeEncoding.ONE_BIT` reduces the memory footprint of the genotype array by a factor of 8, for a surprisingly small increase in runtime. With this encoding, the RAM needed is roughly -`num_sites * num_samples * ploidy / 8 bytes.` +`num_sites * num_samples * ploidy / 8 bytes.` However this encoding +only supports biallelic sites, with no missingness. ## Ancestor matching @@ -57,7 +58,7 @@ of a sample must be matched in an earlier group. For a typical human data set the number of samples per group varies from single digits up to approximately the number of samples. The plot below shows the number of ancestors matched in each group for a typical -human data set: +human data set, earlier groups are older ancestors: ```{figure} _static/ancestor_grouping.png :width: 80% @@ -103,9 +104,9 @@ the `working_dir`. Once all are complete a single call to {meth}`match_ancestors_batch_group_finalise` will then insert the matches and output the tree sequence to `work_dir`. -At anypoint the process can be resumed from the last successfully completed call to -{meth}`match_ancestors_batch_groups`. As the tree sequences in `work_dir` checkpoint the -progress. +Each call to {meth}`match_ancestors_batch_groups` and {meth}`match_ancestors_batch_group_finalise` results in a tree sequence being written to `work_dir`. +These tree sequences are essentially checkpoints from with the batch matching workflow +can be resumed on job failure. Finally after the final group, call {meth}`match_ancestors_batch_finalise` to combine the groups into a single tree sequence. diff --git a/tsinfer/inference.py b/tsinfer/inference.py index 4d5e5864..858a91fd 100644 --- a/tsinfer/inference.py +++ b/tsinfer/inference.py @@ -647,7 +647,7 @@ def match_ancestors_batch_init( :param int min_work_per_job: The minimum amount of work (as a count of genotypes) to allocate to a single parallel job. If the amount of work in a group of ancestors exceeds this level it will be broken up into parallel partitions, subject to - the constriant of `max_num_partitions`. + the constraint of `max_num_partitions`. :param int max_num_partitions: The maximum number of partitions to split a group of ancestors into. Useful for limiting the number of jobs in a workflow to avoid job overhead. Defaults to 1000. @@ -1189,7 +1189,7 @@ def match_samples_batch_init( :param int min_work_per_job: The minimum amount of work (as a count of genotypes) to allocate to a single parallel job. If the amount of work in a group of samples exceeds this level it will be broken up into parallel - partitions, subject to the constriant of `max_num_partitions`. + partitions, subject to the constraint of `max_num_partitions`. :param int max_num_partitions: The maximum number of partitions to split a group of samples into. Useful for limiting the number of jobs in a workflow to avoid job overhead. Defaults to 1000. From edcf3ff680c7f4fe7fae608e8a12555861e1470f Mon Sep 17 00:00:00 2001 From: Ben Jeffery Date: Tue, 11 Feb 2025 15:07:00 +0000 Subject: [PATCH 04/10] Add link --- docs/large_scale.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/large_scale.md b/docs/large_scale.md index eb4dca59..8428c2d6 100644 --- a/docs/large_scale.md +++ b/docs/large_scale.md @@ -21,7 +21,7 @@ kernelspec: tsinfer scales well and has been successfully used with datasets up to half a million samples. Here we detail considerations and tips for each step of the inference process to help you scale up your analysis. A snakemake pipeline -which implements this parallelisation scheme is available at https://github.com/benjeffery/tsinfer-snakemake. +which implements this parallelisation scheme is available as [tsinfer-snakemake](https://github.com/benjeffery/tsinfer-snakemake). (sec_large_scale_ancestor_generation)= From abc170fea87a4677220b710cc9205d04e417effd Mon Sep 17 00:00:00 2001 From: Ben Jeffery Date: Tue, 11 Feb 2025 15:07:13 +0000 Subject: [PATCH 05/10] Add tut todo --- docs/large_scale.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/large_scale.md b/docs/large_scale.md index 8428c2d6..2f58addb 100644 --- a/docs/large_scale.md +++ b/docs/large_scale.md @@ -32,6 +32,10 @@ format, read by the {class}`VariantData` class. [bio2zarr](https://github.com/sg is recommended for conversion from VCF. [sgkit](https://github.com/sgkit-dev/sgkit) can then be used to perform initial filtering. +:::{todo} +An upcoming tutorial will detail conversion from VCF to a VCF Zarr suitable for tsinfer. +::: + ## Ancestor generation From c6b5e1e53a95a260a33a767ae0bae65f215815cd Mon Sep 17 00:00:00 2001 From: Ben Jeffery Date: Tue, 11 Feb 2025 15:14:43 +0000 Subject: [PATCH 06/10] Add more context --- docs/large_scale.md | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/docs/large_scale.md b/docs/large_scale.md index 2f58addb..5586b310 100644 --- a/docs/large_scale.md +++ b/docs/large_scale.md @@ -18,10 +18,16 @@ kernelspec: # Large Scale Inference -tsinfer scales well and has been successfully used with datasets up to half a -million samples. Here we detail considerations and tips for each step of the +Generally, for up to a few thousand samples a single multi-core machine +can infer a tree seqeunce in a few days. However, tsinfer has been +successfully used with datasets up to half a million samples, where +ancestor and sample matching can take several CPU-years. +At this scale inference must be scaled across many machines. +tsinfer provides specific APIs to enable this. +Here we detail considerations and tips for each step of the inference process to help you scale up your analysis. A snakemake pipeline -which implements this parallelisation scheme is available as [tsinfer-snakemake](https://github.com/benjeffery/tsinfer-snakemake). +which implements this parallelisation scheme is available as +[tsinfer-snakemake](https://github.com/benjeffery/tsinfer-snakemake). (sec_large_scale_ancestor_generation)= From 587c110705e1a9ce060baef9d5d7a3e376b17bf8 Mon Sep 17 00:00:00 2001 From: Ben Jeffery Date: Tue, 11 Feb 2025 15:15:00 +0000 Subject: [PATCH 07/10] Explain out-of-core --- docs/large_scale.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/large_scale.md b/docs/large_scale.md index 5586b310..4b89958a 100644 --- a/docs/large_scale.md +++ b/docs/large_scale.md @@ -46,7 +46,8 @@ An upcoming tutorial will detail conversion from VCF to a VCF Zarr suitable for ## Ancestor generation Ancestor generation is generally the fastest step in inference and is not yet -parallelised out-of-core in tsinfer. However it scales well on machines with +parallelised out-of-core in tsinfer and must be performed on a single machine. +However it scales well on machines with many cores and hyperthreading via the `num_threads` argument to {meth}`generate_ancestors`. The limiting factor is often that the entire genotype array for the contig being inferred needs to fit in RAM. From 61292ce096cf02ad0a1741b287aa796d0d1a232e Mon Sep 17 00:00:00 2001 From: Ben Jeffery Date: Tue, 11 Feb 2025 15:17:16 +0000 Subject: [PATCH 08/10] Full stop --- docs/large_scale.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/large_scale.md b/docs/large_scale.md index 4b89958a..05ce4837 100644 --- a/docs/large_scale.md +++ b/docs/large_scale.md @@ -127,7 +127,7 @@ not matching to a large tree sequence it is often faster to not partition the fi groups, depending on job set up and queueing time on your cluster. Calls to {meth}`match_ancestors_batch_group_partition` will only use a single core, but -{meth}`match_ancestors_batch_groups` will use as many cores as `num_threads` is set to +{meth}`match_ancestors_batch_groups` will use as many cores as `num_threads` is set to. Therefore this value and cluster resources requested should be scaled with the number of ancestors, which can be read from the metadata dictionary. From 6a65530d6ed34c539a53bcc5c57ad63995039199 Mon Sep 17 00:00:00 2001 From: Ben Jeffery Date: Tue, 11 Feb 2025 15:19:03 +0000 Subject: [PATCH 09/10] List methods --- docs/large_scale.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/docs/large_scale.md b/docs/large_scale.md index 05ce4837..204304b3 100644 --- a/docs/large_scale.md +++ b/docs/large_scale.md @@ -78,6 +78,14 @@ human data set, earlier groups are older ancestors: There are five tsinfer API methods that can be used to parallelise ancestor matching. +The five methods are: + +1. {meth}`match_ancestors_batch_init` +2. {meth}`match_ancestors_batch_groups` +3. {meth}`match_ancestors_batch_group_partition` +4. {meth}`match_ancestors_batch_group_finalise` +5. {meth}`match_ancestors_batch_finalise` + Initially {meth}`match_ancestors_batch_init` should be called to set up the batch matching and to determine the groupings of ancestors. This method writes a file `metadata.json` to the `work_dir` that contains @@ -177,6 +185,11 @@ could have been matched in one. By splitting 6 and 7 the flow makes an additiona Sample matching is far simpler than ancestor matching as it is essentially the same as a single group of ancestors. There are three API methods that work together to enable distributed sample matching. + +1. {meth}`match_samples_batch_init` +2. {meth}`match_samples_batch_partition` +3. {meth}`match_samples_batch_finalise` + {meth}`match_samples_batch_init` should be called to set up the batch matching and to determine the groupings of samples. Similar to {meth}`match_ancestors_batch_init` is has a `min_work_per_job` and `max_num_partitions` arguments to control the level of parallelism. The method writes a file From d6725ce63d1b612389cc266946be6cd6e6942f46 Mon Sep 17 00:00:00 2001 From: Ben Jeffery Date: Tue, 11 Feb 2025 15:24:24 +0000 Subject: [PATCH 10/10] Reduce confusion? --- docs/large_scale.md | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/docs/large_scale.md b/docs/large_scale.md index 204304b3..1ae68d71 100644 --- a/docs/large_scale.md +++ b/docs/large_scale.md @@ -95,13 +95,17 @@ list of ancestors in that group (key:`ancestors`) and a proposed partioning of those ancestors into sets that can be matched in parallel (key:`partitions`). The dictionary is also returned by the method. The partitioning is controlled by the `min_work_per_job` and `max_num_partitions` -arguments. Ancestors are placed in a partition until the sum of their lengths exceeds -`min_work_per_job`, when a new partition is started. However, the number of partitions -is not allowed to exceed `max_num_partitions`. It is suggested to set `max_num_partitions` -to around 3-4x the number of worker nodes available, and `min_work_per_job` to around -2,000,000 for a typical human data set. +arguments. For each group, ancestors are placed in a partition until the sum of their +lengths exceeds `min_work_per_job`, when a new partition is started. However, the +number of partitions is not allowed to exceed `max_num_partitions`. It is suggested +to set `max_num_partitions` to around 3-4x the number of worker nodes available, +and `min_work_per_job` to around 2,000,000 for a typical human data set. -Each group is then matched in turn, either by calling {meth}`match_ancestors_batch_groups` +Groups vs partitions is a point of common confusion. Note that groups of ancestors +are matched serially, and each group is split into partitions that can be +matched in parallel. + +Each group is matched in turn, either by calling {meth}`match_ancestors_batch_groups` to match without partitioning, or by calling {meth}`match_ancestors_batch_group_partition` many times in parallel followed by a single call to {meth}`match_ancestors_batch_group_finalise`. Each call to {meth}`match_ancestors_batch_groups` or {meth}`match_ancestors_batch_group_finalise`