-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtest_input.s
1329 lines (1220 loc) · 82.8 KB
/
test_input.s
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// Copyright 2023 Sneller, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "textflag.h"
#include "funcdata.h"
#include "go_asm.h"
#include "../../../internal/asmutils/bc_imm_amd64.h"
#include "../../../internal/asmutils/bc_constant.h"
// -------------------------------------------
#define short_literal_stride 32
#define short_literal_register Y2
#define long_literal_stride 32
#define long_literal_register Y2
#define short_match_stride 32
#define short_match_register Y2
#define long_match_stride 32
#define long_match_register Y2
// -------------------------------------------
// COPY_SINGLE_ITEM(slot_id, lbl_litcpy, lbl_litcpy_completed, lbl_match_loop)
//
// Executes one decoded item: first the literal run, then the match copy.
//
// Inputs:
//   slot_id  dword lane (immediate for VPEXTRD) selecting the item in X15/X16/X17
//   X15      uint32 match offsets, X16 uint32 literal lengths, X17 uint32 match lengths
//   SI       Literals.Data cursor, DI dst.Data cursor
//   R9       lastOffs, kept as the negated match offset
// Outputs:
//   SI, DI advanced; R9 replaced by -offset when the item offset is non-zero
// Clobbers:
//   AX, BX, CX, the short_literal_register/short_match_register (Y2), EFLAGS
//
// The literal path always loads and stores a full short_literal_stride bytes
// and only afterwards checks the length; literals longer than the stride
// branch to lbl_litcpy, which is expected to jump back to lbl_litcpy_completed.
// The match loop copies min(32, -R9) bytes per iteration, so a single step
// never advances further than the match distance.
// NOTE(review): both paths touch up to 32 bytes past the logical end of the
// data; presumably the buffers carry enough slack - confirm with the callers.
// NOTE(review): the literal $-32 below mirrors short_match_stride; keep them in sync.
#define COPY_SINGLE_ITEM(slot_id, lbl_litcpy, lbl_litcpy_completed, lbl_match_loop) \
/* cycle 0 */ \
VMOVDQU8 (SI), short_literal_register /* SIMDREG2 := the first short_literal_stride bytes of the literal */ \
VPEXTRD $(slot_id), X15, AX /* AX := token[slot_id].offset */ \
\
/* cycle 1 */ \
VMOVDQU8 short_literal_register, (DI) /* Store the first short_literal_stride bytes of the literal payload */ \
VPEXTRD $(slot_id), X16, BX /* BX := token[slot_id].litlen */ \
NEGQ AX /* AX := -match_offset; ZF is set iff the offset is zero */ \
\
/* cycle 2 */ \
CMOVQNE AX, R9 /* lastOffs := -match_offset for non-zero offsets; a zero offset keeps the previous lastOffs */ \
ADDQ BX, DI /* Adjust the dst.Data cursor, optimistically assuming the copying is over */ \
ADDQ BX, SI /* Adjust the literals cursor, optimistically assuming the copying is over */ \
CMPL BX, $short_literal_stride /* Check if len(literals) > short_literal_stride */ \
JA lbl_litcpy /* Handle the long literal case */ \
\
lbl_litcpy_completed: \
/* cycle 3 */ \
MOVQ $-32, BX /* BX := -stride = -sizeof(ymm) */ \
VPEXTRD $(slot_id), X17, CX /* CX := token[slot_id].matchlen */ \
CMPQ R9, BX \
CMOVQGT R9, BX /* BX := max(-32, offs), i.e. the negated per-iteration step */ \
\
/* cycle 4 */ \
lbl_match_loop: \
VMOVDQU8 (DI)(R9*1), short_match_register /* SIMDREG2 := the first short_match_stride bytes of the match */ \
VMOVDQU8 short_match_register, (DI) /* Store the first short_match_stride bytes of the match payload */ \
SUBQ BX, DI /* dst offset += step (BX is negative) */ \
ADDQ BX, CX /* matchlen -= step */ \
JG lbl_match_loop /* continue while matchlen > 0 */ \
ADDQ CX, DI /* dst += matchlen (negative re-adjustment)*/ \
\
// -------------------------------------------
// COPY_SINGLE_ITEM_COMPLETERS(lbl_litcpy, lbl_litcpy_completed)
//
// Emits the out-of-line continuation for one COPY_SINGLE_ITEM expansion:
// lbl_litcpy is the target of the long-literal branch, and control returns
// to lbl_litcpy_completed once copySingleLongLiteral has run.
// NOTE(review): the register contract of copySingleLongLiteral is not visible
// here; presumably it finishes the literal copy using SI/DI/BX - confirm.
#define COPY_SINGLE_ITEM_COMPLETERS(lbl_litcpy, lbl_litcpy_completed) \
lbl_litcpy: \
CALL copySingleLongLiteral<>(SB) \
JMP lbl_litcpy_completed \
// -------------------------------------------
//
// func decompressIguanaAVX512VBMI2(dst []byte, streams *streamPack, lastOffs *int64) ([]byte, errorCode)
//
// Decodes the token stream described by streams and appends the decompressed
// bytes to dst (the write cursor starts at dst.Data + dst.Len).
//
// Arguments (FP offsets):
//   dst_base+0, dst_len+8, dst_cap+16   destination slice
//   streams+24                          pointer to the stream pack; the Tokens, Offsets16,
//                                       Offsets24, VarLitLen, VarMatchLen and Literals
//                                       slice headers are read from it
//   lastOffs+32                         in/out: loaded into R9 here, written back at exit
// Results:
//   ret_base+40, ret_len+48, ret_cap+56 dst with the updated length (base and cap unchanged)
//   ret1+64                             error code
//
// This entry sequence loads the stream cursors and materializes the byte
// constants used by the token decoder (Z22..Z25, Z27..Z31) from Z1 = all-ones.
//
// NOTE(review): the declared argument size is $0-40 although results are stored
// at 40..64(FP); go vet (asmdecl) normally expects arguments plus results - confirm.
// NOTE(review): no check against dst.Cap is visible in this function; presumably
// the caller guarantees sufficient capacity - confirm.
TEXT ·decompressIguanaAVX512VBMI2(SB), NOSPLIT | NOFRAME, $0-40
MOVQ streams+24(FP), BX
VPTERNLOGQ $0xff, Z1, Z1, Z1 // Z1 := {-1*}
VPXORQ Z0, Z0, Z0 // Z0 := {0*}
MOVQ (stream__size*const_stridTokens+stream_data + const_offsSliceHeaderData)(BX), R11 // R11 := Tokens.Data cursor
MOVQ (stream__size*const_stridTokens+stream_data + const_offsSliceHeaderLen)(BX), R10 // R10 := token_count
VPABSB Z1, Z2 // Z2 := uint8{0x01*}
MOVQ (stream__size*const_stridOffset16+stream_data + const_offsSliceHeaderData)(BX), R14 // R14 := Offsets16.Data
MOVQ (stream__size*const_stridOffset24+stream_data + const_offsSliceHeaderData)(BX), R15 // R15 := Offsets24.Data
VPSLLD $3, Z2, Z29 // Z29 := uint8{0x08*}
VPADDD Z2, Z2, Z27 // Z27 := uint8{0x02*}
MOVQ (stream__size*const_stridVarLitLen+stream_data + const_offsSliceHeaderData)(BX), R12 // R12 := VarLitLen.Data
MOVQ (stream__size*const_stridVarMatchLen+stream_data + const_offsSliceHeaderData)(BX), R13 // R13 := VarMatchLen.Data
VPSUBB Z27, Z1, Z25 // Z25 := uint8{0xfd*}
VPADDD Z29, Z29, Z30 // Z30 := uint8{0x10*}
VPADDD Z27, Z27, Z28 // Z28 := uint8{0x04*}
MOVQ (stream__size*const_stridLiterals+stream_data + const_offsSliceHeaderData)(BX), SI // SI := Literals.Data
MOVQ lastOffs+32(FP), R9 // R9 := &lastOffs
VMOVDQU8 CONST_GET_PTR(consts_uint24_expander_vbmi2, 0), Z21
VPADDD Z30, Z30, Z31 // Z31 := uint8{0x20*}
VPADDB Z1, Z29, Z24 // Z24 := uint8{0x07*}
MOVQ dst_base+0(FP), DI // DI := dst.Data cursor
MOVQ dst_len+8(FP), DX // DX := dst.Len
VPADDB Z1, Z30, Z23 // Z23 := uint8{0x0f*}
VPADDB Z1, Z31, Z22 // Z22 := uint8{0x1f*}
MOVQ dst_cap+16(FP), CX // CX := dst.Cap
MOVQ DI, ret_base+40(FP) // Set the result base address
ADDQ DX, DI // DI := Move to the dst.Data end as required by append mode
MOVQ CX, ret_cap+56(FP) // Set the result capacity
MOVQ (R9), R9 // R9 := lastOffs
predecoded_tokens_exhausted:
// Fetch the next batch of token bytes. A full batch of 64 tokens is loaded
// here; fewer than 64 remaining tokens are handled by fetch_last_tokens.
//
// R8 (the sequencer) is a bit queue consumed LSB-first by the SHRL $1, R8
// instructions further down. For a full batch the constant below encodes four
// rounds of "enter loop_4x, repeat it three more times, then arm the next 16
// predecoded tokens", with the final round ending in "no predecoded tokens
// left, fetch a new batch".
SUBL $64, R10 // token_count -= 64
JLT fetch_last_tokens
// There are still at least 64 tokens available
VMOVDQU8 (R11), Z2 // Z2 := uint8{token[i]} for i in 63..0
MOVL $0b1_0_0111_1_1_0111_1_1_0111_1_1_0111_1, R8
ADDQ $64, R11 // Move the tokens cursor to the next group of 64 tokens
tokens_fetched:
// Decode the fetched tokens
//
// R8 := sequencer
// R9 := lastOffs
// R10 := token_count
// Z0 := {0*}
// Z1 := {-1*}
// Z2 := uint8{token[i]} for i in 63..0
// Z22 := uint8{0x1f*}
// Z23 := uint8{0x0f*}
// Z24 := uint8{0x07*}
// Z25 := uint8{0xfd*}
// Z27 := uint8{0x02*}
// Z28 := uint8{0x04*}
// Z29 := uint8{0x08*}
// Z30 := uint8{0x10*}
// Z31 := uint8{0x20*}
VPMINUB Z31, Z2, Z3 // Z3 := uint8{32 for token[i] >= 32, token[i] otherwise} for i in 63..0
VPANDD Z24, Z2, Z19 // Z19 := uint8{token[i].LLL for token[i] >= 32, garbage otherwise} for i in 63..0
VPSUBB Z31, Z3, Z3 // Z3 := uint8{0 for token[i] >= 32, negative otherwise} for i in 63..0
VPSRLD $3, Z2, Z18 // Z18 := uint8{token[i].GGG_S_MMMM for token[i] >= 32, GGG_S_GGGG otherwise} for i in 63..0
VPMAXSB Z1, Z3, Z3 // Z3 := uint8{0 for token[i] >= 32, 0xff otherwise} for i in 63..0
VPANDD Z23, Z18, Z20 // Z20 := uint8{token[i].MMMM for token[i] >= 32, garbage otherwise} for i in 63..0
VPMINUB Z22, Z2, Z4 // Z4 := uint8{31 for token[i] >= 31, token[i] otherwise} for i in 63..0
VPANDND Z19, Z3, Z19 // Z19 := uint8{token[i].LLL for token[i] >= 32, 0 otherwise} for i in 63..0
VPSUBB Z22, Z4, Z4 // Z4 := uint8{0 for token[i] >= 31, negative otherwise} for i in 63..0
VPANDND Z20, Z3, Z20 // Z20 := uint8{token[i].MMMM for token[i] >= 32, 0 otherwise} for i in 63..0
VPMAXSB Z1, Z4, Z4 // Z4 := uint8{0 for token[i] >= 31, 0xff otherwise} for i in 63..0
VPSUBB Z24, Z19, Z5 // Z5 := uint8{0 for (token[i] >= 32) && (token[i].LLL == 7), negative otherwise} for i in 63..0
VPSUBB Z23, Z20, Z6 // Z6 := uint8{0 for (token[i] >= 32) && (token[i].MMMM == 15), negative otherwise} for i in 63..0
VPMAXSB Z1, Z5, Z5 // Z5 := uint8{0 for (token[i] >= 32) && (token[i].LLL == 7), 0xff otherwise} for i in 63..0
VPTERNLOGD $0b0000_0010, Z30, Z3, Z18 // Z18 := uint8{(flOffset16)_0000} for i in 63..0
VPMAXSB Z1, Z6, Z6 // Z6 := uint8{0 for (token[i] >= 32) && (token[i].MMMM == 15), 0xff otherwise} for i in 63..0
VPTERNLOGD $0b1101_1000, Z29, Z3, Z18 // Z18 := uint8{(flOffset16)_(flOffset24)_000} for i in 63..0
VPADDB Z30, Z2, Z2 // Z2 := uint8{token[i] + 16} for i in 63..0
VPTERNLOGD $0b0111_0010, Z27, Z5, Z18 // Z18 := uint8{(flOffset16)_(flOffset24)_0_(flVarLitLen)_0} for i in 63..0
VPTERNLOGD $0b0010_0010, Z3, Z4, Z5 // Z5 := uint8{0xff for token[i] == 31, 0 otherwise} for i in 63..0
VPTERNLOGD $0b0111_0010, Z28, Z6, Z18 // Z18 := uint8{(flOffset16)_(flOffset24)_(fmVarMatchLen)_(flVarLitLen)_0} for i in 63..0
VPTERNLOGD $0b1011_1000, Z2, Z3, Z20 // Z20 := uint8{set match_length[i] for token[i] < 32} for i in 63..0
VPTERNLOGD $0b1111_1000, Z28, Z5, Z18 // Z18 |= (Z28 & Z5): additionally set fmVarMatchLen for token[i] == 31, for i in 63..0
// Z18 := flags := uint8{(flOffset16)_(flOffset24)_(fmVarMatchLen)_(flVarLitLen)_0} for i in 63..0
// Z19 := literal length offsets
// Z20 := match length offsets
predecoded_tokens_available:
// Arm up to 16 predecoded tokens with parameters.
//
// The lowest 16 flag bytes of Z18 decide which tokens pull a value from the
// Offsets16, Offsets24 and VarLitLen streams. The Offsets16/Offsets24 cursors
// are advanced here by 2 and 3 bytes per requesting token respectively. When
// any of the first 16 VarLitLen bytes exceeds 0xfd, the slow path at
// decode_wide_varlitlen computes the values and the consumed byte count instead.
//
// DI := dst.Data cursor
// SI := Literals.Data cursor
// R8 := sequencer
// R9 := lastOffs
// R10 := token_count
// R11 := Tokens.Data cursor
// R12 := VarLitLen.Data
// R13 := VarMatchLen.Data cursor
// R14 := Offsets16.Data
// R15 := Offsets24.Data
// Z0 := {0*}
// Z1 := {-1*}
// Z18 := flags := uint8{(flOffset16)_(flOffset24)_(fmVarMatchLen)_(flVarLitLen)_0} for i in 63..0
// Z19 := literal length offsets
// Z20 := match length offsets
// Z21 := uint8{consts_uint24_expander_vbmi2}
// Z22 := uint8{0x1f*}
// Z23 := uint8{0x0f*}
// Z24 := uint8{0x07*}
// Z25 := uint8{0xfd*}
// Z27 := uint8{0x02*}
// Z28 := uint8{0x04*}
// Z29 := uint8{0x08*}
// Z30 := uint8{0x10*}
// Z31 := uint8{0x20*}
VPTESTMB X29, X18, K5 // K5 := {token[i] needs an Offset24 parameter} for i in 15..0
VMOVDQU8 (R12), Z2 // Z2 := uint8{VarLitLen.Data[i]} for i in 63..0
VPTESTMB X30, X18, K6 // K6 := {token[i] needs an Offset16 parameter} for i in 15..0
VMOVDQU8 (R13), Z3 // Z3 := uint8{VarMatchLen.Data[i]} for i in 63..0
VPCMPUB $VPCMP_IMM_LE, Z25, Z2, K2 // K2 := {VarLitLen.Data[i] <= 0xfd} for i in 63..0
VMOVDQU16 (R14), Y15 // Y15 := uint16{Offsets16.Data[(i*2+1)..(i*2)]} for i in 15..0
VPTESTMB X27, X18, K4 // K4 := {token[i] needs a VarLitLen parameter} for i in 15..0
KMOVW K5, AX // AX := {token[i] needs an Offset24 parameter}
VMOVDQU8 (R15), Z14 // Z14 := uint24{Offsets24.Data[(i*3+2)..(i*3)]}
VPMOVZXBD X2, Z4 // Z4 := uint32{VarLitLen.Data[i]} for i in 15..0
KMOVW K6, BX // BX := {token[i] needs an Offset16 parameter}
POPCNTL AX, AX // AX := the number of tokens requesting Offset24 parameters
VPMOVZXBD X19, Z16 // Z16 := uint32{litlen_token[i].offset} for i in 15..0
LEAQ (AX)(AX*2), AX // AX := AX * 3
POPCNTL BX, BX // BX := the number of tokens requesting Offset16 parameters
VPERMB Z14, Z21, Z14 // Z14 := uint32{Offsets24.Data[(i*3+2)..(i*3)] << 8} for i in 15..0
KORTESTW K2, K2 // EFLAGS.CF==0 <=> one of the first 16 VarLitLen.Data[i] values is greater than 253
LEAQ (R15)(AX*1), R15 // Skip the consumed bytes from the Offsets24 stream
VPMOVZXWD Y15, Z15 // Z15 := uint32{Offsets16.Data[(i*2+1)..(i*2)]} for i in 15..0
KMOVW K4, DX // DX := {token[i] needs a VarLitLen parameter} for i in 15..0
LEAQ (R14)(BX*2), R14 // Skip the consumed bytes from the Offsets16 stream
JCC decode_wide_varlitlen // CF still comes from KORTESTW: LEAQ, KMOVW and the vector ops leave EFLAGS untouched
// {VarLitLen.Data[i] <= 0xfd} for i in 15..0, so the bytes can be simply skipped by adding the number of consumed ones
POPCNTL DX, DX // DX := the number of consumed VarLitLen bytes
varlitlen_decoded:
// Join point of the fast and the wide VarLitLen paths. Distributes the
// VarLitLen values to the requesting token slots, forms the final literal
// lengths in Z16 and starts the same procedure for the VarMatchLen stream.
//
// DX := the number of consumed VarLitLen bytes
// DI := dst.Data cursor
// SI := Literals.Data cursor
// R8 := sequencer
// R9 := lastOffs
// R10 := token_count
// R11 := Tokens.Data cursor
// R12 := VarLitLen.Data
// R13 := VarMatchLen.Data cursor
// R14 := adjusted Offsets16.Data cursor
// R15 := adjusted Offsets24.Data cursor
// K4 := {token[i] needs a VarLitLen parameter} for i in 15..0
// K5 := {token[i] needs an Offset24 parameter} for i in 15..0
// K6 := {token[i] needs an Offset16 parameter} for i in 15..0
// Z0 := {0*}
// Z1 := {-1*}
// Z3 := uint8{VarMatchLen.Data[i]} for i in 63..0
// Z4 := uint32{VarLitLen.Data[i]} for i in 15..0
// Z14 := uint32{Offsets24.Data[(i*3+2)..(i*3)] << 8} for i in 15..0
// Z15 := uint32{Offsets16.Data[(i*2+1)..(i*2)]} for i in 15..0
// Z16 := uint32{litlen_token[i].offset} for i in 15..0
// Z18 := flags := uint8{(flOffset16)_(flOffset24)_(fmVarMatchLen)_(flVarLitLen)_0} for i in 63..0
// Z19 := literal length offsets
// Z20 := match length offsets
// Z21 := uint8{consts_uint24_expander_vbmi2}
// Z22 := uint8{0x1f*}
// Z23 := uint8{0x0f*}
// Z24 := uint8{0x07*}
// Z25 := uint8{0xfd*}
// Z27 := uint8{0x02*}
// Z28 := uint8{0x04*}
// Z29 := uint8{0x08*}
// Z30 := uint8{0x10*}
// Z31 := uint8{0x20*}
VPEXPANDD.Z Z4, K4, Z4 // Scatter the subsequent varlitlen values to the requesting token slots
ADDQ DX, R12 // Skip the bytes consumed from the VarLitLen stream
VPCMPUB $VPCMP_IMM_LE, Z25, Z3, K2 // K2 := {VarMatchLen.Data[i] <= 0xfd} for i in 63..0
VPSRLD $8, Z14, Z14 // Z14 := uint32{Offsets24.Data[(i*3+2)..(i*3)]} for i in 15..0
VPTESTMB X28, X18, K4 // K4 := {token[i] needs a VarMatchLen parameter} for i in 15..0
//p0
VPMOVZXBD X20, Z17 // Z17 := uint32{token[i].matchlen} for i in 15..0
VPADDD Z4, Z16, Z16 // Z16 := uint32{token[i].litlen} for i in 15..0
VPMOVZXBD X3, Z4 // Z4 := uint32{VarMatchLen.Data[i]} for i in 15..0
KORTESTW K2, K2 // EFLAGS.CF==0 <=> one of the first 16 VarMatchLen.Data[i] values is greater than 253
VALIGND $4, Z18, Z0, Z18 // Skip the first 16 predecoded token flags entries
KMOVW K4, DX // DX := {token[i] needs a VarMatchLen parameter} for i in 15..0
JCC decode_wide_varmatchlen // CF still comes from KORTESTW: VALIGND and KMOVW leave EFLAGS untouched
// {VarMatchLen.Data[i] <= 0xfd} for i in 15..0, so the bytes can be simply skipped by adding the number of consumed ones
POPCNTL DX, DX // DX := the number of consumed VarMatchLen bytes
varmatchlen_decoded:
// Join point of the fast and the wide VarMatchLen paths. Distributes the
// VarMatchLen and offset values to the requesting token slots, leaving the
// per-token offsets in Z15, literal lengths in Z16 and match lengths in Z17,
// then pops one sequencer bit to decide whether loop_4x is entered.
//
// DX := the number of consumed VarMatchLen bytes
// DI := dst.Data cursor
// SI := Literals.Data cursor
// R8 := sequencer
// R9 := lastOffs
// R10 := token_count
// R11 := Tokens.Data cursor
// R12 := adjusted VarLitLen.Data cursor
// R13 := VarMatchLen.Data cursor
// R14 := adjusted Offsets16.Data cursor
// R15 := adjusted Offsets24.Data cursor
// K4 := {token[i] needs a VarMatchLen parameter} for i in 15..0
// K5 := {token[i] needs an Offset24 parameter} for i in 15..0
// K6 := {token[i] needs an Offset16 parameter} for i in 15..0
// Z0 := {0*}
// Z1 := {-1*}
// Z4 := uint32{VarMatchLen.Data[i]} for i in 15..0
// Z14 := uint32{Offsets24.Data[(i*3+2)..(i*3)]} for i in 15..0
// Z15 := uint32{Offsets16.Data[(i*2+1)..(i*2)]} for i in 15..0
// Z16 := uint32{token[i].litlen} for i in 15..0
// Z17 := uint32{token[i].matchlen} for i in 15..0
// Z18 := adjusted flags := uint8{0 times 16, (flOffset16)_(flOffset24)_(fmVarMatchLen)_(flVarLitLen)_0} for i in 47..0
// Z19 := literal length offsets
// Z20 := match length offsets
// Z21 := uint8{consts_uint24_expander_vbmi2}
// Z22 := uint8{0x1f*}
// Z23 := uint8{0x0f*}
// Z24 := uint8{0x07*}
// Z25 := uint8{0xfd*}
// Z27 := uint8{0x02*}
// Z28 := uint8{0x04*}
// Z29 := uint8{0x08*}
// Z30 := uint8{0x10*}
// Z31 := uint8{0x20*}
VPEXPANDD.Z Z4, K4, Z4 // Scatter the subsequent varmatchlen values to the requesting token slots
ADDQ DX, R13 // Skip the bytes consumed from the VarMatchLen stream
VPEXPANDD.Z Z15, K6, Z15 // Scatter the subsequent Offset16 values to the requesting token slots
//p0
VPEXPANDD.Z Z14, K5, Z14 // Scatter the subsequent Offset24 values to the requesting token slots
SHRL $1, R8 // Sequencer: EFLAGS.CF==1 <=> loop_4x should be entered
VALIGND $4, Z19, Z0, Z19 // Skip the first 16 predecoded token literal length entries
VPADDD Z4, Z17, Z17 // Z17 := uint32{matchlen[i]} for i in 15..0
//p05
VALIGND $4, Z20, Z0, Z20 // Skip the first 16 predecoded token match length entries
VPORD Z14, Z15, Z15 // Z15 := uint32{Offset24 or Offset16 value for token[i]} for i in 15..0
JCC check_loop_1x // There remain 0..3 tokens to handle
// There are at least 4 opcodes available for processing
//
// Z15 contains token[i].offset for i in 15..0
// Z16 contains token[i].litlen for i in 15..0
// Z17 contains matchlen[i] for i in 15..0
//
// DI := dst.Data cursor
// SI := Literals.Data cursor
// R8 := sequencer
// R9 := lastOffs
// R10 := token_count
// R11 := adjusted Tokens.Data cursor
// R12 := adjusted VarLitLen.Data cursor
// R13 := adjusted VarMatchLen.Data cursor
// R14 := adjusted Offsets16.Data cursor
// R15 := adjusted Offsets24.Data cursor
// Z0 := {0*}
// Z1 := {-1*}
// Z15 := uint32{token[i].offset} for i in 15..0
// Z16 := uint32{token[i].litlen} for i in 15..0
// Z17 := uint32{token[i].matchlen} for i in 15..0
// Z18 := adjusted flags := uint8{0 times 16, (flOffset16)_(flOffset24)_(fmVarMatchLen)_(flVarLitLen)_0} for i in 47..0
// Z19 := adjusted literal length offsets
// Z20 := adjusted match length offsets
// Z21 := uint8{consts_uint24_expander_vbmi2}
// Z22 := uint8{0x1f*}
// Z23 := uint8{0x0f*}
// Z24 := uint8{0x07*}
// Z25 := uint8{0xfd*}
// Z27 := uint8{0x02*}
// Z28 := uint8{0x04*}
// Z29 := uint8{0x08*}
// Z30 := uint8{0x10*}
// Z31 := uint8{0x20*}
loop_4x:
// Execute the four items held in dword lanes 0..3 of Z15/Z16/Z17.
COPY_SINGLE_ITEM(0, copy_long_literal0, copy_long_literal0_completed, copy_match0_loop)
COPY_SINGLE_ITEM(1, copy_long_literal1, copy_long_literal1_completed, copy_match1_loop)
COPY_SINGLE_ITEM(2, copy_long_literal2, copy_long_literal2_completed, copy_match2_loop)
COPY_SINGLE_ITEM(3, copy_long_literal3, copy_long_literal3_completed, copy_match3_loop)
// Rewind the opcode queue
VALIGND $4, Z15, Z0, Z15 // Skip the first 4 entries, part 1
SHRL $1, R8 // Sequencer: EFLAGS.CF==1 <=> loop_4x should be repeated
VALIGND $4, Z16, Z0, Z16 // Skip the first 4 entries, part 2
VALIGND $4, Z17, Z0, Z17 // Skip the first 4 entries, part 3
JCS loop_4x // CF still comes from SHRL: VALIGND leaves EFLAGS untouched
SHRL $1, R8 // Sequencer: EFLAGS.CF==1 <=> predecoded tokens are present and should be armed with parameters
JCS predecoded_tokens_available
SHRL $1, R8 // Sequencer: EFLAGS.CF==1 <=> a new batch of tokens should be predecoded
JCS predecoded_tokens_exhausted
// There still are 0..3 tokens not processed by loop_4x
check_loop_1x:
// At this point R8 is used as a plain counter of the remaining items rather
// than as a bit queue.
// NOTE(review): this relies on the layout of the sequencer values (presumably
// the upper bits of the consts_composite_remainder entries) - confirm.
TESTL R8, R8
JZ no_more_tokens
loop_1x:
COPY_SINGLE_ITEM(0, copy_long_literal_f0, copy_long_literal_f0_completed, copy_f0_loop)
// Rewind the opcode queue
VALIGND $1, Z15, Z0, Z15 // Skip the first entry, part 1
VALIGND $1, Z16, Z0, Z16 // Skip the first entry, part 2
VALIGND $1, Z17, Z0, Z17 // Skip the first entry, part 3
SUBL $1, R8 // One item less to go
JNZ loop_1x
no_more_tokens:
// Epilogue: all tokens have been executed.
//
// Stores the current lastOffs (R9) back through the lastOffs pointer, appends
// the literal bytes not consumed by any token to the output and returns the
// updated slice length together with const_ecOK.
//
// DI := dst.Data cursor
// SI := Literals.Data cursor
// R9 := lastOffs
MOVQ streams+24(FP), BX
MOVQ lastOffs+32(FP), AX // AX := &lastOffs
MOVQ (stream__size*const_stridLiterals+stream_data + const_offsSliceHeaderData)(BX), DX // DX := Literals.Data
MOVQ (stream__size*const_stridLiterals+stream_data + const_offsSliceHeaderLen)(BX), CX // CX := Literals.Len
SUBQ SI, DX // DX := -consumed_literals_bytes
MOVQ R9, (AX) // Store the lastOffs value
ADDQ DX, CX // CX := the number of the remaining literals bytes
LEAQ (DI)(CX*1), DX // DX := just past the dst.Data
SUBQ dst_base+0(FP), DX // DX := the number of written bytes
// Append the remaining literals payload bytes
REP; MOVSB // TODO: is there a real need for being excessively smart here?
MOVQ DX, ret_len+48(FP)
MOVL $const_ecOK, ret1+64(FP)
// The decoder leaves the upper halves of Z0..Z15 (and Y2) dirty. Clear them
// before returning so that the legacy-SSE instructions emitted by the Go
// compiler do not pay AVX/SSE state transition penalties. No vector register
// is read after this point.
VZEROUPPER
RET
fetch_last_tokens:
// Fewer than 64 tokens remain. On entry R10 holds (remaining - 64), a value
// in -64..-1. Zero remaining tokens finish the decoding; otherwise the
// remaining token bytes are loaded under a byte mask (the other lanes are
// zeroed) and the matching sequencer is taken from consts_composite_remainder.
//
// R9 := lastOffs
// R10 := token_count
// Z0 := {0*}
// Z1 := {-1*}
// Z22 := uint8{0x1f*}
// Z23 := uint8{0x0f*}
// Z24 := uint8{0x07*}
// Z25 := uint8{0xfd*}
// Z27 := uint8{0x02*}
// Z28 := uint8{0x04*}
// Z29 := uint8{0x08*}
// Z30 := uint8{0x10*}
// Z31 := uint8{0x20*}
LEAL 64(R10), AX // AX := the number of remaining tokens (0..63)
LEAQ CONST_GET_PTR(consts_composite_remainder, 0), BX
MOVQ $-1, DX
CMPL R10, $-64
JLE no_more_tokens // No tokens left at all
MOVL (BX)(AX*4), R8 // R8 := sequencer for the (R10 & 0x3f) remaining tokens (63..1)
SHLXQ R10, DX, DX // DX := uint64{-1 << (R10 & 0x3f)}
NOTQ DX // DX := a mask with the lowest (R10 & 0x3f) bits set
KMOVQ DX, K1
VMOVDQU8.Z (R11), K1, Z2 // Z2 := uint8{token[i]} for i in K1 range
JMP tokens_fetched
decode_wide_varlitlen:
// Slow path taken when at least one of the first 16 VarLitLen bytes is greater
// than 0xfd. Per the comments below an item is then 1, 3 or 4 bytes long:
// a byte > 0xfd opens a multi-byte item and 0xff selects the 4-byte form.
// The marker bytes are squeezed out, the payload bytes are expanded into one
// dword per item and the result is corrected from base 256 to base 254.
// Returns to varlitlen_decoded with the values in Z4 and the number of
// consumed stream bytes in DX.
// Clobbers AX, BX, CX, DX, K1, Z2, Z5, Z6.
//
// DX := {token[i] needs a VarLitLen parameter} for i in 15..0
// DI := dst.Data cursor
// SI := Literals.Data cursor
// R8 := sequencer
// R9 := lastOffs
// R10 := token_count
// R11 := adjusted Tokens.Data cursor
// R12 := VarLitLen.Data
// R13 := VarMatchLen.Data cursor
// R14 := adjusted Offsets16.Data cursor
// R15 := adjusted Offsets24.Data cursor
// K2 := {VarLitLen.Data[i] <= 0xfd} for i in 63..0
// K4 := {token[i] needs a VarLitLen parameter} for i in 15..0
// K5 := {token[i] needs an Offset24 parameter} for i in 15..0
// K6 := {token[i] needs an Offset16 parameter} for i in 15..0
// Z0 := {0*}
// Z1 := {-1*}
// Z2 := uint8{VarLitLen.Data[i]} for i in 63..0
// Z3 := uint8{VarMatchLen.Data[i]} for i in 63..0
// Z14 := uint32{Offsets24.Data[(i*3+2)..(i*3)] << 8} for i in 15..0
// Z15 := uint32{Offsets16.Data[(i*2+1)..(i*2)]} for i in 15..0
// Z16 := uint32{litlen_token[i].offset} for i in 15..0
// Z18 := flags := uint8{(flOffset16)_(flOffset24)_(fmVarMatchLen)_(flVarLitLen)_0} for i in 63..0
// Z19 := literal length offsets
// Z20 := match length offsets
// Z21 := uint8{consts_uint24_expander_vbmi2}
// Z22 := uint8{0x1f*}
// Z23 := uint8{0x0f*}
// Z24 := uint8{0x07*}
// Z25 := uint8{0xfd*}
// Z27 := uint8{0x02*}
// Z28 := uint8{0x04*}
// Z29 := uint8{0x08*}
// Z30 := uint8{0x10*}
// Z31 := uint8{0x20*}
VPCMPUB $VPCMP_IMM_EQ, Z1, Z2, K1 // K1 := {VarLitLen.Data[i] == 0xff} for i in 63..0
KMOVQ K2, AX // AX := {VarLitLen.Data[i] <= 0xfd} for i in 63..0
VPCOMPRESSB.Z Z2, K2, Z2 // Z2 := uint8{the sequence of the payload bytes only}
MOVQ AX, BX // BX := {VarLitLen.Data[i] <= 0xfd} for i in 63..0
NOTQ AX // AX := {VarLitLen.Data[i] > 0xfd} for i in 63..0
POPCNTL DX, CX // CX := uint32{#tokens that need VarLitLen parameter}
LEAQ (AX)(AX*2), DX // DX := 3 * AX
LEAQ (DX)(AX*4), AX // AX := 7 * {VarLitLen.Data[i] > 0xfd}
KMOVQ K1, DX // DX := {VarLitLen.Data[i] == 0xff} for i in 63..0
LEAQ (AX)(DX*8), AX
XORQ BX, AX // AX := the PEXT selector; presumably it marks the first byte of every item - TODO confirm
PEXTQ AX, DX, DX // DX := uint64{(VarLitLen.Data[i] == 0xff) at the beginnings of item[i]} for i in 15..0
NOTQ BX // BX := {VarLitLen.Data[i] > 0xfd} for i in 63..0
PEXTQ AX, BX, BX // BX := uint64{(VarLitLen.Data[i] > 0xfd) at the beginnings of item[i]} for i in 15..0
MOVQ $0x1111_1111_1111_1111, AX
PDEPQ AX, DX, DX // Spread the per-item bits to one bit per nibble
PDEPQ AX, BX, BX // Spread the per-item bits to one bit per nibble
LEAQ (AX)(DX*4), DX
MOVQ $-1, AX
LEAQ (DX)(BX*2), DX // DX := uint4{0b0001 if len(item[i])== 1, 0b0011 if len(item[i])== 3, 0b0111 if len(item[i])== 4} for i in 15..0
SHLQ CX, AX // AX := uint64{-1 << CX}
KMOVQ DX, K1 // K1 := uint4{0b0001 if len(item[i])== 1, 0b0011 if len(item[i])== 3, 0b0111 if len(item[i])== 4} for i in 15..0
LEAQ (DX)(BX*8), DX // DX := uint4{0b0001 if len(item[i])== 1, 0b1011 if len(item[i])== 3, 0b1111 if len(item[i])== 4} for i in 15..0
LEAL (CX)(CX*2), CX // CX := uint32{CX*3}
VPEXPANDB.Z Z2, K1, Z4 // Z4 := uint32{misencoded varuint_256[i]} for i in 15..0
SHLQ CX, AX // AX := uint64{-1 << (4*initial_CX); but returns 0 for CX==16!}
VPSRLD $8, Z4, Z2 // Z2 := 256*a2 + a1
VPSRLD $16, Z4, Z5 // Z5 := a2
ANDNQ DX, AX, DX // Trim the length vector to contain the requested lengths only
VPADDD Z2, Z2, Z2 // Z2 := 512*a2 + 2*a1
VPSLLD $9, Z5, Z6 // Z6 := 512*a2
POPCNTQ DX, DX // DX := the number of consumed VarLitLen bytes
VPSUBD Z2, Z4, Z4 // Z4 := varuint_256 - 512*a2 - 2*a1
VPSLLD $2, Z5, Z2 // Z2 := 4*a2
VPSUBD Z6, Z4, Z4 // Z4 := varuint_256 - 1024*a2 - 2*a1
VPADDD Z2, Z4, Z4 // Z4 := uint32{corrected varuint_254[i]} for i in 15..0
JMP varlitlen_decoded
decode_wide_varmatchlen:
// Slow path taken when at least one of the first 16 VarMatchLen bytes is
// greater than 0xfd. Mirrors decode_wide_varlitlen, but reads the raw bytes
// from Z3 instead of Z2. Returns to varmatchlen_decoded with the values in Z4
// and the number of consumed stream bytes in DX.
// Clobbers AX, BX, CX, DX, K1, Z2, Z5, Z6.
//
// DX := {token[i] needs a VarMatchLen parameter} for i in 15..0
// DI := dst.Data cursor
// SI := Literals.Data cursor
// R8 := sequencer
// R9 := lastOffs
// R10 := token_count
// R11 := adjusted Tokens.Data cursor
// R12 := adjusted VarLitLen.Data cursor
// R13 := VarMatchLen.Data cursor
// R14 := adjusted Offsets16.Data cursor
// R15 := adjusted Offsets24.Data cursor
// K2 := {VarMatchLen.Data[i] <= 0xfd} for i in 63..0
// K4 := {token[i] needs a VarMatchLen parameter} for i in 15..0
// K5 := {token[i] needs an Offset24 parameter} for i in 15..0
// K6 := {token[i] needs an Offset16 parameter} for i in 15..0
// Z0 := {0*}
// Z1 := {-1*}
// Z3 := uint8{VarMatchLen.Data[i]} for i in 63..0
// Z14 := uint32{Offsets24.Data[(i*3+2)..(i*3)]} for i in 15..0
// Z15 := uint32{Offsets16.Data[(i*2+1)..(i*2)]} for i in 15..0
// Z16 := uint32{token[i].litlen} for i in 15..0
// Z17 := uint32{token[i].matchlen} for i in 15..0
// Z18 := adjusted flags := uint8{0 times 16, (flOffset16)_(flOffset24)_(fmVarMatchLen)_(flVarLitLen)_0} for i in 47..0
// Z19 := literal length offsets
// Z20 := match length offsets
// Z21 := uint8{consts_uint24_expander_vbmi2}
// Z22 := uint8{0x1f*}
// Z23 := uint8{0x0f*}
// Z24 := uint8{0x07*}
// Z25 := uint8{0xfd*}
// Z27 := uint8{0x02*}
// Z28 := uint8{0x04*}
// Z29 := uint8{0x08*}
// Z30 := uint8{0x10*}
// Z31 := uint8{0x20*}
VPCMPUB $VPCMP_IMM_EQ, Z1, Z3, K1 // K1 := {VarMatchLen.Data[i] == 0xff} for i in 63..0
KMOVQ K2, AX // AX := {VarMatchLen.Data[i] <= 0xfd} for i in 63..0
VPCOMPRESSB.Z Z3, K2, Z2 // Z2 := uint8{the sequence of the payload bytes only}
MOVQ AX, BX // BX := {VarMatchLen.Data[i] <= 0xfd} for i in 63..0
NOTQ AX // AX := {VarMatchLen.Data[i] > 0xfd} for i in 63..0
POPCNTL DX, CX // CX := uint32{#tokens that need VarMatchLen parameter}
LEAQ (AX)(AX*2), DX // DX := 3 * AX
LEAQ (DX)(AX*4), AX // AX := 7 * {VarMatchLen.Data[i] > 0xfd}
KMOVQ K1, DX // DX := {VarMatchLen.Data[i] == 0xff} for i in 63..0
LEAQ (AX)(DX*8), AX
XORQ BX, AX // AX := the PEXT selector; presumably it marks the first byte of every item - TODO confirm
PEXTQ AX, DX, DX // DX := uint64{(VarMatchLen.Data[i] == 0xff) at the beginnings of item[i]} for i in 15..0
NOTQ BX // BX := {VarMatchLen.Data[i] > 0xfd} for i in 63..0
PEXTQ AX, BX, BX // BX := uint64{(VarMatchLen.Data[i] > 0xfd) at the beginnings of item[i]} for i in 15..0
MOVQ $0x1111_1111_1111_1111, AX
PDEPQ AX, DX, DX // Spread the per-item bits to one bit per nibble
PDEPQ AX, BX, BX // Spread the per-item bits to one bit per nibble
LEAQ (AX)(DX*4), DX
MOVQ $-1, AX
LEAQ (DX)(BX*2), DX // DX := uint4{0b0001 if len(item[i])== 1, 0b0011 if len(item[i])== 3, 0b0111 if len(item[i])== 4} for i in 15..0
SHLQ CX, AX // AX := uint64{-1 << CX}
KMOVQ DX, K1 // K1 := uint4{0b0001 if len(item[i])== 1, 0b0011 if len(item[i])== 3, 0b0111 if len(item[i])== 4} for i in 15..0
LEAQ (DX)(BX*8), DX // DX := uint4{0b0001 if len(item[i])== 1, 0b1011 if len(item[i])== 3, 0b1111 if len(item[i])== 4} for i in 15..0
LEAL (CX)(CX*2), CX // CX := uint32{CX*3}
VPEXPANDB.Z Z2, K1, Z4 // Z4 := uint32{misencoded varuint_256[i]} for i in 15..0
SHLQ CX, AX // AX := uint64{-1 << (4*initial_CX); but returns 0 for CX==16!}
VPSRLD $8, Z4, Z2 // Z2 := 256*a2 + a1
VPSRLD $16, Z4, Z5 // Z5 := a2
ANDNQ DX, AX, DX // Trim the length vector to contain the requested lengths only
VPADDD Z2, Z2, Z2 // Z2 := 512*a2 + 2*a1
VPSLLD $9, Z5, Z6 // Z6 := 512*a2
POPCNTQ DX, DX // DX := the number of consumed VarMatchLen bytes
VPSUBD Z2, Z4, Z4 // Z4 := varuint_256 - 512*a2 - 2*a1
VPSLLD $2, Z5, Z2 // Z2 := 4*a2
VPSUBD Z6, Z4, Z4 // Z4 := varuint_256 - 1024*a2 - 2*a1
VPADDD Z2, Z4, Z4 // Z4 := uint32{corrected varuint_254[i]} for i in 15..0
JMP varmatchlen_decoded
// Copy completers
//
// Out-of-line long-literal continuations, one per COPY_SINGLE_ITEM expansion
// above (four for loop_4x, one for loop_1x). They are placed after the
// unconditional JMP that ends decode_wide_varmatchlen, so they are reached
// only through the JA branches inside the COPY_SINGLE_ITEM expansions.
COPY_SINGLE_ITEM_COMPLETERS(copy_long_literal0, copy_long_literal0_completed)
COPY_SINGLE_ITEM_COMPLETERS(copy_long_literal1, copy_long_literal1_completed)
COPY_SINGLE_ITEM_COMPLETERS(copy_long_literal2, copy_long_literal2_completed)
COPY_SINGLE_ITEM_COMPLETERS(copy_long_literal3, copy_long_literal3_completed)
COPY_SINGLE_ITEM_COMPLETERS(copy_long_literal_f0, copy_long_literal_f0_completed)
// -------------------------------------------
//
// func decompressIguanaAVX512Generic(dst []byte, streams *streamPack, lastOffs *int64) ([]byte, errorCode)
TEXT ·decompressIguanaAVX512Generic(SB), NOSPLIT | NOFRAME, $0-40
MOVQ streams+24(FP), BX
VPTERNLOGQ $0xff, Z1, Z1, Z1 // Z1 := {-1*}
VPXORQ Z0, Z0, Z0 // Z0 := {0*}
MOVQ (stream__size*const_stridTokens+stream_data + const_offsSliceHeaderData)(BX), R11 // R11 := Tokens.Data cursor
MOVQ (stream__size*const_stridTokens+stream_data + const_offsSliceHeaderLen)(BX), R10 // R10 := token_count
VPABSB Z1, Z2 // Z2 := uint8{0x01*}
MOVQ (stream__size*const_stridOffset16+stream_data + const_offsSliceHeaderData)(BX), R14 // R14 := Offsets16.Data
MOVQ (stream__size*const_stridOffset24+stream_data + const_offsSliceHeaderData)(BX), R15 // R15 := Offsets24.Data
VPSLLD $3, Z2, Z29 // Z29 := uint8{0x08*}
VPADDD Z2, Z2, Z27 // Z27 := uint8{0x02*}
MOVQ (stream__size*const_stridVarLitLen+stream_data + const_offsSliceHeaderData)(BX), R12 // R12 := VarLitLen.Data cursor
MOVQ (stream__size*const_stridVarMatchLen+stream_data + const_offsSliceHeaderData)(BX), R13 // R13 := VarMatchLen.Data
VPSUBB Z27, Z1, Z25 // Z25 := uint8{0xfd*}
VPADDD Z29, Z29, Z30 // Z30 := uint8{0x10*}
VPADDD Z27, Z27, Z28 // Z28 := uint8{0x04*}
MOVQ (stream__size*const_stridLiterals+stream_data + const_offsSliceHeaderData)(BX), SI // SI := Literals.Data
MOVQ lastOffs+32(FP), R9 // R9 := &lastOffs
VMOVDQU8 CONST_GET_PTR(consts_uint24_expander_generic, 0), Z21
VPADDD Z30, Z30, Z31 // Z31 := uint8{0x20*}
VPADDB Z1, Z29, Z24 // Z24 := uint8{0x07*}
MOVQ dst_base+0(FP), DI // DI := dst.Data cursor
MOVQ dst_len+8(FP), DX // DX := dst.Len
VPADDB Z1, Z30, Z23 // Z23 := uint8{0x0f*}
VPADDB Z1, Z31, Z22 // Z22 := uint8{0x1f*}
MOVQ dst_cap+16(FP), CX // CX := dst.Cap
MOVQ DI, ret_base+40(FP) // Set the result base address
ADDQ DX, DI // DI := Move to the dst.Data end as required by append mode
MOVQ CX, ret_cap+56(FP) // Set the result capacity
MOVQ (R9), R9 // R9 := lastOffs
predecoded_tokens_exhausted:
SUBL $64, R10 // token_count -= 64
JLT fetch_last_tokens
// There are still at least 64 tokens available
VMOVDQU8 (R11), Z2 // Z2 := uint8{token[i]} for i in 63..0
MOVL $0b1_0_0111_1_1_0111_1_1_0111_1_1_0111_1, R8
ADDQ $64, R11 // Move the tokens cursor to the next group of 64 tokens
tokens_fetched:
// Decode the fetched tokens
//
// R8 := sequencer
// R9 := lastOffs
// R10 := token_count
// Z0 := {0*}
// Z1 := {-1*}
// Z2 := uint8{token[i]} for i in 63..0
// Z22 := uint8{0x1f*}
// Z23 := uint8{0x0f*}
// Z24 := uint8{0x07*}
// Z25 := uint8{0xfd*}
// Z27 := uint8{0x02*}
// Z28 := uint8{0x04*}
// Z29 := uint8{0x08*}
// Z30 := uint8{0x10*}
// Z31 := uint8{0x20*}
VPMINUB Z31, Z2, Z3 // Z3 := uint8{32 for token[i] >= 32, token[i] otherwise} for i in 63..0
VPANDD Z24, Z2, Z19 // Z19 := uint8{token[i].LLL for token[i] >= 32, garbage otherwise} for i in 63..0
VPSUBB Z31, Z3, Z3 // Z3 := uint8{0 for token[i] >= 32, negative otherwise} for i in 63..0
VPSRLD $3, Z2, Z18 // Z18 := uint8{token[i].GGG_S_MMMM for token[i] >= 32, GGG_S_GGGG otherwise} for i in 63..0
VPMAXSB Z1, Z3, Z3 // Z3 := uint8{0 for token[i] >= 32, 0xff otherwise} for i in 63..0
VPANDD Z23, Z18, Z20 // Z20 := uint8{token[i].MMMM for token[i] >= 32, garbage otherwise} for i in 63..0
VPMINUB Z22, Z2, Z4 // Z4 := uint8{31 for token[i] >= 31, token[i] otherwise} for i in 63..0
VPANDND Z19, Z3, Z19 // Z19 := uint8{token[i].LLL for token[i] >= 32, 0 otherwise} for i in 63..0
VPSUBB Z22, Z4, Z4 // Z4 := uint8{0 for token[i] >= 31, negative otherwise} for i in 63..0
VPANDND Z20, Z3, Z20 // Z20 := uint8{token[i].MMMM for token[i] >= 32, 0 otherwise} for i in 63..0
VPMAXSB Z1, Z4, Z4 // Z4 := uint8{0 for token[i] >= 31, 0xff otherwise} for i in 63..0
VPSUBB Z24, Z19, Z5 // Z5 := uint8{0 for (token[i] >= 32) && (token[i].LLL == 7), negative otherwise} for i in 63..0
VPSUBB Z23, Z20, Z6 // Z6 := uint8{0 for (token[i] >= 32) && (token[i].MMMM == 15), negative otherwise} for i in 63..0
VPMAXSB Z1, Z5, Z5 // Z5 := uint8{0 for (token[i] >= 32) && (token[i].LLL == 7), 0xff otherwise} for i in 63..0
VPTERNLOGD $0b0000_0010, Z30, Z3, Z18 // Z18 := uint8{(flOffset16)_0000} for i in 63..0
VPMAXSB Z1, Z6, Z6 // Z6 := uint8{0 for (token[i] >= 32) && (token[i].MMMM == 15), 0xff otherwise} for i in 63..0
VPTERNLOGD $0b1101_1000, Z29, Z3, Z18 // Z18 := uint8{(flOffset16)_(flOffset24)_000} for i in 63..0
VPADDB Z30, Z2, Z2 // Z2 := uint8{token[i] + 16} for i in 63..0
VPTERNLOGD $0b0111_0010, Z27, Z5, Z18 // Z18 := uint8{(flOffset16)_(flOffset24)_0_(flVarLitLen)_0} for i in 63..0
VPTERNLOGD $0b0010_0010, Z3, Z4, Z5 // Z5 := uint8{0xff for token[i] == 31, 0 otherwise} for i in 63..0
VPTERNLOGD $0b0111_0010, Z28, Z6, Z18 // Z18 := uint8{(flOffset16)_(flOffset24)_(fmVarMatchLen)_(flVarLitLen)_0} for i in 63..0
VPTERNLOGD $0b1011_1000, Z2, Z3, Z20 // Z20 := uint8{set match_length[i] for token[i] < 32} for i in 63..0
VPTERNLOGD $0b1111_1000, Z28, Z5, Z18 // Z18 := uint8{(flOffset16)_(flOffset24)_(fmVarMatchLen)_(flVarLitLen)_0} for i in 63..0
// Z18 := flags := uint8{(flOffset16)_(flOffset24)_(fmVarMatchLen)_(flVarLitLen)_0} for i in 63..0
// Z19 := literal length offsets
// Z20 := match length offsets
predecoded_tokens_available:
// Arm up to 16 predecoded tokens with parameters.
//
// DI := dst.Data cursor
// SI := Literals.Data cursor
// R8 := sequencer
// R9 := lastOffs
// R10 := token_count
// R11 := Tokens.Data cursor
// R12 := VarLitLen.Data cursor
// R13 := VarMatchLen.Data cursor
// R14 := Offsets16.Data
// R15 := Offsets24.Data
// Z0 := {0*}
// Z1 := {-1*}
// Z18 := flags := uint8{(flOffset16)_(flOffset24)_(fmVarMatchLen)_(flVarLitLen)_0} for i in 63..0
// Z19 := literal length offsets
// Z20 := match length offsets
// Z21 := uint8{consts_uint24_expander_generic}
// Z22 := uint8{0x1f*}
// Z23 := uint8{0x0f*}
// Z24 := uint8{0x07*}
// Z25 := uint8{0xfd*}
// Z27 := uint8{0x02*}
// Z28 := uint8{0x04*}
// Z29 := uint8{0x08*}
// Z30 := uint8{0x10*}
// Z31 := uint8{0x20*}
VPTESTMB X29, X18, K5 // K5 := {token[i] needs an Offset24 parameter} for i in 15..0
VMOVDQU8 (R12), Z2 // Z2 := uint8{VarLitLen.Data[i]} for i in 63..0
VPTESTMB X30, X18, K6 // K6 := {token[i] needs an Offset16 parameter} for i in 15..0
VMOVDQU8 (R13), Z3 // Z3 := uint8{VarMatchLen.Data[i]} for i in 63..0
VPCMPUB $VPCMP_IMM_LE, Z25, Z2, K2 // K2 := {VarLitLen.Data[i] <= 0xfd} for i in 63..0
VMOVDQU16 (R14), Y15 // Y15 := uint16{Offsets16.Data[(i*2+1)..(i*2)} for i in 15..0
VPTESTMB X27, X18, K4 // K4 := {token[i] needs a VarLitLen parameter} for i in 15..0
KMOVW K5, AX // AX := {token[i] needs an Offset24 parameter}
VMOVDQU8 (R15), Z14 // Z14 := uint24{Offsets24.Data[(i*3+2)..(i*3)]}
VPMOVZXBD X2, Z4 // Z4 := uint32{VarLitLen.Data[i]} for i in 15..0
KMOVW K6, BX // BX := {token[i] needs an Offset16 parameter}
POPCNTL AX, AX // AX := the number of tokens requesting Offset24 parameters
VPMOVZXBD X19, Z16 // Z16 := uint32{litlen_token[i].offset} for i in 15..0
LEAQ (AX)(AX*2), AX // AX := AX * 3
POPCNTL BX, BX // BX := the number of tokens requesting Offset16 parameters
VPERMD Z14, Z21, Z14 // Z14 := uint32{[11, 11, 10, 9 | 8, 8, 7, 6 | 5, 5, 4, 3 | 2, 2, 1, 0]}
KORTESTW K2, K2 // EFLAGS.CF==0 <=> there are VarLitLen.Data[i] values greater than 253
LEAQ (R15)(AX*1), R15 // Skip the consumed bytes from the Offsets24 stream
VPMOVZXWD Y15, Z15 // Z15 := uint32{Offsets16.Data[(i*2+1)..(i*2)} for i in 15..0
KMOVW K4, DX // DX := {token[i] needs a VarLitLen parameter} for i in 15..0
LEAQ (R14)(BX*2), R14 // Skip the consumed bytes from the Offsets16 stream
VPSHUFB Z21, Z14, Z14 // Z14 := uint32{Offsets24.Data[(i*3+2)..(i*3)] << 8} for i in 15..0
JCC decode_wide_varlitlen
// {VarLitLen.Data[i] <= 0xfd} for i in 15..0, so the bytes can be simply skipped by adding the number of consumed ones
POPCNTL DX, DX // DX := the number of consumed VarLitLen bytes
varlitlen_decoded:
// DX := the number of consumed VarLitLen bytes
// DI := dst.Data cursor
// SI := Literals.Data cursor
// R8 := sequencer
// R9 := lastOffs
// R10 := token_count
// R11 := Tokens.Data cursor
// R12 := VarLitLen.Data cursor
// R13 := VarMatchLen.Data cursor
// R14 := adjusted Offsets16.Data cursor
// R15 := adjusted Offsets24.Data cursor
// K4 := {token[i] needs a VarLitLen parameter} for i in 15..0
// K5 := {token[i] needs an Offset24 parameter} for i in 15..0
// K6 := {token[i] needs an Offset16 parameter} for i in 15..0
// Z0 := {0*}
// Z1 := {-1*}
// Z3 := uint8{VarMatchLen.Data[i]} for i in 63..0
// Z4 := uint32{VarLitLen.Data[i]} for i in 15..0
// Z14 := uint32{Offsets24.Data[(i*3+2)..(i*3)] << 8} for i in 15..0
// Z15 := uint32{Offsets16.Data[(i*2+1)..(i*2)]} for i in 15..0
// Z16 := uint32{litlen_token[i].offset} for i in 15..0
// Z18 := flags := uint8{(flOffset16)_(flOffset24)_(fmVarMatchLen)_(flVarLitLen)_0} for i in 63..0
// Z19 := literal length offsets
// Z20 := match length offsets
// Z21 := uint8{consts_uint24_expander_generic}
// Z22 := uint8{0x1f*}
// Z23 := uint8{0x0f*}
// Z24 := uint8{0x07*}
// Z25 := uint8{0xfd*}
// Z27 := uint8{0x02*}
// Z28 := uint8{0x04*}
// Z29 := uint8{0x08*}
// Z30 := uint8{0x10*}
// Z31 := uint8{0x20*}
VPEXPANDD.Z Z4, K4, Z4 // Scatter the subsequent varlitlen values to the requesting token slots
ADDQ DX, R12 // Skip the bytes consumed from the VarLitLen stream
VPCMPUB $VPCMP_IMM_LE, Z25, Z3, K2 // K2 := {VarMatchLen.Data[i] <= 0xfd} for i in 63..0
VPSRLD $8, Z14, Z14 // Z14 := uint32{Offsets24.Data[(i*3+2)..(i*3)]} for i in 15..0
VPTESTMB X28, X18, K4 // K4 := {token[i] needs a VarMatchLen parameter} for i in 15..0
//p0
VPMOVZXBD X20, Z17 // Z17 := uint32{token[i].matchlen} for i in 15..0
VPADDD Z4, Z16, Z16 // Z16 := uint32{token[i].litlen} for i in 15..0
VPMOVZXBD X3, Z4 // Z4 := uint32{VarMatchLen.Data[i]} for i in 15..0
KORTESTW K2, K2 // EFLAGS.CF==0 <=> there are VarMatchLen.Data[i] values greater than 253
VALIGND $4, Z18, Z0, Z18 // Skip the first 16 predecoded token flags entries
KMOVW K4, DX // DX := {token[i] needs a VarMatchLen parameter} for i in 15..0
JCC decode_wide_varmatchlen
// {VarMatchLen.Data[i] <= 0xfd} for i in 15..0, so the bytes can be simply skipped by adding the number of consumed ones
POPCNTL DX, DX // DX := the number of consumed VarMatchLen bytes
varmatchlen_decoded:
// DX := the number of consumed VarMatchLen bytes
// DI := dst.Data cursor
// SI := Literals.Data cursor
// R8 := sequencer
// R9 := lastOffs
// R10 := token_count
// R11 := Tokens.Data cursor
// R12 := adjusted VarLitLen.Data cursor
// R13 := VarMatchLen.Data cursor
// R14 := adjusted Offsets16.Data cursor
// R15 := adjusted Offsets24.Data cursor
// K4 := {token[i] needs a VarMatchLen parameter} for i in 15..0
// K5 := {token[i] needs an Offset24 parameter} for i in 15..0
// K6 := {token[i] needs an Offset16 parameter} for i in 15..0
// Z0 := {0*}
// Z1 := {-1*}
// Z4 := uint32{VarMatchLen.Data[i]} for i in 15..0
// Z14 := uint32{Offsets24.Data[(i*3+2)..(i*3)]} for i in 15..0
// Z15 := uint32{Offsets16.Data[(i*2+1)..(i*2)]} for i in 15..0
// Z16 := uint32{token[i].litlen} for i in 15..0
// Z17 := uint32{token[i].matchlen} for i in 15..0
// Z18 := adjusted flags := uint8{0 times 16, (flOffset16)_(flOffset24)_(fmVarMatchLen)_(flVarLitLen)_0} for i in 47..0
// Z19 := literal length offsets
// Z20 := match length offsets
// Z21 := uint8{consts_uint24_expander_generic}
// Z22 := uint8{0x1f*}
// Z23 := uint8{0x0f*}
// Z24 := uint8{0x07*}
// Z25 := uint8{0xfd*}
// Z27 := uint8{0x02*}
// Z28 := uint8{0x04*}
// Z29 := uint8{0x08*}
// Z30 := uint8{0x10*}
// Z31 := uint8{0x20*}
VPEXPANDD.Z Z4, K4, Z4 // Scatter the subsequent varmatchlen values to the requesting token slots
ADDQ DX, R13 // Skip the bytes consumed from the VarMatchLen stream
VPEXPANDD.Z Z15, K6, Z15 // Scatter the subsequent Offset16 values to the requesting token slots
//p0
VPEXPANDD.Z Z14, K5, Z14 // Scatter the subsequent Offset24 values to the requesting token slots
SHRL $1, R8 // Sequencer: EFLAGS.CF==1 <=> loop_4x should be entered
VALIGND $4, Z19, Z0, Z19 // Skip the first 16 predecoded token literal length entries
VPADDD Z4, Z17, Z17 // Z17 := uint32{matchlen[i]} for i in 15..0
//p05
VALIGND $4, Z20, Z0, Z20 // Skip the first 16 predecoded token match length entries
VPORD Z14, Z15, Z15 // Z15 := uint32{Offset24 or Offset16 value for token[i]} for i in 15..0
JCC check_loop_1x // There remain 0..3 tokens to handle
// There are at least 4 opcodes available for processing
//
// Z16 contains token[i].litlen for i in 15..0
// Z17 contains matchlen[i] for i in 15..0
// Z15 contains token[i].offset for i in 15..0
//
// DI := dst.Data cursor
// SI := Literals.Data cursor
// R8 := sequencer
// R9 := lastOffs
// R10 := token_count
// R11 := adjusted Tokens.Data cursor
// R12 := adjusted VarLitLen.Data cursor
// R13 := adjusted VarMatchLen.Data cursor
// R14 := adjusted Offsets16.Data cursor
// R15 := adjusted Offsets24.Data cursor
// Z0 := {0*}
// Z1 := {-1*}
// Z15 := uint32{token[i].offset} for i in 15..0
// Z16 := uint32{token[i].litlen} for i in 15..0
// Z17 := uint32{token[i].matchlen} for i in 15..0
// Z18 := adjusted flags := uint8{0 times 16, (flOffset16)_(flOffset24)_(fmVarMatchLen)_(flVarLitLen)_0} for i in 47..0
// Z19 := adjusted literal length offsets
// Z20 := adjusted match length offsets
// Z21 := uint8{consts_uint24_expander_generic}
// Z22 := uint8{0x1f*}
// Z23 := uint8{0x0f*}
// Z24 := uint8{0x07*}
// Z25 := uint8{0xfd*}
// Z27 := uint8{0x02*}
// Z28 := uint8{0x04*}
// Z29 := uint8{0x08*}
// Z30 := uint8{0x10*}
// Z31 := uint8{0x20*}
loop_4x:
COPY_SINGLE_ITEM(0, copy_long_literal0, copy_long_literal0_completed, match_loop0)
COPY_SINGLE_ITEM(1, copy_long_literal1, copy_long_literal1_completed, match_loop1)
COPY_SINGLE_ITEM(2, copy_long_literal2, copy_long_literal2_completed, match_loop2)
COPY_SINGLE_ITEM(3, copy_long_literal3, copy_long_literal3_completed, match_loop3)
// Rewind the opcode queue
VALIGND $4, Z15, Z0, Z15 // Skip the first 4 entries, part 1
SHRL $1, R8 // Sequencer: EFLAGS.CF==1 <=> loop_4x should be repeated
VALIGND $4, Z16, Z0, Z16 // Skip the first 4 entries, part 2
VALIGND $4, Z17, Z0, Z17 // Skip the first 4 entries, part 3
JCS loop_4x
SHRL $1, R8 // Sequencer: EFLAGS.CF==1 <=> predecoded tokens are present and should be armed with parameters
JCS predecoded_tokens_available
SHRL $1, R8 // Sequencer: EFLAGS.CF==1 <=> a new batch of tokens should be predecoded
JCS predecoded_tokens_exhausted
// There still are 0..3 tokens not processed by loop_4x
check_loop_1x:
TESTL R8, R8
JZ no_more_tokens
loop_1x:
COPY_SINGLE_ITEM(0, copy_long_literal_f0, copy_long_literal_f0_completed, match_loop_f0)
// Rewind the opcode queue
VALIGND $1, Z15, Z0, Z15 // Skip the first entry, part 1
VALIGND $1, Z16, Z0, Z16 // Skip the first entry, part 2
VALIGND $1, Z17, Z0, Z17 // Skip the first entry, part 3
SUBL $1, R8
JNZ loop_1x
no_more_tokens:
MOVQ streams+24(FP), BX
MOVQ lastOffs+32(FP), AX // AX := &lastOffs
MOVQ (stream__size*const_stridLiterals+stream_data + const_offsSliceHeaderData)(BX), DX // DX := Literals.Data
MOVQ (stream__size*const_stridLiterals+stream_data + const_offsSliceHeaderLen)(BX), CX // CX := Literals.Len
SUBQ SI, DX // DX := -consumed_literals_bytes
MOVQ R9, (AX) // Store the lastOffs value
ADDQ DX, CX // CX := the number of the remaining literals bytes
LEAQ (DI)(CX*1), DX // DX := just past the dst.Data
SUBQ dst_base+0(FP), DX // DX := the number of written bytes
// Append the remaining literals payload bytes
REP; MOVSB // TODO: is there a real need for being excessively smart here?
MOVQ DX, ret_len+48(FP)
MOVL $const_ecOK, ret1+64(FP)
RET
fetch_last_tokens:
// R9 := lastOffs
// R10 := token_count
// Z0 := {0*}
// Z1 := {-1*}
// Z22 := uint8{0x1f*}
// Z23 := uint8{0x0f*}
// Z24 := uint8{0x07*}
// Z25 := uint8{0xfd*}
// Z27 := uint8{0x02*}
// Z28 := uint8{0x04*}
// Z29 := uint8{0x08*}
// Z30 := uint8{0x10*}
// Z31 := uint8{0x20*}
LEAL 64(R10), AX
LEAQ CONST_GET_PTR(consts_composite_remainder, 0), BX
MOVQ $-1, DX
CMPL R10, $-64
JLE no_more_tokens
MOVL (BX)(AX*4), R8 // R8 := sequencer for the (R10 & 0x3f) remaining tokens (63..1)
SHLXQ R10, DX, DX // DX := uint64{-1 >> (R10 & 0x3f)}
NOTQ DX
KMOVQ DX, K1
VMOVDQU8.Z (R11), K1, Z2 // Z2 := uint8{token[i]} for i in K1 range
JMP tokens_fetched
decode_wide_varlitlen:
// DX := {token[i] needs a VarLitLen parameter} for i in 15..0
// DI := dst.Data cursor
// SI := Literals.Data cursor
// R8 := sequencer
// R9 := lastOffs
// R10 := token_count
// R11 := adjusted Tokens.Data cursor
// R12 := VarLitLen.Data cursor
// R13 := VarMatchLen.Data cursor
// R14 := adjusted Offsets16.Data cursor
// R15 := adjusted Offsets24.Data cursor
// K2 := {VarLitLen.Data[i] <= 0xfd} for i in 63..0
// K4 := {token[i] needs a VarLitLen parameter} for i in 15..0
// K5 := {token[i] needs an Offset24 parameter} for i in 15..0
// K6 := {token[i] needs an Offset16 parameter} for i in 15..0
// Z0 := {0*}
// Z1 := {-1*}
// Z2 := uint8{VarLitLen.Data[i]} for i in 63..0
// Z3 := uint8{VarMatchLen.Data[i]} for i in 63..0
// Z14 := uint32{Offsets24.Data[(i*3+2)..(i*3)] << 8} for i in 15..0
// Z15 := uint32{Offsets16.Data[(i*2+1)..(i*2)]} for i in 15..0
// Z16 := uint32{litlen_token[i].offset} for i in 15..0