-
Notifications
You must be signed in to change notification settings - Fork 9
/
f3dex2.s
2699 lines (2442 loc) · 135 KB
/
f3dex2.s
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// Assemble this file with the RSP instruction set and pull in the shared
// RSP register/constant definitions and the GBI command definitions.
.rsp
.include "rsp/rsp_defs.inc"
.include "rsp/gbi.inc"
// This file assumes DATA_FILE and CODE_FILE are set on the command line
// (they are the output paths passed to the two `.create` directives below).
// Refuse to assemble with an armips whose version() is below 110 (i.e. 0.11).
.if version() < 110
.error "armips 0.11 or newer is required"
.endif
// Tweak the li and la macros so that the output matches
// (each pseudo-instruction is pinned to one specific single-instruction
// encoding instead of whatever the assembler would choose by default).
// li reg, imm  ->  addi reg, $zero, imm
.macro li, reg, imm
addi reg, $zero, imm
.endmacro
// la reg, imm  ->  addiu reg, $zero, imm
.macro la, reg, imm
addiu reg, $zero, imm
.endmacro
// move dst, src  ->  ori dst, src, 0
.macro move, dst, src
ori dst, src, 0
.endmacro
// Prohibit macros involving slt; this silently clobbers $1. You can of course
// manually write the slt and branch instructions if you want this behavior.
// Each of the four macros below shadows the pseudo-instruction of the same
// name and turns any use of it into an assembly-time error.
.macro blt, ra, rb, lbl
.error "blt is a macro using slt, and silently clobbers $1!"
.endmacro
.macro bgt, ra, rb, lbl
.error "bgt is a macro using slt, and silently clobbers $1!"
.endmacro
.macro ble, ra, rb, lbl
.error "ble is a macro using slt, and silently clobbers $1!"
.endmacro
.macro bge, ra, rb, lbl
.error "bge is a macro using slt, and silently clobbers $1!"
.endmacro
// Vector macros
// vcopy dst, src: emitted as dst = src + $v0[0]. This is only a copy while
// $v0 (aliased as vZero in the register map below) holds zero.
.macro vcopy, dst, src
vadd dst, src, $v0[0]
.endmacro
// vclr dst: emitted as dst = dst XOR dst, i.e. every element becomes 0.
.macro vclr, dst
vxor dst, dst, dst
.endmacro
// Element selectors passed as N to vreadacc to pick which part of the
// accumulator is read.
ACC_UPPER equ 0
ACC_MIDDLE equ 1
ACC_LOWER equ 2
// vreadacc dst, N: emitted as `vsar dst, dst, dst[N]`, where N is one of the
// ACC_* values above.
.macro vreadacc, dst, N
vsar dst, dst, dst[N]
.endmacro
/*
There are two different memory spaces for the overlays: (a) IMEM and (b) the
microcode file (which, plus an offset, is also the location in DRAM).
A label marks both an IMEM address and a file address, but evaluating the
label in an integer context (e.g. in a branch) gives the IMEM address.
`orga(your_label)` gets the file address of the label, and `.orga` sets the
file address.
`.headersize`, as well as the value after `.create`, sets the difference
between IMEM addresses and file addresses, so you can set the IMEM address
with `.headersize desired_imem_addr - orga()`.
In IMEM, the whole microcode is organized as (each row is the same address):
0x80 space | |
for boot code Overlay 0 Overlay 1
(End (More cmd
start task) handlers)
(initialization) | |
Many command
handlers
Overlay 2 Overlay 3
(Lighting) (Clipping)
Vertex and
tri handlers
DMA code
In the file, the microcode is organized as:
start (file addr 0x0 = IMEM 0x1080)
Many command handlers
Overlay 3
Vertex and tri handlers
DMA code (end of this = IMEM 0x2000 = file 0xF80)
Overlay 0
Overlay 1
Overlay 2
*/
// Overlay table data member offsets
// These are the byte offsets of the three fields emitted by OverlayEntry below.
overlay_load equ 0x0000 // word: load (file) address of the overlay
overlay_len equ 0x0004 // halfword: overlay length in bytes, minus 1
overlay_imem equ 0x0006 // halfword: IMEM address, low 16 bits
// OverlayEntry loadStart, loadEnd, imemAddr
// Emits one 8-byte overlay table record: a word, then two halfwords, laid out
// at the overlay_load / overlay_len / overlay_imem offsets above.
.macro OverlayEntry, loadStart, loadEnd, imemAddr
.dw loadStart
.dh (loadEnd - loadStart - 1) & 0xFFFF
.dh (imemAddr) & 0xFFFF
.endmacro
// jumpTableEntry addr
// Emits one halfword holding the low 16 bits of a handler address.
.macro jumpTableEntry, addr
.dh addr & 0xFFFF
.endmacro
// RSP DMEM
// Everything from here to the matching `.close` is written to DATA_FILE, with
// labels starting at address 0x0000.
.create DATA_FILE, 0x0000
/*
Matrices are stored and used in a transposed format compared to how they are
normally written in mathematics. For the integer part:
00 02 04 06 typical Xscl Rot Rot 0
08 0A 0C 0E use: Rot Yscl Rot 0
10 12 14 16 Rot Rot Zscl 0
18 1A 1C 1E Xpos Ypos Zpos 1
The fractional part comes next and is in the same format.
Applying this transformation is done by multiplying a row vector times the
matrix, like:
X Y Z 1 * Xscl Rot Rot 0 = NewX NewY NewZ 1
Rot Yscl Rot 0
Rot Rot Zscl 0
Xpos Ypos Zpos 1
In C, the matrix is accessed as matrix[row][col], and the vector is vector[row].
*/
// Each matrix below reserves 64 bytes: per the layout above, 32 bytes of
// integer parts followed by 32 bytes of fractional parts.
// 0x0000-0x0040: modelview matrix
mvMatrix:
.fill 64
// 0x0040-0x0080: projection matrix
pMatrix:
.fill 64
// 0x0080-0x00C0: modelviewprojection matrix
mvpMatrix:
.fill 64
// 0x00C0-0x00C8: scissor (four 12-bit values)
// Default is upper-left (0, 0) and lower-right (320, 240), each multiplied by 4.
scissorUpLeft: // the command byte is included since the command word is copied verbatim
.dw (G_SETSCISSOR << 24) | (( 0 * 4) << 12) | (( 0 * 4) << 0)
scissorBottomRight:
.dw ((320 * 4) << 12) | ((240 * 4) << 0)
// 0x00C8-0x00D0: othermode
otherMode0: // command byte included, same as above
.dw (G_RDPSETOTHERMODE << 24) | (0x080CFF)
otherMode1:
.dw 0x00000000
// 0x00D0-0x00D8: Saved texrect state for combining the multiple input commands into one RDP texrect command
texrectWord1:
.fill 4 // first word, has command byte, xh and yh
texrectWord2:
.fill 4 // second word, has tile, xl, yl
// 0x00D8: First half of RDP value for split commands (shared by perspNorm moveword to be able to write a 32-bit value)
// (the moveword table entry for G_MW_PERSPNORM is `perspNorm - 2`, so the
// 32-bit write lands on the last two bytes of this field plus perspNorm)
rdpHalf1Val:
.fill 4
// 0x00DC: perspective norm
perspNorm:
.dh 0xFFFF
// 0x00DE: displaylist stack length
displayListStackLength:
.db 0x00 // starts at 0, increments by 4 for each "return address" pushed onto the stack
.db 0x48 // this seems to be the max displaylist length
// 0x00E0-0x00F0: viewport
viewport:
.fill 16
// 0x00F0-0x00F4: Current RDP fifo output position
rdpFifoPos:
.fill 4
// 0x00F4-0x00F8:
// Matrix stack pointer; initialised from OSTask_dram_stack in `start` if it is
// still zero at that point.
matrixStackPtr:
.dw 0x00000000
// 0x00F8-0x0138: segment table
segmentTable:
.fill (4 * 16) // 16 DRAM pointers
// 0x0138-0x0180: displaylist stack
displayListStack:
// 0x0138-0x0180: ucode text (shared with DL stack)
// The ID string is assembled into the same bytes that are used as the display
// list stack; the check below warns if the padded string is not exactly 0x48
// bytes long.
.if CFG_EXTRA_0A_BEFORE_ID_STR // F3DEX2 2.04H puts an extra 0x0A before the name
.db 0x0A
.endif
.ascii ID_STR, 0x0A
.align 16
.if . - displayListStack != 0x48
.warning "ID_STR incorrect length, affects displayListStack"
.endif
// Base address for RSP effects DMEM region (see discussion in lighting below).
// Could pick a better name, basically a global fixed DMEM pointer used with
// fixed offsets to things in this region. It seems potentially data below this
// could be shared by different running microcodes whereas data after this is
// only used by the current microcode. Also this is used for a base address in
// vtx write / lighting because vector load offsets can't reach all of DMEM.
spFxBase:
// 0x0180-0x1B0: clipping values
clipRatio: // This is an array of 6 doublewords
// Each doubleword is four halfword multipliers, in x, y, z, w order.
// G_MWO_CLIP_R** point to the second word of each of these, and end up setting
// the Z scale (always 0 for X and Y components) and the W scale (clip ratio)
.dw 0x00010000, 0x00000002 // 1 * x, G_MWO_CLIP_RNX * w = negative x clip
.dw 0x00000001, 0x00000002 // 1 * y, G_MWO_CLIP_RNY * w = negative y clip
.dw 0x00010000, 0x0000FFFE // 1 * x, (-)G_MWO_CLIP_RPX * w = positive x clip
.dw 0x00000001, 0x0000FFFE // 1 * y, (-)G_MWO_CLIP_RPY * w = positive y clip
.dw 0x00000000, 0x0001FFFF // 1 * z, -1 * w = far clip
.if CFG_NoN
.dw 0x00000000, 0x00000001 // 0 * all, 1 * w = no nearclipping
.else
.dw 0x00000000, 0x00010001 // 1 * z, 1 * w = nearclipping
.endif
// 0x1B0: constants for register $v31
// Eight halfwords, loaded into $v31 as one quadword by `start`.
.align 0x10 // loaded with lqv
// VCC patterns used:
// vlt xxx, $v31, $v31[3] = 11101110 in load_spfx_global_values
// vne xxx, $v31, $v31[3h] = 11101110 in lighting
// veq xxx, $v31, $v31[3h] = 00010001 in lighting
v31Value:
.dh -1 // used in init, clipping
.dh 4 // used in clipping, vtx write for Newton-Raphson reciprocal
.dh 8 // old ucode only: used in tri write
.dh 0x7F00 // used in vtx write and pre-jump instrs to there, also 4 put here during point lighting
.dh -4 // used in clipping, vtx write for Newton-Raphson reciprocal
.dh 0x4000 // used in tri write, texgen
.dh vertexBuffer // 0x420; used in tri write
.dh 0x7FFF // used in vtx write, tri write, lighting, point lighting
// 0x1C0: constants for register $v30
// Eight halfwords, loaded into $v30 as one quadword by `start`. Elements 2-7
// differ depending on CFG_OLD_TRI_WRITE.
.align 0x10 // loaded with lqv
// VCC patterns used:
// vge xxx, $v30, $v30[7] = 11110001 in tri write
v30Value:
.dh 0x7FFC // not used!
.dh vtxSize << 7 // 0x1400; it's not 0x2800 because vertex indices are *2; used in tri write for vtx index to addr
.if CFG_OLD_TRI_WRITE // See discussion in tri write where v30 values used
.dh 0x01CC // used in tri write, vcr?
.dh 0x0200 // not used!
.dh -16 // used in tri write for Newton-Raphson reciprocal
.dh 0x0010 // used in tri write for Newton-Raphson reciprocal
.dh 0x0020 // used in tri write, both signed and unsigned multipliers
.dh 0x0100 // used in tri write, vertex color >>= 8; also in lighting
.else
.dh 0x1000 // used in tri write, some multiplier
.dh 0x0100 // used in tri write, vertex color >>= 8 and vcr?; also in lighting and point lighting
.dh -16 // used in tri write for Newton-Raphson reciprocal
.dh 0xFFF8 // used in tri write, mask away lower ST bits?
.dh 0x0010 // used in tri write for Newton-Raphson reciprocal; value moved to elem 7 for point lighting
.dh 0x0020 // used in tri write, both signed and unsigned multipliers; value moved from elem 6 from point lighting
.endif
/*
Quick note on Newton-Raphson:
https://en.wikipedia.org/wiki/Division_algorithm#Newton%E2%80%93Raphson_division
Given input D, we want to find the reciprocal R. The base formula for refining
the estimate of R is R_new = R*(2 - D*R). However, since the RSP reciprocal
instruction moves the radix point 1 to the left, the result has to be multiplied
by 2. So it's 2*R*(2 - D*2*R) = R*(4 - 4*D*R) = R*(1*4 + D*R*-4). This is where
the 4 and -4 come from. For tri write, the result needs to be multiplied by 4
for subpixels, so it's 16 and -16.
*/
.align 0x10 // loaded with lqv
// NOTE(review): the meaning of these four halfwords is not documented in this
// part of the file; judging by the name they are texgen "linear" coefficients
// -- confirm against the code that loads them.
linearGenerateCoefficients:
.dh 0xC000
.dh 0x44D3
.dh 0x6CB3
.dh 2
// 0x01D8
.db 0x00 // Padding to allow mvpValid to be written to as a 32-bit word
// (the moveword table entry for G_MW_FORCEMTX is `mvpValid - 1`)
mvpValid:
.db 0x01
// 0x01DA
.dh 0x0000 // Shared padding so that:
// -- mvpValid can be written on its own for G_MW_FORCEMTX
// -- Writing numLightsx18 with G_MW_NUMLIGHT sets lightsValid to 0
// -- do_popmtx and load_mtx can invalidate both with one zero word write
// 0x01DC
lightsValid: // Gets overwritten with 0 when numLights is written with moveword.
.db 1
// The moveword table entry for G_MW_NUMLIGHT is `numLightsx18 - 3`, so this
// byte is the last byte of that 32-bit write.
numLightsx18:
.db 0
// NOTE(review): the purpose of the next two bytes is not documented here.
.db 11
.db 7 * 0x18
// 0x01E0
fogFactor:
.dw 0x00000000
// 0x01E4
textureSettings1:
.dw 0x00000000 // first word, has command byte, bowtie val, level, tile, and on
// 0x01E8
textureSettings2:
.dw 0x00000000 // second word, has s and t scale
// 0x01EC
// Current geometry mode word; the initial value has only G_CLIPPING set.
geometryModeLabel:
.dw G_CLIPPING
// Maximum number of lights,
// excluding ambient light
MAX_LIGHTS equ 7
// 0x01F0-0x02E0: Light data; a total of 10 * lightSize light slots.
// Each slot's data is either directional or point (each pair of letters is a byte):
// Directional lights:
// 0x00 RR GG BB 00 RR GG BB -- NX NY NZ -- -- -- -- --
// 0x10 TX TY TZ -- TX TY TZ -- (Normals transformed to camera space)
// Point lights:
// 0x00 RR GG BB CC RR GG BB LL XXXX YYYY ZZZZ QQ --
// 0x10 -- -- -- -- -- -- -- -- (Invalid transformed normals get stored here)
// CC: constant attenuation factor (0 indicates directional light)
// LL: linear attenuation factor
// QQ: quadratic attenuation factor
//
// First there are two lights, whose directions define the X and Y directions
// for texgen, via g(s)SPLookAtX/Y. The colors are ignored. These lights get
// transformed normals. g(s)SPLight which point here start copying at n*24+24,
// where n starts from 1 for one light (or zero lights), which effectively
// points at lightBufferMain.
lightBufferLookat:
.fill (2 * lightSize)
// Then there are the main 8 lights. This is between one and seven directional /
// point (if built with this enabled) lights, plus the ambient light at the end.
// Zero lights is not supported, and is encoded as one light with black color
// (does not affect the result). Directional and point lights can be mixed in
// any order; ambient is always at the end.
lightBufferMain:
.fill (8 * lightSize)
// Code uses pointers relative to spFxBase, with immediate offsets, so that
// another register isn't needed to store the start or end address of the array.
// Pointers are kept relative to spFxBase; this offset gets them to point to
// lightBufferMain instead.
ltBufOfs equ (lightBufferMain - spFxBase)
// One more topic on lighting: The point lighting code uses MV transpose instead
// of MV inverse to transform from camera space to model space. If MV has a
// uniform scale (same scale in X, Y, and Z), MV transpose = MV inverse times a
// scale factor. The lighting code effectively gets rid of the scale factor, so
// this is okay. But, if the matrix has nonuniform scaling, and especially if it
// has shear (nonuniform scaling applied somewhere in the middle of the matrix
// stack, such as to a whole skeletal / skinned mesh), this will not be correct.
// 0x02E0-0x02F0: Overlay 0/1 Table
// Two 8-byte OverlayEntry records. The load field is assembled as a file
// address (orga) and is rebased by calculate_overlay_addrs at startup.
overlayInfo0:
OverlayEntry orga(ovl0_start), orga(ovl0_end), ovl0_start
overlayInfo1:
OverlayEntry orga(ovl1_start), orga(ovl1_end), ovl1_start
// 0x02F0-0x02FE: Movemem table
// Halfword DMEM addresses of the movemem destinations.
movememTable:
// Temporary matrix in clipTempVerts scratch space, aligned to 16 bytes
// ((clipTempVerts + 15) & ~0xF rounds the address up to a multiple of 16)
.dh (clipTempVerts + 15) & ~0xF // G_MTX multiply temp matrix (model)
.dh mvMatrix // G_MV_MMTX
.dh (clipTempVerts + 15) & ~0xF // G_MTX multiply temp matrix (projection)
.dh pMatrix // G_MV_PMTX
.dh viewport // G_MV_VIEWPORT
.dh lightBufferLookat // G_MV_LIGHT
.dh vertexBuffer // G_MV_POINT
// Further entries in the movemem table come from the moveword table
// 0x02FE-0x030E: moveword table
// Halfword DMEM base addresses of the moveword destinations. Entries with a
// negative adjustment place the 32-bit write so that its last byte(s) land on
// the named field (see the padding notes at mvpValid / numLightsx18 / rdpHalf1Val).
movewordTable:
.dh mvpMatrix // G_MW_MATRIX
.dh numLightsx18 - 3 // G_MW_NUMLIGHT
.dh clipRatio // G_MW_CLIP
.dh segmentTable // G_MW_SEGMENT
.dh fogFactor // G_MW_FOG
.dh lightBufferMain // G_MW_LIGHTCOL
.dh mvpValid - 1 // G_MW_FORCEMTX
.dh perspNorm - 2 // G_MW_PERSPNORM
// 0x030E-0x0314: G_POPMTX, G_MTX, G_MOVEMEM Command Jump Table
// Every entry in the tables below is one halfword (low 16 bits of the handler
// address), emitted by jumpTableEntry.
movememHandlerTable:
jumpTableEntry G_POPMTX_end // G_POPMTX
jumpTableEntry G_MTX_end // G_MTX (multiply)
jumpTableEntry G_MOVEMEM_end // G_MOVEMEM, G_MTX (load)
// 0x0314-0x0370: RDP/Immediate Command Jump Table
// These entries sit immediately before the commandJumpTable label, so they are
// at negative offsets from it. NOTE(review): presumably indexed by the command
// byte treated as a signed value -- confirm against the dispatch code.
jumpTableEntry G_SPECIAL_3_handler
jumpTableEntry G_SPECIAL_2_handler
jumpTableEntry G_SPECIAL_1_handler
jumpTableEntry G_DMA_IO_handler
jumpTableEntry G_TEXTURE_handler
jumpTableEntry G_POPMTX_handler
jumpTableEntry G_GEOMETRYMODE_handler
jumpTableEntry G_MTX_handler
jumpTableEntry G_MOVEWORD_handler
jumpTableEntry G_MOVEMEM_handler
jumpTableEntry G_LOAD_UCODE_handler
jumpTableEntry G_DL_handler
jumpTableEntry G_ENDDL_handler
jumpTableEntry G_SPNOOP_handler
jumpTableEntry G_RDPHALF_1_handler
jumpTableEntry G_SETOTHERMODE_L_handler
jumpTableEntry G_SETOTHERMODE_H_handler
jumpTableEntry G_TEXRECT_handler
jumpTableEntry G_TEXRECTFLIP_handler
jumpTableEntry G_SYNC_handler // G_RDPLOADSYNC
jumpTableEntry G_SYNC_handler // G_RDPPIPESYNC
jumpTableEntry G_SYNC_handler // G_RDPTILESYNC
jumpTableEntry G_SYNC_handler // G_RDPFULLSYNC
jumpTableEntry G_RDP_handler // G_SETKEYGB
jumpTableEntry G_RDP_handler // G_SETKEYR
jumpTableEntry G_RDP_handler // G_SETCONVERT
jumpTableEntry G_SETSCISSOR_handler
jumpTableEntry G_RDP_handler // G_SETPRIMDEPTH
jumpTableEntry G_RDPSETOTHERMODE_handler
jumpTableEntry G_RDP_handler // G_LOADTLUT
jumpTableEntry G_RDPHALF_2_handler
jumpTableEntry G_RDP_handler // G_SETTILESIZE
jumpTableEntry G_RDP_handler // G_LOADBLOCK
jumpTableEntry G_RDP_handler // G_LOADTILE
jumpTableEntry G_RDP_handler // G_SETTILE
jumpTableEntry G_RDP_handler // G_FILLRECT
jumpTableEntry G_RDP_handler // G_SETFILLCOLOR
jumpTableEntry G_RDP_handler // G_SETFOGCOLOR
jumpTableEntry G_RDP_handler // G_SETBLENDCOLOR
jumpTableEntry G_RDP_handler // G_SETPRIMCOLOR
jumpTableEntry G_RDP_handler // G_SETENVCOLOR
jumpTableEntry G_RDP_handler // G_SETCOMBINE
jumpTableEntry G_SETxIMG_handler // G_SETTIMG
jumpTableEntry G_SETxIMG_handler // G_SETZIMG
jumpTableEntry G_SETxIMG_handler // G_SETCIMG
// The label marks the G_NOOP entry; the DMA command entries follow it at
// positive offsets.
commandJumpTable:
jumpTableEntry G_NOOP_handler
// 0x0370-0x0380: DMA Command Jump Table
jumpTableEntry G_VTX_handler
jumpTableEntry G_MODIFYVTX_handler
jumpTableEntry G_CULLDL_handler
jumpTableEntry G_BRANCH_WZ_handler // different for F3DZEX
jumpTableEntry G_TRI1_handler
jumpTableEntry G_TRI2_handler
jumpTableEntry G_QUAD_handler
jumpTableEntry G_LINE3D_handler
// 0x0380-0x03C2: vertex pointers (33 halfword entries = 0x42 bytes)
vertexTable:
// The vertex table is a list of pointers to the location of each vertex in the buffer
// After the last vertex pointer, there is a pointer to the address after the last vertex
// This means there are really 33 entries in the table
// vertexTableEntry i: emits the halfword DMEM address of vertex slot i.
.macro vertexTableEntry, i
.dh vertexBuffer + (i * vtxSize)
.endmacro
// vertexTableEntries i: recursive macro that first emits entries 0..i-1 (by
// invoking itself with i - 1 while i > 0) and then entry i, so the entries
// come out in ascending order.
.macro vertexTableEntries, i
.if i > 0
vertexTableEntries (i - 1)
.endif
vertexTableEntry i
.endmacro
// Emits entries 0 through 32 inclusive (32 vertices plus the end pointer).
vertexTableEntries 32
// 0x03C2-0x0410: cull magic numbers, active clip planes, clip polygon lists
// and clip mask list
gCullMagicNumbers:
// Values added to cross product (16-bit sign extended).
// Then if sign bit is clear, cull the triangle.
// The five halfwords form four overlapping pairs, one pair per cull mode; the
// braces in the comments below mark which two halfwords belong to each mode.
.dh 0xFFFF // }-G_CULL_NEITHER -- makes any value negative.
.dh 0x8000 // }/ }-G_CULL_FRONT -- inverts the sign.
.dh 0x0000 // }/ }-G_CULL_BACK -- no change.
.dh 0x0000 // }/ }-G_CULL_BOTH -- makes any value positive.
.dh 0x8000 // }/
// G_CULL_BOTH is useless as the tri will always be culled, so might as well not
// bother drawing it at all. Guess they just wanted completeness, and it only
// costs two bytes of DMEM.
// Bitmask of the clip planes enabled by default: the four X/Y planes shifted
// by CLIP_SHIFT_SCAL and the far/near planes shifted by CLIP_SHIFT_SCRN.
activeClipPlanes:
.dw ((CLIP_NX | CLIP_NY | CLIP_PX | CLIP_PY) << CLIP_SHIFT_SCAL) | ((CLIP_FAR | CLIP_NEAR) << CLIP_SHIFT_SCRN)
// 0x3D0: Clipping polygons, as lists of vertex addresses. When handling each
// clipping condition, the polygon is read off one list and the modified polygon
// is written to the next one.
// Max verts in each polygon:
clipPoly:
.fill 10 * 2 // 3 5 7 9
clipPoly2: // \ / \ / \ /
.fill 10 * 2 // 4 6 8
// but there needs to be room for the terminating 0, and clipMaskList below needs
// to be word-aligned. So this is why it's 10 each.
// One word per clip condition, each with a single plane bit set; the six
// entries together cover the same planes as activeClipPlanes above.
clipMaskList:
.dw CLIP_NX << CLIP_SHIFT_SCAL
.dw CLIP_NY << CLIP_SHIFT_SCAL
.dw CLIP_PX << CLIP_SHIFT_SCAL
.dw CLIP_PY << CLIP_SHIFT_SCAL
.dw CLIP_FAR << CLIP_SHIFT_SCRN
.dw CLIP_NEAR << CLIP_SHIFT_SCRN
// 0x0410-0x0420: Overlay 2/3 table
// Same record format as overlayInfo0/1; the load fields are likewise rebased
// by calculate_overlay_addrs at startup.
overlayInfo2:
OverlayEntry orga(ovl2_start), orga(ovl2_end), ovl2_start
overlayInfo3:
OverlayEntry orga(ovl3_start), orga(ovl3_end), ovl3_start
// 0x0420-0x0920: Vertex buffer in RSP internal format
vertexBuffer:
.skip (vtxSize * 32) // 32 vertices
// Build-time guard: everything defined up to this point must fit in the region
// that is saved when the task yields.
.if . > OS_YIELD_DATA_SIZE - 8
// OS_YIELD_DATA_SIZE (0xC00) bytes of DMEM are saved; the last two words are
// the ucode and the DL pointer. Make sure anything past there is temporary.
// (Input buffer will be reloaded from next instruction in the source DL.)
.error "Important things in DMEM will not be saved at yield!"
.endif
// 0x0920-0x09C8: Input buffer
// Holds the chunk of display list data most recently fetched by displaylist_dma.
inputBuffer:
inputBufferLength equ 0xA8
.skip inputBufferLength
inputBufferEnd:
// 0x09C8-0x0BA8: Space for temporary verts for clipping code
clipTempVerts:
clipTempVertsCount equ 12 // Up to 2 temp verts can be created for each of the 6 clip conditions.
.skip clipTempVertsCount * vtxSize
// 0x09D0-0x0A10: Temp matrix for G_MTX multiplication mode, overlaps with clipTempVerts
// (this is the `(clipTempVerts + 15) & ~0xF` address used in movememTable)
RDP_CMD_BUFSIZE equ 0x158
RDP_CMD_BUFSIZE_EXCESS equ 0xB0 // Maximum size of an RDP triangle command
RDP_CMD_BUFSIZE_TOTAL equ RDP_CMD_BUFSIZE + RDP_CMD_BUFSIZE_EXCESS
// Each RDP command buffer is RDP_CMD_BUFSIZE bytes up to its *End label,
// followed by RDP_CMD_BUFSIZE_EXCESS bytes of extra room past that label.
// 0x0BA8-0x0D00: First RDP Command Buffer
rdpCmdBuffer1:
.skip RDP_CMD_BUFSIZE
rdpCmdBuffer1End:
.skip RDP_CMD_BUFSIZE_EXCESS
// 0x0DB0-0x0FB8: Second RDP Command Buffer
rdpCmdBuffer2:
.skip RDP_CMD_BUFSIZE
rdpCmdBuffer2End:
.skip RDP_CMD_BUFSIZE_EXCESS
// Build-time guard: the data above must not run into the OSTask area at 0xFC0.
.if . > 0x00000FC0
.error "Not enough room in DMEM"
.endif
.org 0xFC0
// 0x0FC0-0x1000: OSTask
// 0x40 bytes; fields are accessed as OSTask + OSTask_<field> in the code below.
OSTask:
.skip 0x40
.close // DATA_FILE
// RSP IMEM
// Everything from here on is written to CODE_FILE, with labels starting at
// IMEM address 0x1080 (the first 0x80 bytes of IMEM are left for boot code).
.create CODE_FILE, 0x00001080
////////////////////////////////////////////////////////////////////////////////
/////////////////////////////// Register Use Map ///////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// Registers marked as "global" are only used for one purpose in the vanilla
// microcode. However, this does not necessarily mean they can't be used for
// other things in mods--this depends on which group they're listed in below.
// Note that these lists do not cover registers which are just used locally in
// a particular region of code--you're still responsible for not breaking the
// code you modify. This is designed to help you avoid breaking one part of the
// code by modifying a different part.
// Local register definitions are included with their code, not here.
// Note that several aliases below deliberately name the same physical
// register (e.g. $19, $20, $21 and $v21 each have more than one name).
// These registers are used globally, and their values can't be rebuilt, so
// they should never be used for anything besides their original purpose.
// $zero // global
rdpCmdBufEnd equ $22 // global
rdpCmdBufPtr equ $23 // global
taskDataPtr equ $26 // global
inputBufferPos equ $27 // global
// $ra // global
// These registers are used throughout the codebase and expected to have
// certain values, but you're free to overwrite them as long as you
// reconstruct the normal values after you're done (in fact point lighting does
// this for $v30 and $v31).
vZero equ $v0 // global
vOne equ $v1 // global
// $v30 // global
// $v31 // global
// Must keep values during the full clipping process: clipping overlay, vertex
// write, tri drawing.
clipPolySelect equ $18 // global
clipPolyWrite equ $21 // also input_mtx_0
savedActiveClipPlanes equ $29 // global
savedRA equ $30 // global
// Must keep values during the first part of the clipping process only: polygon
// subdivision and vertex write.
// $2: vertex at end of edge
clipMaskIdx equ $5
secondVtxPos equ $8
outputVtxPos equ $15 // global
clipFlags equ $16 // global
clipPolyRead equ $17 // global
// Must keep values during tri drawing.
// They are also used throughout the codebase, but can be overwritten once their
// use has been fulfilled for the specific command.
cmd_w1_dram equ $24 // Command word 1, which is also DMA DRAM addr; almost global, occasionally used locally
cmd_w0 equ $25 // Command word 0; almost global, occasionally used locally
// Must keep values during the full vertex process: load, lighting, and vertex write
// $1: count of remaining vertices
topLightPtr equ $6 // Used locally elsewhere
curLight equ $9 // Used locally elsewhere
inputVtxPos equ $14 // global
mxr0i equ $v8 // "matrix row 0 int part"
mxr1i equ $v9 // All of these used locally elsewhere
mxr2i equ $v10
mxr3i equ $v11
mxr0f equ $v12
mxr1f equ $v13
mxr2f equ $v14
mxr3f equ $v15
vPairST equ $v22
vPairMVPPosF equ $v23
vPairMVPPosI equ $v24
// v25: prev vertex screen pos
// v26: prev vertex screen Z
// For point lighting
mvTc0f equ $v3
mvTc0i equ $v4
mvTc1i equ $v21
mvTc1f equ $v28 // same as vPairAlpha37
mvTc2i equ $v30
mvTc2f equ $v31
// Values set up by load_spfx_global_values, which must be kept during the full
// vertex process, and which are reloaded for each vert during clipping. See
// that routine for the detailed contents of each of these registers.
// secondVtxPos
spFxBaseReg equ $13 // global
vVpFgScale equ $v16 // All of these used locally elsewhere
vVpFgOffset equ $v17
vVpMisc equ $v18
vFogMask equ $v19
vVpNegScale equ $v21
// Arguments to mtx_multiply
output_mtx equ $19 // also dmaLen, also used by itself
input_mtx_1 equ $20 // also dmemAddr and xfrmLtPtr
input_mtx_0 equ $21 // also clipPolyWrite
// Arguments to dma_read_write
dmaLen equ $19 // also output_mtx, also used by itself
dmemAddr equ $20 // also input_mtx_1 and xfrmLtPtr
// cmd_w1_dram // used for all dma_read_write DRAM addresses, not just second word of command
// Arguments to load_overlay_and_enter
ovlTableEntry equ $11 // Commonly used locally
postOvlRA equ $12 // Commonly used locally
// ==== Summary of uses of all registers
// $zero: Hardwired zero scalar register
// $1: vertex 1 addr, count of remaining vertices, pointer to store texture coefficients, local
// $2: vertex 2 addr, vertex at end of edge in clipping, pointer to store shade coefficients, local
// $3: vertex 3 addr, vertex at start of edge in clipping, local
// $4: pre-shuffle vertex 1 addr for flat shading, local
// $5: clipMaskIdx, geometry mode high short during vertex load / lighting, local
// $6: topLightPtr, geometry mode low byte during tri write, local
// $7: fog flag in vtx write, local
// $8: secondVtxPos, local
// $9: curLight, local
// $10: briefly used local in vtx write
// $11: ovlTableEntry, very common local
// $12: postOvlRA, curMatrix, local
// $13: spFxBaseReg
// $14: inputVtxPos
// $15: outputVtxPos
// $16: clipFlags
// $17: clipPolyRead
// $18: clipPolySelect
// $19: dmaLen, output_mtx, briefly used local
// $20: dmemAddr, input_mtx_1, xfrmLtPtr
// $21: clipPolyWrite, input_mtx_0
// $22: rdpCmdBufEnd
// $23: rdpCmdBufPtr
// $24: cmd_w1_dram, local
// $25: cmd_w0
// $26: taskDataPtr
// $27: inputBufferPos
// $28: not used!
// $29: savedActiveClipPlanes
// $30: savedRA
// $ra: Return address for jal, b*al
// $v0: vZero (every element 0)
// $v1: vOne (every element 1)
// $v2: very common local
// $v3: mvTc0f, local
// $v4: mvTc0i, local
// $v5: vPairNZ, local
// $v6: vPairNY, local
// $v7: vPairNX, vPairRGBATemp, local
// $v8: mxr0i, local
// $v9: mxr1i, local
// $v10: mxr2i, local
// $v11: mxr3i, local
// $v12: mxr0f, local
// $v13: mxr1f, local
// $v14: mxr2f, local
// $v15: mxr3f, local
// $v16: vVpFgScale, local
// $v17: vVpFgOffset, local
// $v18: vVpMisc, local
// $v19: vFogMask, local
// $v20: local
// $v21: mvTc1i, vVpNegScale, local
// $v22: vPairST, local
// $v23: vPairMVPPosF, local
// $v24: vPairMVPPosI, local
// $v25: prev vertex data, local
// $v26: prev vertex data, local
// $v27: vPairRGBA, local
// $v28: mvTc1f, vPairAlpha37, local
// $v29: register to write to discard results, local
// $v30: mvTc2i, constant values for tri write
// $v31: mvTc2f, general constant values
// Initialization routines
// Everything up until displaylist_dma will get overwritten by ovl0 and/or ovl1
//
// start: microcode entry point. Sets up the constant vector registers and the
// RDP command buffer pointers, clears the SP signal bits, then takes one of
// three paths:
//   - resuming a yielded task: reload taskDataPtr and jump to load_overlay1_init
//   - fresh task: set up the RDP output (FIFO or XBUS depending on CFG_XBUS),
//     initialise matrixStackPtr if it is zero, and fall into
//     calculate_overlay_addrs
// Note that the instruction after every branch/jump is its delay slot and is
// executed whether or not the branch is taken.
start: // This is at IMEM 0x1080, not the start of IMEM
.if BUG_WRONG_INIT_VZERO
vor vZero, $v16, $v16 // Sets vZero to $v16--maybe set to zero by the boot ucode?
.else
vclr vZero // Clear vZero
.endif
lqv $v31[0], (v31Value)($zero) // load the eight $v31 constants
lqv $v30[0], (v30Value)($zero) // load the eight $v30 constants
li rdpCmdBufPtr, rdpCmdBuffer1
.if !BUG_FAIL_IF_CARRY_SET_AT_INIT
vadd vOne, vZero, vZero // Consume VCO (carry) value possibly set by the previous ucode, before vsub below
.endif
li rdpCmdBufEnd, rdpCmdBuffer1End
vsub vOne, vZero, $v31[0] // Vector of 1s (0 - (-1); element 0 of v31Value is -1)
.if !CFG_XBUS // FIFO version
lw $11, rdpFifoPos
lw $12, OSTask + OSTask_flags
li $1, SP_CLR_SIG2 | SP_CLR_SIG1 // task done and yielded signals
beqz $11, task_init // rdpFifoPos == 0: the FIFO has not been set up yet
mtc0 $1, SP_STATUS // (delay slot) clear the two signals on either path
andi $12, $12, OS_TASK_YIELDED
beqz $12, calculate_overlay_addrs // not resumed from a yield: go compute the overlay addresses
sw $zero, OSTask + OSTask_flags // (delay slot) clear the task flags on either path
j load_overlay1_init // Skip the initialization and go straight to loading overlay 1
lw taskDataPtr, OS_YIELD_DATA_SIZE - 8 // Was previously saved here at yield time
// task_init: decide whether the RDP's current FIFO state can be kept. Any of
// the checks below failing sends control to wait_dpc_start_valid, which resets
// DPC_START / DPC_END; only if all of them pass AND DPC_CURRENT != DPC_END is
// the reset skipped.
task_init:
mfc0 $11, DPC_STATUS
andi $11, $11, DPC_STATUS_XBUS_DMA
bnez $11, wait_dpc_start_valid // XBUS DMA bit is set: reset
mfc0 $2, DPC_END // (delay slot) $2 = DPC_END
lw $3, OSTask + OSTask_output_buff
sub $11, $3, $2
bgtz $11, wait_dpc_start_valid // output_buff - DPC_END > 0: reset
mfc0 $1, DPC_CURRENT // (delay slot) $1 = DPC_CURRENT
lw $4, OSTask + OSTask_output_buff_size
beqz $1, wait_dpc_start_valid // DPC_CURRENT == 0: reset
sub $11, $1, $4 // (delay slot)
bgez $11, wait_dpc_start_valid // DPC_CURRENT - output_buff_size >= 0: reset
nop
bne $1, $2, f3dzex_0000111C // DPC_CURRENT != DPC_END: keep state, $2 (DPC_END) becomes rdpFifoPos
// The first instruction at the label below doubles as the delay slot of the
// bne above; when that branch is not taken, execution falls into the loop.
wait_dpc_start_valid:
mfc0 $11, DPC_STATUS
andi $11, $11, DPC_STATUS_START_VALID
bnez $11, wait_dpc_start_valid // spin while START_VALID is set
li $11, DPC_STATUS_CLR_XBUS // (delay slot)
mtc0 $11, DPC_STATUS // clear the XBUS bit
lw $2, OSTask + OSTask_output_buff_size
mtc0 $2, DPC_START // point both DPC_START and DPC_END at that value
mtc0 $2, DPC_END
f3dzex_0000111C:
sw $2, rdpFifoPos // remember the current FIFO position
.else // CFG_XBUS
wait_dpc_start_valid:
mfc0 $11, DPC_STATUS
andi $11, $11, DPC_STATUS_DMA_BUSY | DPC_STATUS_START_VALID
bne $11, $zero, wait_dpc_start_valid // spin while either bit is set
sw $zero, rdpFifoPos // (delay slot) rdpFifoPos = 0
addi $11, $zero, DPC_STATUS_SET_XBUS
mtc0 $11, DPC_STATUS // set the XBUS bit
addi rdpCmdBufPtr, $zero, rdpCmdBuffer1
mtc0 rdpCmdBufPtr, DPC_START // point both DPC_START and DPC_END at rdpCmdBuffer1
mtc0 rdpCmdBufPtr, DPC_END
lw $12, OSTask + OSTask_flags
addi $1, $zero, SP_CLR_SIG2 | SP_CLR_SIG1
mtc0 $1, SP_STATUS // clear the two signals
andi $12, $12, OS_TASK_YIELDED
beqz $12, f3dzex_xbus_0000111C // not resumed from a yield: continue with normal init
sw $zero, OSTask + OSTask_flags // (delay slot) clear the task flags on either path
j load_overlay1_init
lw taskDataPtr, OS_YIELD_DATA_SIZE - 8 // Was previously saved here at yield time
.fill 16 * 4 // Bunch of nops here to make it the same size as the fifo code.
f3dzex_xbus_0000111C:
.endif
// If matrixStackPtr is still zero, initialise it from the task's DRAM stack.
lw $11, matrixStackPtr
bnez $11, calculate_overlay_addrs
lw $11, OSTask + OSTask_dram_stack // (delay slot) loaded on either path; only stored if not taken
sw $11, matrixStackPtr
// calculate_overlay_addrs: rebase the load field of all four overlay table
// records by adding the word read from OSTask_ucode, writing each result back
// in place, then load taskDataPtr from OSTask_data_ptr. Falls through into
// load_overlay1_init.
// Uses $1 (base) and $2-$5 (the four load fields) as scratch.
calculate_overlay_addrs:
lw $1, OSTask + OSTask_ucode
lw $2, overlayInfo0 + overlay_load
lw $3, overlayInfo1 + overlay_load
lw $4, overlayInfo2 + overlay_load
lw $5, overlayInfo3 + overlay_load
add $2, $2, $1
add $3, $3, $1
sw $2, overlayInfo0 + overlay_load
sw $3, overlayInfo1 + overlay_load
add $4, $4, $1
add $5, $5, $1
sw $4, overlayInfo2 + overlay_load
sw $5, overlayInfo3 + overlay_load
lw taskDataPtr, OSTask + OSTask_data_ptr
// load_overlay1_init: select overlay 1's table record and call
// load_overlay_and_enter, arranging for it to return to displaylist_dma.
// Reached by fall-through from calculate_overlay_addrs, or by a direct jump
// from `start` when resuming a yielded task.
load_overlay1_init:
li ovlTableEntry, overlayInfo1 // set up loading of overlay 1
// Make room for overlays 0 and 1. Normally, overlay 1 ends exactly at ovl01_end,
// and overlay 0 is much shorter, but if things are modded this constraint must be met.
// The 0x88 is because the file starts 0x80 into IMEM, and the overlays can extend 8
// bytes over the next two instructions as well.
.orga max(orga(), max(ovl0_end - ovl0_start, ovl1_end - ovl1_start) - 0x88)
// Also needs to be aligned so that ovl01_end is a DMA word, in case ovl0 and ovl1
// are shorter than the code above and the code above is an odd number of instructions.
.align 8
// Unnecessarily clever code. The jal sets $ra to the address of the next instruction,
// which is displaylist_dma. So the padding has to be before these two instructions,
// so that this is immediately before displaylist_dma; otherwise the return address
// will be in the last few instructions of overlay 1. However, this was unnecessary--
// it could have been a jump and then `la postOvlRA, displaylist_dma`,
// and the padding put after this.
jal load_overlay_and_enter // load overlay 1 and enter
move postOvlRA, $ra // (delay slot) set up the return address, since load_overlay_and_enter returns to postOvlRA
ovl01_end:
// Overlays 0 and 1 overwrite everything up to this point (2.08 versions overwrite up to the previous .align 8)
// displaylist_dma: refills inputBuffer with the next inputBufferLength bytes of display
// list data from taskDataPtr, advances taskDataPtr, and rewinds inputBufferPos. Falls
// through into wait_for_dma_and_run_next_command.
displaylist_dma: // loads inputBufferLength bytes worth of displaylist data via DMA into inputBuffer
li dmaLen, inputBufferLength - 1 // set the DMA length (passed as byte count minus one)
move cmd_w1_dram, taskDataPtr // set up the DRAM address to read from
jal dma_read_write // initiate the DMA read
la dmemAddr, inputBuffer // Delay slot: set the address to DMA read to
addiu taskDataPtr, taskDataPtr, inputBufferLength // increment the DRAM address to read from next time
li inputBufferPos, -inputBufferLength // reset the DL word index; it counts up from -inputBufferLength to 0
// Main display list dispatch loop.
// wait_for_dma_and_run_next_command (and the *_end aliases) first waits for any DMA in
// flight; run_next_DL_command (and the no-op handler aliases) dispatches immediately.
// Dispatch: refill the input buffer if it is exhausted, enter overlay 0 if SP_STATUS_SIG0
// is set, otherwise jump to the handler looked up in commandJumpTable.
// On entry to a handler: cmd_w0 / cmd_w1_dram hold the two command words, $12 holds
// the sign-extended command byte, and inputBufferPos already points at the next command.
wait_for_dma_and_run_next_command:
G_POPMTX_end:
G_MOVEMEM_end:
// The delay slot of this jal is the mfc0 at run_next_DL_command, so SP_STATUS is read
// before the wait and the call returns to the `lw cmd_w0` that follows it.
jal while_wait_dma_busy // wait for the DMA read to finish
G_LINE3D_handler:
G_SPNOOP_handler:
.if !CFG_G_SPECIAL_1_IS_RECALC_MVP // No-op alias here; only F3DEX2 2.04H has this as a real command
G_SPECIAL_1_handler:
.endif
G_SPECIAL_2_handler:
G_SPECIAL_3_handler:
run_next_DL_command:
mfc0 $1, SP_STATUS // load the status word into register $1
lw cmd_w0, (inputBufferEnd)(inputBufferPos) // load the command word into cmd_w0
beqz inputBufferPos, displaylist_dma // load more DL commands if none are left
andi $1, $1, SP_STATUS_SIG0 // Delay slot: check if the task should yield
sra $12, cmd_w0, 24 // extract DL command byte from command word (arithmetic shift, so sign-extended)
sll $11, $12, 1 // multiply command byte by 2 to get jump table offset
lhu $11, (commandJumpTable)($11) // get command subroutine address from command jump table
bnez $1, load_overlay_0_and_enter // load and execute overlay 0 if yielding; $1 > 0
lw cmd_w1_dram, (inputBufferEnd + 4)(inputBufferPos) // Delay slot: load the next DL word into cmd_w1_dram
jr $11 // jump to the loaded command handler; $1 == 0
addiu inputBufferPos, inputBufferPos, 0x0008 // Delay slot: increment the DL index by 2 words
.if CFG_G_SPECIAL_1_IS_RECALC_MVP // Real command only in this configuration (F3DEX2 2.04H); other microcodes have it as a noop
// Multiplies pMatrix and mvMatrix into mvpMatrix via mtx_multiply, which returns
// straight to run_next_DL_command through $ra.
G_SPECIAL_1_handler: // Seems to be a manual trigger for mvp recalculation
li $ra, run_next_DL_command
li input_mtx_0, pMatrix
li input_mtx_1, mvMatrix
li output_mtx, mvpMatrix
j mtx_multiply
sb cmd_w0, mvpValid // Delay slot: store the low byte of the command word into mvpValid
.endif
// G_DMA_IO: DMA between DMEM and the DRAM address given (segmented) in cmd_w1_dram.
// The direction flag, DMEM address and length are all decoded from the first command word.
// Tail-calls dma_read_write, which returns to wait_for_dma_and_run_next_command.
G_DMA_IO_handler:
jal segmented_to_physical // Convert the provided segmented address (in cmd_w1_dram) to a physical one
lh dmemAddr, (inputBufferEnd - 0x07)(inputBufferPos) // Delay slot: get the 16 bits in the middle of the command word (since inputBufferPos was already incremented for the next command)
andi dmaLen, cmd_w0, 0x0FF8 // Mask out any bits in the length to ensure 8-byte alignment
// At this point, dmemAddr's highest bit is the flag, its next 13 bits are the DMEM address, and then its last two bits are the upper 2 of size
// So an arithmetic shift right 2 will preserve the flag as being the sign bit and get rid of the 2 size bits, shifting the DMEM address to start at the LSbit
sra dmemAddr, dmemAddr, 2
j dma_read_write // Trigger a DMA read or write, depending on the G_DMA_IO flag (which will occupy the sign bit of dmemAddr)
li $ra, wait_for_dma_and_run_next_command // Delay slot: set up the return address for running the next DL command
// G_GEOMETRYMODE: new mode = (old mode AND cmd_w0) OR cmd_w1_dram.
// The first word acts as a keep-mask and the second word as a set-mask.
G_GEOMETRYMODE_handler:
lw $11, geometryModeLabel // load the geometry mode value
and $11, $11, cmd_w0 // keeps only the flags set in cmd_w0, clearing the rest (used by g*SPClearGeometryMode)
or $11, $11, cmd_w1_dram // sets the flags in cmd_w1_dram (set in g*SPSetGeometryMode)
j run_next_DL_command // run the next DL command
sw $11, geometryModeLabel // Delay slot: update the geometry mode value
// G_ENDDL: pop one return address off the display list stack and continue there.
// With an empty stack, enter overlay 0 to end the task instead.
G_ENDDL_handler:
lbu $1, displayListStackLength // Load the DL stack index
beqz $1, load_overlay_0_and_enter // Load overlay 0 if there is no DL return address, to end the graphics task processing; the delay slot leaves $1 = -4, so $1 < 0
addi $1, $1, -4 // Delay slot: decrement the DL stack index (executed on both paths)
j f3dzex_ovl1_00001020 // has a different version in ovl1
lw taskDataPtr, (displayListStack)($1) // Delay slot: load the address of the DL to return to into the taskDataPtr (the current DL address)
// Handlers that append words to the RDP command buffer at rdpCmdBufPtr. They form a
// fall-through chain, each entry point doing a little less work than the one above it:
//   G_RDPHALF_2_handler  - first copies the doubleword at texrectWord1, then appends
//                          (rdpHalf1Val, cmd_w1_dram) as a second doubleword
//   G_RDP_handler        - appends both command words (cmd_w0, cmd_w1_dram)
//   G_SYNC / G_NOOP      - store only cmd_w0; the second word's slot is left unwritten
// All of them advance rdpCmdBufPtr and finish in check_rdp_buffer_full_and_run_next_cmd.
G_RDPHALF_2_handler:
ldv $v29[0], (texrectWord1)($zero) // Load the saved doubleword starting at texrectWord1
lw cmd_w0, rdpHalf1Val // load the RDPHALF1 value into w0
addi rdpCmdBufPtr, rdpCmdBufPtr, 8 // Reserve a slot for the saved doubleword
sdv $v29[0], -8(rdpCmdBufPtr) // Write it into the slot just reserved
G_RDP_handler:
sw cmd_w1_dram, 4(rdpCmdBufPtr) // Add the second word of the command to the RDP command buffer
G_SYNC_handler:
G_NOOP_handler:
sw cmd_w0, 0(rdpCmdBufPtr) // Add the command word to the RDP command buffer
j check_rdp_buffer_full_and_run_next_cmd
addi rdpCmdBufPtr, rdpCmdBufPtr, 8 // Delay slot: increment the next RDP command pointer by 2 words
// G_SETxIMG: translate the segmented image address in cmd_w1_dram, then send the
// command to the RDP. Done by pointing $ra at G_RDP_handler and falling into the
// translation routine below.
G_SETxIMG_handler:
li $ra, G_RDP_handler // Load the RDP command handler into the return address, then fall through to convert the address
// Converts the segmented address in cmd_w1_dram to the corresponding physical address
// In:  cmd_w1_dram = segmented address (segment index in bits 24-27, offset in bits 0-23)
// Out: cmd_w1_dram = segmentTable[segment index] + offset
// Clobbers: $11
segmented_to_physical:
srl $11, cmd_w1_dram, 22 // Copy (segment index << 2) into $11
andi $11, $11, 0x3C // Clear the bottom 2 bits that remained during the shift (and everything above the 4-bit index)
lw $11, (segmentTable)($11) // Get the current address of the segment
sll cmd_w1_dram, cmd_w1_dram, 8 // Shift the address to the left so that the top 8 bits are shifted out
srl cmd_w1_dram, cmd_w1_dram, 8 // Shift the address back to the right, resulting in the original with the top 8 bits cleared
jr $ra
add cmd_w1_dram, cmd_w1_dram, $11 // Delay slot: add the segment's address to the masked input address, resulting in the physical address
// G_RDPSETOTHERMODE: keep a local copy of both command words in otherMode0/otherMode1,
// then pass the command through to the RDP unchanged.
G_RDPSETOTHERMODE_handler:
sw cmd_w0, otherMode0 // Record the local otherMode0 copy
j G_RDP_handler // Send the command to the RDP
sw cmd_w1_dram, otherMode1 // Delay slot: record the local otherMode1 copy
// G_SETSCISSOR: keep a local copy of both command words in scissorUpLeft and
// scissorBottomRight, then pass the command through to the RDP unchanged.
G_SETSCISSOR_handler:
sw cmd_w0, scissorUpLeft // Record the local scissorUpLeft copy
j G_RDP_handler // Send the command to the RDP
sw cmd_w1_dram, scissorBottomRight // Delay slot: record the local scissorBottomRight copy
check_rdp_buffer_full_and_run_next_cmd:
li $ra, run_next_DL_command // Set up running the next DL command as the return address
.if !CFG_XBUS // FIFO version
check_rdp_buffer_full:
sub $11, rdpCmdBufPtr, rdpCmdBufEnd
blez $11, return_routine // Return if rdpCmdBufEnd >= rdpCmdBufPtr
flush_rdp_buffer:
mfc0 $12, SP_DMA_BUSY
lw cmd_w1_dram, rdpFifoPos
addiu dmaLen, $11, RDP_CMD_BUFSIZE
bnez $12, flush_rdp_buffer
lw $12, OSTask + OSTask_output_buff_size
mtc0 cmd_w1_dram, DPC_END
add $11, cmd_w1_dram, dmaLen
sub $12, $12, $11
bgez $12, f3dzex_000012A8
@@await_start_valid:
mfc0 $11, DPC_STATUS
andi $11, $11, DPC_STATUS_START_VALID
bnez $11, @@await_start_valid
lw cmd_w1_dram, OSTask + OSTask_output_buff
f3dzex_00001298:
mfc0 $11, DPC_CURRENT
beq $11, cmd_w1_dram, f3dzex_00001298
nop
mtc0 cmd_w1_dram, DPC_START
f3dzex_000012A8:
mfc0 $11, DPC_CURRENT
sub $11, $11, cmd_w1_dram
blez $11, f3dzex_000012BC
sub $11, $11, dmaLen