Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fuse write back via VPM #86

Open
nomaddo opened this issue Apr 24, 2018 · 0 comments
Open

Fuse write back via VPM #86

nomaddo opened this issue Apr 24, 2018 · 0 comments
Labels
enhancement optimization related to an optimization step

Comments

@nomaddo
Copy link
Collaborator

nomaddo commented Apr 24, 2018

In testing/bugs/68_remove_moves_of_r4.cl, fusion of the write-backs via VPM succeeds only partially: most of the write-backs are not fused and are still emitted as separate single-row writes.

$ ./build/VC4C --asm -o /tmp/hoge.s testing/bugs/68_remove_moves_of_r4.cl
// Module with 1 kernels, global data with 0 words (64-bit each), starting at offset 1 words and 0 words of stack-frame
// Kernel 'test' with 147 instructions, offset 2, with following parameters: __global in out float* a (4 B, 1 items), int n (4 B, 1 items)
// label: %start_of_function
or ra0, unif, unif
or -, unif, unif
or -, mutex_acq, mutex_acq
ldi vpr_setup, vdr_setup(rows: 1, elements: 1 words, address: h32(0))
ldi vpr_setup, vdr_setup(stride: 0)
or vpr_addr, ra0, ra0                                      // address + 0
or r0, ra0, ra0
add r2, r0, 4 (4)
add r1, r0, 8 (8)
or -, vpr_wait, vpr_wait
ldi vpr_setup, vpm_setup(num: 1, size: 16 words, stride: 1 rows, address: h32(0))
or r0, vpm, vpm
or mutex_rel, 1 (1), 1 (1)
or -, mutex_acq, mutex_acq
ldi vpw_setup, vpm_setup(size: 16 words, stride: 1 rows, address: h32(0))
fadd vpm, r0, 1.000000 (32)
ldi vpw_setup, vdw_setup(rows: 1, elements: 1 words, address: h32(0))
ldi vpw_setup, vdw_setup(stride: 0)
or vpw_addr, ra0, ra0
or -, vpw_wait, vpw_wait
or mutex_rel, 1 (1), 1 (1)
or -, mutex_acq, mutex_acq
ldi vpr_setup, vdr_setup(rows: 1, elements: 1 words, address: h32(0))
ldi vpr_setup, vdr_setup(stride: 0)
or vpr_addr, r2, r2                                          // address + 4
or r0, ra0, ra0
add r3, r0, 12 (12)
or -, vpr_wait, vpr_wait
ldi vpr_setup, vpm_setup(num: 1, size: 16 words, stride: 1 rows, address: h32(0))
or r0, vpm, vpm
or mutex_rel, 1 (1), 1 (1)
or -, mutex_acq, mutex_acq
ldi vpw_setup, vpm_setup(size: 16 words, stride: 1 rows, address: h32(0))
fadd vpm, r0, 1.000000 (32)
ldi vpw_setup, vdw_setup(rows: 1, elements: 1 words, address: h32(0))
ldi vpw_setup, vdw_setup(stride: 0)
or vpw_addr, r2, r2
or -, vpw_wait, vpw_wait
or mutex_rel, 1 (1), 1 (1)
or -, mutex_acq, mutex_acq
ldi vpr_setup, vdr_setup(rows: 1, elements: 1 words, address: h32(0))
ldi vpr_setup, vdr_setup(stride: 0)
or vpr_addr, r1, r1                                           // address + 8
v8adds r0, 8 (8), 8 (8)
add r2, ra0, r0
or -, vpr_wait, vpr_wait
ldi vpr_setup, vpm_setup(num: 1, size: 16 words, stride: 1 rows, address: h32(0))
or r0, vpm, vpm
or mutex_rel, 1 (1), 1 (1)
or -, mutex_acq, mutex_acq
ldi vpw_setup, vpm_setup(size: 16 words, stride: 1 rows, address: h32(0))
fadd vpm, r0, 1.000000 (32)
ldi vpw_setup, vdw_setup(rows: 1, elements: 1 words, address: h32(0))
ldi vpw_setup, vdw_setup(stride: 0)
or vpw_addr, r1, r1
or -, vpw_wait, vpw_wait
or mutex_rel, 1 (1), 1 (1)
or -, mutex_acq, mutex_acq
ldi vpr_setup, vdr_setup(rows: 1, elements: 1 words, address: h32(0))
ldi vpr_setup, vdr_setup(stride: 0)
or vpr_addr, r3, r3                                            // address + 12
v8adds r0, 10 (10), 10 (10)
add r1, ra0, r0
or -, vpr_wait, vpr_wait
ldi vpr_setup, vpm_setup(num: 1, size: 16 words, stride: 1 rows, address: h32(0))
or r0, vpm, vpm
or mutex_rel, 1 (1), 1 (1)
or -, mutex_acq, mutex_acq
ldi vpw_setup, vpm_setup(size: 16 words, stride: 1 rows, address: h32(0))
fadd vpm, r0, 1.000000 (32)
ldi vpw_setup, vdw_setup(rows: 1, elements: 1 words, address: h32(0))
ldi vpw_setup, vdw_setup(stride: 0)
or vpw_addr, r3, r3
or -, vpw_wait, vpw_wait
or mutex_rel, 1 (1), 1 (1)
or -, mutex_acq, mutex_acq
ldi vpr_setup, vdr_setup(rows: 1, elements: 1 words, address: h32(0))
ldi vpr_setup, vdr_setup(stride: 0)
or vpr_addr, r2, r2                                              // address + 16
v8adds r0, 12 (12), 12 (12)
add r3, ra0, r0
or -, vpr_wait, vpr_wait
ldi vpr_setup, vpm_setup(num: 1, size: 16 words, stride: 1 rows, address: h32(0))
or r0, vpm, vpm
or mutex_rel, 1 (1), 1 (1)
or -, mutex_acq, mutex_acq
ldi vpw_setup, vpm_setup(size: 16 words, stride: 1 rows, address: h32(0))
fadd vpm, r0, 1.000000 (32)
ldi vpw_setup, vdw_setup(rows: 1, elements: 1 words, address: h32(0))
ldi vpw_setup, vdw_setup(stride: 0)
or vpw_addr, r2, r2
or -, vpw_wait, vpw_wait
or mutex_rel, 1 (1), 1 (1)
or -, mutex_acq, mutex_acq
ldi vpr_setup, vdr_setup(rows: 1, elements: 1 words, address: h32(0))
ldi vpr_setup, vdr_setup(stride: 0)
or vpr_addr, r1, r1                                               // address + 20
or -, vpr_wait, vpr_wait
ldi vpr_setup, vpm_setup(num: 1, size: 16 words, stride: 1 rows, address: h32(0))
or r0, vpm, vpm
or mutex_rel, 1 (1), 1 (1)
or -, mutex_acq, mutex_acq
ldi vpw_setup, vpm_setup(size: 16 words, stride: 1 rows, address: h32(0))
fadd vpm, r0, 1.000000 (32)
ldi vpw_setup, vdw_setup(rows: 1, elements: 1 words, address: h32(0))
ldi vpw_setup, vdw_setup(stride: 0)
or vpw_addr, r1, r1
or -, vpw_wait, vpw_wait
or mutex_rel, 1 (1), 1 (1)
or -, mutex_acq, mutex_acq
ldi vpr_setup, vdr_setup(rows: 1, elements: 1 words, address: h32(0))
ldi vpr_setup, vdr_setup(stride: 0)
or vpr_addr, r3, r3                                                // address + 24
or -, vpr_wait, vpr_wait
ldi vpr_setup, vpm_setup(num: 1, size: 16 words, stride: 1 rows, address: h32(0))
or r0, vpm, vpm
or mutex_rel, 1 (1), 1 (1)
or -, mutex_acq, mutex_acq
ldi vpw_setup, vpm_setup(size: 16 words, stride: 1 rows, address: h32(0))
fadd vpm, r0, 1.000000 (32)
ldi vpw_setup, vdw_setup(rows: 4, elements: 1 words, address: h32(0))  // only this write-back is fused (4 rows); every earlier one writes a single row
ldi vpw_setup, vdw_setup(stride: 0)
v8adds r0, 14 (14), 14 (14)
add tmu0s, ra0, r0
nop.load_tmu0.never
fadd vpm, r4, 1.000000 (32)
ftoi r0, 32.000000 (37)
add tmu0s, ra0, r0
nop.load_tmu0.never
fadd vpm, r4, 1.000000 (32)
mul24 r0, 6 (6), 6 (6)
add tmu0s, ra0, r0
nop.load_tmu0.never
fadd vpm, r4, 1.000000 (32)
or vpw_addr, r3, r3
or -, vpw_wait, vpw_wait
or mutex_rel, 1 (1), 1 (1)
// label: %end_of_function
or r0, unif, unif
or.setf -, elem_num, r0
brr.ifallzc (pc+4) + -143 // to %start_of_function
nop.never
nop.never
nop.never
not irq, qpu_num
nop.thrend.never
nop.never
nop.never
@nomaddo nomaddo added enhancement optimization related to an optimization step labels Apr 24, 2018
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
enhancement optimization related to an optimization step
Projects
None yet
Development

No branches or pull requests

1 participant