If you use a compiled language, you should periodically look at Godbolt and see what your code is doing and what changes to your code will do in the compiled output.
In this case, a positively insane way of calculating squares and cubes generates 311 lines of ARM assembler output that will swallow your memory. With even something as simple as `-O1` on the command line, it’s replaced by one or two multiplications, respectively. With `-fwhole-program`, the compiler removes the functions entirely and inlines them into the loop in `main()`.
Know your tools. It makes huge differences!
I mean it could hurt:
@-----------------------------------------------------------------------
@ cube
@ Unoptimized compiler output (32-bit ARM, r7 used as frame pointer).
@ In:    r0 = n
@ Out:   r0 = low 32 bits of n * n * n
@ Saves: r4-r10, fp (pushed on entry, popped on exit)
@ Clobb: r1, r2, r3, ip, flags
@ Stack: 32 bytes of saved registers + 112-byte frame, plus a temporary
@        extra adjustment of (n*n*n rounded up to a multiple of 8) bytes
@        that is never written to and is released before returning.
@ Leaf function: makes no calls, so lr is not saved.
@
@ NOTE(review): the shape (three "n - 1" bounds, sizes scaled by 8, a
@ run-time stack adjustment) looks like sizeof on a variable-length
@ array of n*n*n one-byte elements -- the C source is not shown here,
@ so confirm before relying on this.
@
@ Almost everything between the prologue and the final two multiplies
@ computes 64-bit values that are overwritten without being read.
@-----------------------------------------------------------------------
cube:
        push    {r4, r5, r6, r7, r8, r9, r10, fp}
        sub     sp, sp, #112            @ reserve the 112-byte local frame
        add     r7, sp, #0              @ r7 = frame pointer
        str     r0, [r7, #92]           @ spill the argument n
        mov     r3, sp
        mov     ip, r3                  @ ip = sp before the run-time adjustment below

        @ Three working copies of n, all reloaded from the spill slot.
        ldr     r1, [r7, #92]
        ldr     r0, [r7, #92]
        ldr     r6, [r7, #92]

        @ --- n - 1, then (64-bit n) << 3 -----------------------------------
        subs    r3, r1, #1
        str     r3, [r7, #108]          @ n - 1; this slot is never reloaded
        mov     r2, r1
        movs    r3, #0                  @ r3:r2 = n zero-extended to 64 bits
        mov     r4, r2
        mov     r5, r3
        mov     r2, #0
        mov     r3, #0
        lsls    r3, r5, #3
        orr     r3, r3, r4, lsr #29
        lsls    r2, r4, #3              @ r3:r2 = (r5:r4) << 3, overwritten below unread

        @ --- n - 1 again, then 64-bit n * n, then << 3 ----------------------
        subs    r3, r0, #1
        str     r3, [r7, #104]          @ n - 1; this slot is never reloaded
        mov     r2, r1
        movs    r3, #0
        str     r2, [r7, #80]
        str     r3, [r7, #84]           @ [r7+80..87] = n as a 64-bit value
        mov     r2, r0
        movs    r3, #0
        str     r2, [r7, #64]
        str     r3, [r7, #68]           @ [r7+64..71] = n as a 64-bit value
        ldrd    r4, [r7, #80]
        mov     r3, r5
        ldr     r2, [r7, #64]
        mul     r2, r2, r3              @ cross term: low word * high word
        ldr     r3, [r7, #68]
        strd    r4, [r7, #80]
        ldr     r4, [r7, #80]
        mul     r3, r4, r3              @ other cross term
        add     r3, r3, r2
        ldr     r2, [r7, #80]
        ldr     r4, [r7, #64]
        umull   r8, r9, r2, r4          @ r9:r8 = low word * low word (full 64 bits)
        add     r3, r3, r9
        mov     r9, r3                  @ r9:r8 = 64-bit n * n
        mov     r2, #0
        mov     r3, #0
        lsl     r3, r9, #3
        orr     r3, r3, r8, lsr #29
        lsl     r2, r8, #3              @ r3:r2 = (n * n) << 3, overwritten below unread

        @ --- n - 1 a third time, then 64-bit n * n * n, then << 3 -----------
        subs    r3, r6, #1
        str     r3, [r7, #100]          @ n - 1; this slot is never reloaded
        mov     r2, r1
        movs    r3, #0
        str     r2, [r7, #32]
        str     r3, [r7, #36]           @ [r7+32..39] = n as a 64-bit value
        mov     r2, r0
        movs    r3, #0
        str     r2, [r7, #72]
        str     r3, [r7, #76]           @ [r7+72..79] = n as a 64-bit value
        ldrd    r4, [r7, #32]
        mov     r3, r5
        ldrd    r8, [r7, #72]
        mov     r2, r8
        mul     r2, r2, r3
        strd    r8, [r7, #72]
        ldr     r3, [r7, #76]
        mov     r8, r4
        mov     r9, r5
        mov     r4, r8
        mul     r3, r4, r3
        add     r3, r3, r2
        mov     r2, r8
        ldr     r4, [r7, #72]
        umull   r10, fp, r2, r4
        add     r3, r3, fp
        mov     fp, r3                  @ fp:r10 = 64-bit n * n
        mov     r2, r6
        movs    r3, #0
        str     r2, [r7, #24]
        str     r3, [r7, #28]           @ [r7+24..31] = n as a 64-bit value
        ldrd    r4, [r7, #24]
        mov     r3, r4
        mul     r2, r3, fp
        mov     r3, r5
        mul     r3, r10, r3
        add     r3, r3, r2
        mov     r2, r4
        umull   r4, r2, r10, r2         @ r2:r4 = low(n * n) * low(n)
        str     r2, [r7, #60]
        mov     r2, r4
        str     r2, [r7, #56]
        ldr     r2, [r7, #60]
        add     r3, r3, r2
        str     r3, [r7, #60]           @ [r7+56..63] = 64-bit n * n * n
        mov     r2, #0
        mov     r3, #0
        ldrd    r8, [r7, #56]
        mov     r4, r9
        lsls    r3, r4, #3
        mov     r4, r8
        orr     r3, r3, r4, lsr #29
        mov     r4, r8
        lsls    r2, r4, #3              @ r3:r2 = (n * n * n) << 3, overwritten below unread

        @ --- the same 64-bit n * n * n and << 3, using other stack slots ----
        mov     r2, r1
        movs    r3, #0
        str     r2, [r7, #16]
        str     r3, [r7, #20]           @ [r7+16..23] = n as a 64-bit value
        mov     r2, r0
        movs    r3, #0
        str     r2, [r7, #8]
        str     r3, [r7, #12]           @ [r7+8..15] = n as a 64-bit value
        ldrd    r8, [r7, #16]
        mov     r3, r9
        ldrd    r10, [r7, #8]
        mov     r2, r10
        mul     r2, r2, r3
        mov     r3, fp
        mov     r4, r8
        mul     r3, r4, r3
        add     r3, r3, r2
        mov     r2, r8
        mov     r4, r10
        umull   r4, r2, r2, r4
        str     r2, [r7, #52]
        mov     r2, r4
        str     r2, [r7, #48]
        ldr     r2, [r7, #52]
        add     r3, r3, r2
        str     r3, [r7, #52]           @ [r7+48..55] = 64-bit n * n
        mov     r2, r6
        movs    r3, #0
        str     r2, [r7]
        str     r3, [r7, #4]            @ [r7+0..7] = n as a 64-bit value
        ldrd    r8, [r7, #48]
        mov     r3, r9
        ldrd    r10, [r7]
        mov     r2, r10
        mul     r2, r2, r3
        mov     r3, fp
        mov     r4, r8
        mul     r3, r4, r3
        add     r3, r3, r2
        mov     r2, r8
        mov     r4, r10
        umull   r4, r2, r2, r4
        str     r2, [r7, #44]
        mov     r2, r4
        str     r2, [r7, #40]
        ldr     r2, [r7, #44]
        add     r3, r3, r2
        str     r3, [r7, #44]           @ [r7+40..47] = 64-bit n * n * n
        mov     r2, #0
        mov     r3, #0
        ldrd    r8, [r7, #40]
        mov     r4, r9
        lsls    r3, r4, #3
        mov     r4, r8
        orr     r3, r3, r4, lsr #29
        mov     r4, r8
        lsls    r2, r4, #3              @ r3:r2 = (n * n * n) << 3, overwritten below unread

        @ --- move sp down by n*n*n bytes, rounded up to a multiple of 8 -----
        mov     r3, r1
        mov     r2, r0
        mul     r3, r2, r3
        mov     r2, r6
        mul     r3, r2, r3              @ r3 = n * n * n (32-bit)
        adds    r3, r3, #7
        lsrs    r3, r3, #3
        lsls    r3, r3, #3              @ round up to a multiple of 8
        sub     sp, sp, r3
        mov     r3, sp
        str     r3, [r7, #96]           @ save the new sp; this slot is never reloaded

        @ --- the value that is actually returned ----------------------------
        mov     r3, r1
        mov     r2, r0
        mul     r3, r2, r3
        mov     r2, r6
        mul     r3, r2, r3              @ r3 = n * n * n (low 32 bits)
        mov     sp, ip                  @ undo the run-time sp adjustment
        mov     r0, r3                  @ return value

        @ Epilogue: drop the 112-byte frame and restore the saved registers.
        adds    r7, r7, #112
        mov     sp, r7
        pop     {r4, r5, r6, r7, r8, r9, r10, fp}
        bx      lr