display | more...

Normally, 8-bit processors perform multiplication with the long multiplication method implemented in software. This is slow, to the tune of 200 or so cycles per multiply. Often, the best optimization is a new algorithm. Note that

(a + x)2 = a2 + x2 + 2ax
Solve for ax, and a multiplication becomes
ax = (a + x)2/2 - a2/2 + x2/2
that is, a couple adds and squares. Squares are easy to optimize into lookup tables, and this is where it gets fast. Here's the code for the 6502 processor (and variants such as 6510, 65c02, and 2a03):

; You can change these for whatever system you're working on.
mul_factor_a   = $f0
mul_factor_x   = $f1
mul_product_lo = $f2
mul_product_hi = $f3

;
; mul
; Wicked fast LUT-based multiplication
; input: a = factor a; x = factor b
; output: mul_factor_a = factor a; mul_factor_x = x = factor x;
;         mul_product_lo = low byte of product a*b;
;         mul_product_hi = a = high byte of product
; preserved: x, y
; max cycles: under 90
;
mul
  sta mul_factor_a      ; setup: 6 cycles
  stx mul_factor_x

  clc                   ; (a + x)^2/2: 23 cycles
  adc mul_factor_x
  tax
  bcc +
  lda mul_hibyte512,x
  bcs ++
+
  lda mul_hibyte256,x
  sec
++
  sta mul_product_hi
  lda mul_lobyte256,x

  ldx mul_factor_a      ; - a^2/2: 20 cycles
  sbc mul_lobyte256,x
  sta mul_product_lo
  lda mul_product_hi
  sbc mul_hibyte256,x
  sta mul_product_hi

  ldx mul_factor_x      ; + x & a & 1: 22 cycles
  txa                   ; (this is a kludge to correct a
  and mul_factor_a      ; roundoff error that makes odd * odd too low)
  and #1

  clc
  adc mul_product_lo
  bcc +
  inc mul_product_hi
+
  sec                   ; - x^2/2: 25 cycles
  sbc mul_lobyte256,x
  sta mul_product_lo
  lda mul_product_hi
  sbc mul_hibyte256,x
  sta mul_product_hi
  rts

; here are the big fat lookup tables
mul_lobyte256
  .db   0,  1,  2,  5,  8, 13, 18, 25, 32, 41, 50, 61, 72, 85, 98,113
  .db 128,145,162,181,200,221,242,  9, 32, 57, 82,109,136,165,194,225
  .db   0, 33, 66,101,136,173,210,249, 32, 73,114,157,200,245, 34, 81
  .db 128,177,226, 21, 72,125,178,233, 32, 89,146,205,  8, 69,130,193
  .db   0, 65,130,197,  8, 77,146,217, 32,105,178,253, 72,149,226, 49
  .db 128,209, 34,117,200, 29,114,201, 32,121,210, 45,136,229, 66,161
  .db   0, 97,194, 37,136,237, 82,185, 32,137,242, 93,200, 53,162, 17
  .db 128,241, 98,213, 72,189, 50,169, 32,153, 18,141,  8,133,  2,129
  .db   0,129,  2,133,  8,141, 18,153, 32,169, 50,189, 72,213, 98,241
  .db 128, 17,162, 53,200, 93,242,137, 32,185, 82,237,136, 37,194, 97
  .db   0,161, 66,229,136, 45,210,121, 32,201,114, 29,200,117, 34,209
  .db 128, 49,226,149, 72,253,178,105, 32,217,146, 77,  8,197,130, 65
  .db   0,193,130, 69,  8,205,146, 89, 32,233,178,125, 72, 21,226,177
  .db 128, 81, 34,245,200,157,114, 73, 32,249,210,173,136,101, 66, 33
  .db   0,225,194,165,136,109, 82, 57, 32,  9,242,221,200,181,162,145
  .db 128,113, 98, 85, 72, 61, 50, 41, 32, 25, 18, 13,  8,  5,  2,  1
mul_hibyte256
  .db   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
  .db   0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  1,  1,  1,  1,  1
  .db   2,  2,  2,  2,  2,  2,  2,  2,  3,  3,  3,  3,  3,  3,  4,  4
  .db   4,  4,  4,  5,  5,  5,  5,  5,  6,  6,  6,  6,  7,  7,  7,  7
  .db   8,  8,  8,  8,  9,  9,  9,  9, 10, 10, 10, 10, 11, 11, 11, 12
  .db  12, 12, 13, 13, 13, 14, 14, 14, 15, 15, 15, 16, 16, 16, 17, 17
  .db  18, 18, 18, 19, 19, 19, 20, 20, 21, 21, 21, 22, 22, 23, 23, 24
  .db  24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31
  .db  32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38, 38, 39, 39
  .db  40, 41, 41, 42, 42, 43, 43, 44, 45, 45, 46, 46, 47, 48, 48, 49
  .db  50, 50, 51, 51, 52, 53, 53, 54, 55, 55, 56, 57, 57, 58, 59, 59
  .db  60, 61, 61, 62, 63, 63, 64, 65, 66, 66, 67, 68, 69, 69, 70, 71
  .db  72, 72, 73, 74, 75, 75, 76, 77, 78, 78, 79, 80, 81, 82, 82, 83
  .db  84, 85, 86, 86, 87, 88, 89, 90, 91, 91, 92, 93, 94, 95, 96, 97
  .db  98, 98, 99,100,101,102,103,104,105,106,106,107,108,109,110,111
  .db 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127
mul_hibyte512
  .db 128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143
  .db 144,145,146,147,148,149,150,152,153,154,155,156,157,158,159,160
  .db 162,163,164,165,166,167,168,169,171,172,173,174,175,176,178,179
  .db 180,181,182,184,185,186,187,188,190,191,192,193,195,196,197,198
  .db 200,201,202,203,205,206,207,208,210,211,212,213,215,216,217,219
  .db 220,221,223,224,225,227,228,229,231,232,233,235,236,237,239,240
  .db 242,243,244,246,247,248,250,251,253,254,255,  1,  2,  4,  5,  7
  .db   8,  9, 11, 12, 14, 15, 17, 18, 20, 21, 23, 24, 26, 27, 29, 30
  .db  32, 33, 35, 36, 38, 39, 41, 42, 44, 45, 47, 48, 50, 51, 53, 54
  .db  56, 58, 59, 61, 62, 64, 65, 67, 69, 70, 72, 73, 75, 77, 78, 80
  .db  82, 83, 85, 86, 88, 90, 91, 93, 95, 96, 98,100,101,103,105,106
  .db 108,110,111,113,115,116,118,120,122,123,125,127,129,130,132,134
  .db 136,137,139,141,143,144,146,148,150,151,153,155,157,159,160,162
  .db 164,166,168,169,171,173,175,177,179,180,182,184,186,188,190,192
  .db 194,195,197,199,201,203,205,207,209,211,212,214,216,218,220,222
  .db 224,226,228,230,232,234,236,238,240,242,244,246,248,250,252,254

You can find test code for this subroutine, as well as C code to generate the lookup tables, at http://www.cs.rose-hulman.edu/~yerricde/.

Squeezing a multiplication into 90 cycles is a Good Thing. Unfortunately, this algorithm is not good for microcontrollers with limited ROM space, but for a few more cycles, the tables' size could be cut in half (the first is symmetrical, and the third is the second plus 128 plus 2i). But for microcontrollers that need fast multiplication that cannot be strength-reduced to addition, it'd probably be best to put a multiplier in the chipset. This is the approach taken by the customized 65c816 processor in the Super NES game console.

I also don't see a generalization to division.

Basic idea is the same as detailed in Fast 6502 Multiplication:

ab = [(a + b)² - (a - b)²]/4

But substitute:

m = a/2;
n = b/2;

Then:

ab = (m + n)² - (m - n)² + c

"c" is a correction factor - details below.

Advantages:

  • Lookup tables fit in 2 × 256 bytes total, so less memory.
  • Lookup tables are squares that are not multiplied or divided by anything, so can be reused more easily in other functions.
Disadvantages:
  • Must correct for odd "a" and/or odd "b" as follows:

If a is odd, then c = b
If b is odd, then c = a
If a and b are both odd, then c = a + b - 1

Average case is 127 cycles, with a low of 104 and a high of 151.

Code follows. Compatible with the 6502 emulator at http://skilldrick.github.io/easy6502/.


; multiplies var_a by var_b
; result stored in var_r and var_s
; code is compatible with http://skilldrick.github.io/easy6502/
* = $80
var_a:
DCB 0
var_b:
DCB 0
var_m:
DCB 0
var_n:
DCB 0
var_r:
DCB 0
var_s:
DCB 0

* = $600
LDA #15
STA var_a
LDA #16
STA var_b
JSR multiply
BRK

multiply:

;if a & b are odd, r = -1, else r = 0
LDA var_a
AND var_b
AND #1
BEQ _1
LDA #255

_1:
STA var_r
STA var_s

;m = a / 2
LDA var_a
LSR A
STA var_m
BCC _2

;a is odd, r += b
CLC
LDA var_r
ADC var_b
STA var_r
LDA var_s
ADC #0
STA var_s

_2:
;n = b / 2
LDA var_b
LSR A
STA var_n
BCC _3

;b is odd, r += a
CLC
LDA var_r
ADC var_a
STA var_r
LDA var_s
ADC #0
STA var_s

_3:
;normalize so that m >= n
LDA var_m
CMP var_n
BCS _4

;n > m, swap m and n
TAY
LDA var_n
STA var_m
STY var_n

_4:
;r += (m + n)^2
CLC
LDA var_m
ADC var_n
TAY
LDA var_r
ADC sqr_lo,Y
STA var_r
LDA var_s
ADC sqr_hi,Y
STA var_s

;r -= (m - n)^2
SEC
LDA var_m
SBC var_n
TAY
LDA var_r
SBC sqr_lo,Y
STA var_r
LDA var_s
SBC sqr_hi,Y
STA var_s

;done!
RTS

sqr_lo:
DCB 0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 100, 121, 144, 169, 196, 225
DCB 0, 33, 68, 105, 144, 185, 228, 17, 64, 113, 164, 217, 16, 73, 132, 193
DCB 0, 65, 132, 201, 16, 89, 164, 241, 64, 145, 228, 57, 144, 233, 68, 161
DCB 0, 97, 196, 41, 144, 249, 100, 209, 64, 177, 36, 153, 16, 137, 4, 129
DCB 0, 129, 4, 137, 16, 153, 36, 177, 64, 209, 100, 249, 144, 41, 196, 97
DCB 0, 161, 68, 233, 144, 57, 228, 145, 64, 241, 164, 89, 16, 201, 132, 65
DCB 0, 193, 132, 73, 16, 217, 164, 113, 64, 17, 228, 185, 144, 105, 68, 33
DCB 0, 225, 196, 169, 144, 121, 100, 81, 64, 49, 36, 25, 16, 9, 4, 1
DCB 0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 100, 121, 144, 169, 196, 225
DCB 0, 33, 68, 105, 144, 185, 228, 17, 64, 113, 164, 217, 16, 73, 132, 193
DCB 0, 65, 132, 201, 16, 89, 164, 241, 64, 145, 228, 57, 144, 233, 68, 161
DCB 0, 97, 196, 41, 144, 249, 100, 209, 64, 177, 36, 153, 16, 137, 4, 129
DCB 0, 129, 4, 137, 16, 153, 36, 177, 64, 209, 100, 249, 144, 41, 196, 97
DCB 0, 161, 68, 233, 144, 57, 228, 145, 64, 241, 164, 89, 16, 201, 132, 65
DCB 0, 193, 132, 73, 16, 217, 164, 113, 64, 17, 228, 185, 144, 105, 68, 33
DCB 0, 225, 196, 169, 144, 121, 100, 81, 64, 49, 36, 25, 16, 9, 4, 1

sqr_hi:
DCB 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
DCB 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3
DCB 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8
DCB 9, 9, 9, 10, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15
DCB 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 23, 23, 24
DCB 25, 25, 26, 26, 27, 28, 28, 29, 30, 30, 31, 32, 33, 33, 34, 35
DCB 36, 36, 37, 38, 39, 39, 40, 41, 42, 43, 43, 44, 45, 46, 47, 48
DCB 49, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63
DCB 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79
DCB 81, 82, 83, 84, 85, 86, 87, 89, 90, 91, 92, 93, 95, 96, 97, 98
DCB 100, 101, 102, 103, 105, 106, 107, 108, 110, 111, 112, 114, 115, 116, 118, 119
DCB 121, 122, 123, 125, 126, 127, 129, 130, 132, 133, 135, 136, 138, 139, 141, 142
DCB 144, 145, 147, 148, 150, 151, 153, 154, 156, 157, 159, 160, 162, 164, 165, 167
DCB 169, 170, 172, 173, 175, 177, 178, 180, 182, 183, 185, 187, 189, 190, 192, 194
DCB 196, 197, 199, 201, 203, 204, 206, 208, 210, 212, 213, 215, 217, 219, 221, 223
DCB 225, 226, 228, 230, 232, 234, 236, 238, 240, 242, 244, 246, 248, 250, 252, 254

Here is an fast unsigned 8-bit in / 16-bit out multiplication routine using 512 bytes of lookup in 116-119 cycles and 4 zero page registers.

It basically works like long multiplication with nibbles.

In:
a0 = first factor
a1 = second factor
Out:
r0 = product low
r1 = product high

a0 * a1 = { r0, r1 }

It is calculated in 4 stages:

A: { r1_l, r0_h } = a0_l * a1_h
B: { r1_l, r0_h } = a0_h * a1_l
C: { r0_h, r0_l } = a0_l * a1_l
D: { r1_h, r1_l } = a0_h * a1_l

r0_h means r0 high nibble, and r0_l means r0 low nibble.

This example runs in the skilldrick 6502 compiler.

define a0 $00
define a1 $01

define r0 $20
define r1 $21

jmp start

mul:
    ; --- A (21 cycles)
    ldx a0
    txa
    eor a1
    and #$f0
    eor a1
    tay
    lda table_mul, y
    sta r1
    ; --- B (47 cycles)
    txa
    eor a1
    and #$f
    eor a1
    tay
    lda table_mul, y
    clc
    adc r1
    tay
    lda table_flip, y
    tay
    and #$f
    bcc mul_1
    ; add carry to high nibble
    ora #$10
    clc
mul_1:
    sta r1
    tya
    and #$f0
    sta r0
    ; --- C (26 cycles)
    lda table_flip, x
    eor a1
    tax
    and #$f0
    eor a1
    tay
    lda table_mul, y
    adc r0
    sta r0
    ; --- D (19 cycles)
    txa
    and #$f
    eor a1
    tay
    lda table_mul, y
    adc r1
    sta r1
    rts ; mul(a,b)

start:
    lda #8
    sta a0
    lda #8
    sta a1
    jsr mul
    brk ; start()

table_mul:
dcb 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
dcb 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
dcb 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
dcb 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45
dcb 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
dcb 0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75
dcb 0, 6, 12, 18, 24, 30, 36, 42, 48, 54, 60, 66, 72, 78, 84, 90
dcb 0, 7, 14, 21, 28, 35, 42, 49, 56, 63, 70, 77, 84, 91, 98, 105
dcb 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120
dcb 0, 9, 18, 27, 36, 45, 54, 63, 72, 81, 90, 99, 108, 117, 126, 135
dcb 0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150
dcb 0, 11, 22, 33, 44, 55, 66, 77, 88, 99, 110, 121, 132, 143, 154, 165
dcb 0, 12, 24, 36, 48, 60, 72, 84, 96, 108, 120, 132, 144, 156, 168, 180
dcb 0, 13, 26, 39, 52, 65, 78, 91, 104, 117, 130, 143, 156, 169, 182, 195
dcb 0, 14, 28, 42, 56, 70, 84, 98, 112, 126, 140, 154, 168, 182, 196, 210
dcb 0, 15, 30, 45, 60, 75, 90, 105, 120, 135, 150, 165, 180, 195, 210, 225
table_flip:
dcb 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240
dcb 1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, 209, 225, 241
dcb 2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, 178, 194, 210, 226, 242
dcb 3, 19, 35, 51, 67, 83, 99, 115, 131, 147, 163, 179, 195, 211, 227, 243
dcb 4, 20, 36, 52, 68, 84, 100, 116, 132, 148, 164, 180, 196, 212, 228, 244
dcb 5, 21, 37, 53, 69, 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245
dcb 6, 22, 38, 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246
dcb 7, 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247
dcb 8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216, 232, 248
dcb 9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185, 201, 217, 233, 249
dcb 10, 26, 42, 58, 74, 90, 106, 122, 138, 154, 170, 186, 202, 218, 234, 250
dcb 11, 27, 43, 59, 75, 91, 107, 123, 139, 155, 171, 187, 203, 219, 235, 251
dcb 12, 28, 44, 60, 76, 92, 108, 124, 140, 156, 172, 188, 204, 220, 236, 252
dcb 13, 29, 45, 61, 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253
dcb 14, 30, 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254
dcb 15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, 255

Log in or register to write something here or to contact authors.