dnl -*- mode: m4; comment-start: "%" -*-
include(`macros.m4')divert(-1)
% $Id: rc6-2.m4,v 1.2 1999/04/15 04:42:51 geoffk Exp $

% RC6 magic constants for 32-bit words: k_P is P32 and k_Q is Q32 of the
% RC6 specification.  k_P seeds the running schedule word (first word of
% const_initialS) and k_Q is added to that word once per schedule step.
The RC6 constants.
define(k_P, 0xb7e15163)
define(k_Q, 0x9e3779b9)


% RAM layout.  Every address below fits in one byte.  Multi-byte words
% are kept most significant byte first: offset +3 of a 4-byte word is its
% low byte (see the endianness note next to the test data).
define(v_L0, 0xE0)	The key (may be in ROM), 16 bytes.
define(v_Li, 0xB0)	The key schedule, 48 bytes 
			(note that the first 16 bytes is not
			needed outside the encryption).
define(v_S, 0xA0)	More key schedule, 16 bytes
			(note that the first 8 bytes is not needed
			outside the encryption).
			The result of the schedule is in v_S+12.
% The 16-byte block is addressed as four 4-byte words A, B, C and D.
define(v_A, 0x90)	The block to encrypt, 16 bytes.
define(v_B, v_A+4)
define(v_C, v_A+8)
define(v_D, v_A+12)

define(v_t, 0x8C)	Temporary for encrypt, schedule (4 bytes)
define(v_round, 0x8B)	The round number
define(v_crotate, 0x8A)	The amount by which C will be rotated

% The next two bytes are not referenced anywhere in this file; presumably
% they are scratch space for the rotate helper that comes from macros.m4
% -- TODO confirm.
define(v_trotate, 0x89)	Size of rotation
define(v_rotvar, 0x88)	Variable to rotate
% Schedule set-up phase.  It is 0 while encrypting and counts 2, then 1,
% during the two passes of rc6_schedule.  The schedule step tests bit 1
% and bit 0 to return after the first or the second mixing level.
define(v_ssetup, 0x87)

% Handed to the test harness macro as its first argument.
define(test_ram, 0xF0)	Data for test program

% Number of cipher rounds.  The round-counter limits used by the encrypt
% and schedule loops are derived from it (2*NUMROUNDS+2 and 2*NUMROUNDS+3).
define(NUMROUNDS,20)

% compute $1 <- $2*$2*2 + $2
% if x = 2^24 x_3 + 2^16 x_2 + 2^8 x_1 + x_0, then
% 2 x**2 + x =   2^24 (4 x_3 x_0 + 4 x_2 x_1 + x_3) 
%		 + 2^16 (4 x_2 x_0 + 2 x_1 x_1 + x_2)
%		 + 2^8  (4 x_1 x_0 + x_1) 
%		 +      (2 x_0 x_0 + x_0)
define(quad,`dnl
%  Computes 2 x x + x modulo 2^32.
%  First argument is the destination, second the source; both are 4-byte
%  words stored most significant byte first, so offset 0 is x_3 and
%  offset 3 is x_0.  The destination is written while the source is still
%  being read, so they must be different locations.
%  Partial products are formed with mul, used here as X:A = X times A
%  (high byte left in X, low byte in A); A and X are clobbered.
%  Each adc that follows a mul with only sta and txa in between relies on
%  mul leaving the carry clear (as documented for the 68HC05 -- confirm
%  for the actual target part).
%
%  2^24 column, first part; only the low byte of the product is kept.
%  The two bits shifted in from x_2 are the part of 4 x_2 x_0 that
%  overflows out of the 2^16 column.
%  do (4 x_3 + x_2>>6) x_0
	lda	$2+0
	ldx	$2+1
	aslx
	rola
	aslx
	rola
	ldx	$2+3
	mul
	sta	$1+0

%  2^24 column, second part, added into destination byte 0.  The bit
%  shifted in from x_1 is the overflow of 2 x_1 x_1 out of the 2^16 column.
%  do (4 x_2 + x_1>>7) x_1
	lda	$2+1
	ldx	$2+2
	asla
	aslx
	rola
	ldx	$2+2
	mul
	add	$1+0
	sta	$1+0

%  2^16 column: low byte starts destination byte 1, high byte is added
%  into byte 0.  The two bits from x_1 are the overflow of 4 x_1 x_0 out
%  of the 2^8 column.
%  do (4 x_2 + x_1>>6) x_0
	lda	$2+1
	ldx	$2+2
	aslx
	rola
	aslx
	rola
	ldx	$2+3
	mul
	sta	$1+1
	txa
	adc	$1+0
	sta	$1+0

%  2^16 column, second part: low byte into byte 1, high byte plus the
%  carry from that addition into byte 0.
%  do (2 x_1) x_1
	lda	$2+2
	tax
	lsla
	mul
	add	$1+1
	sta	$1+1
	txa
	adc	$1+0
	sta	$1+0
	
%  2^8 column: low byte plus x_1 starts destination byte 2; the high byte
%  and the carry ripple up through bytes 1 and 0.  The bit shifted in from
%  x_0 is the overflow of 2 x_0 x_0 out of the lowest column.
%  do (4 x_1 + x_0>>7) x_0 + x_1
	lda	$2+2
	ldx	$2+3
	asla
	aslx
	rola
	ldx	$2+3
	mul
	add	$2+2
	sta	$1+2
	txa
	adc	$1+1
	sta	$1+1
	lda	#0
	adc	$1+0
	sta	$1+0

%  Lowest column: sec followed by rola builds 2 x_0 + 1 in A.  The low
%  byte of the product is destination byte 3; the high byte is added into
%  byte 2, and x_2 and x_3 are then added with carry into bytes 1 and 0.
%  finally, do (2 x_0 + 1) x_0, and add in the x_3 and x_2 terms
	lda	$2+3
	tax
	sec
	rola
	mul
	sta	$1+3
	txa
	add	$1+2
	sta	$1+2
	lda	$2+1
	adc	$1+1
	sta	$1+1
	lda	$2+0
	adc	$1+0
	sta	$1+0')

%  compute $1 = $1 ^ ($2 <<< 5)
%  and leave the low byte of ($2 <<< 5), taken before the XOR, in the
%  X register; its low 5 bits are what callers use as a rotate count.
%  $2 is passed to the rotate-right helper first, so it is presumably
%  modified in place -- confirm against macros.m4.  A is clobbered.
define(`rot5andxor',`dnl
%  Rotate left by 5 is done as rotate right by 3 plus a one-byte shift:
%  rotate right by 2 with the helper, then one more bit with the rora
%  chain below, reading source byte n+1 for destination byte n.
	rotr4($2,2)
%  Seed the carry for the rora chain.  NOTE(review): this assumes the
%  helper leaves in A a byte whose bit 0 equals bit 0 of the rotated
%  source byte 0 -- confirm against macros.m4.
	lsra
%  destination byte 0 ^= source byte 1 rotated right through carry
	lda	1+$2
	rora
	eor	0+$1
	sta	0+$1
%  destination byte 1 ^= source byte 2
	lda	2+$2
	rora
	eor	1+$1
	sta	1+$1
%  destination byte 2 ^= source byte 3
	lda	3+$2
	rora
	eor	2+$1
	sta	2+$1
%  destination byte 3 ^= source byte 0; X keeps the rotated byte itself
	lda	0+$2
	rora
	tax
	eor	3+$1
	sta	3+$1
')dnl

divert`'dnl

% RC6 key-schedule-as-you-go
% Advances the key schedule by one subkey.
% State: v_S+0 is the running word (Q is added to it on every call);
% v_S+4, v_S+8 and v_S+12 hold the latest subkey of mixing levels 1, 2
% and 3; v_Li+0, v_Li+16 and v_Li+32 hold the matching 4-word key arrays.
% Input: v_round = number of the subkey to produce (used modulo 4 to pick
% the key word).  v_ssetup bit 1 set returns after level 1, bit 0 set
% returns after level 2; with both clear all three levels run.
% Clobbers at least A, X and v_t.
% The second entry point, rc6_sched_step_final, skips the addition of Q.
rc6_sched_step:
	add4i(v_S,v_S,k_Q)
rc6_sched_step_final:
forloop(`j',1,3,`dnl
%  subkey of this level += word of the level below (the running word
%  for the first level)
	add4(v_S+eval(j*4),v_S+eval(j*4),v_S+eval((j-1)*4))
%  X <- address of this level key word number (v_round + 3) mod 4,
%  that is the word updated by the previous step
	lda	v_round
	add	#0x03
	and	#0x03
	lsla
	lsla
	add	#v_Li+eval((j-1)*16)
	tax
%  subkey = (subkey + previous key word) rotated left by 3
	add4(v_S+eval(j*4),v_S+eval(j*4),`0,X')
	rotl4(v_S+eval(j*4),3)
%  v_t = new subkey + previous key word
	add4(v_t,v_S+eval(j*4),`0,X')
%  X <- address of this level key word number v_round mod 4
	lda	v_round
	and	#0x03
	lsla
	lsla
	add	#v_Li+eval((j-1)*16)
	tax
%  key word = (key word + v_t) rotated left by the count in the low
%  byte of v_t
	add4(`0,X',`0,X',v_t)
	lda	v_t+3
	jsr	do_rotate
%  After levels 1 and 2, continue to the next level only when the
%  matching v_ssetup bit is clear; otherwise fall into the rts.
ifelse(j,3,`',`dnl
	brclr`'eval(2-j)	v_ssetup,rc6_sched_more_`'j
')dnl
	rts
rc6_sched_more_`'j:
')dnl

% Initial contents of v_S+0..7, copied in by rc6_ks_setup: P, followed by
% P rotated left by 3.  rc6_ks_final reads the second word on its own as
% const_initialS+4.  Each line below presumably emits one 4-byte word,
% most significant byte first -- confirm against macros.m4.
const_initialS:
	const4(k_P)
	const4(rotli(k_P,3))

% Set up the key schedule.
% Rewrites v_S+0..7 from const_initialS and v_Li+0..15 from the key at
% v_L0; the rest of v_S and v_Li is left untouched.  Clobbers A and X.
rc6_ks_setup:
% One loop copies both 8-byte runs: the two constant words into v_S+0..7
% and key bytes 8..15 into v_Li+8..15.
	ldx	#7
ks_setup_loop:
	lda	const_initialS,X
	sta	v_S,X
	lda	v_L0+8,X
	sta	v_Li+8,X
	decx
	bpl	ks_setup_loop
% First key word is pre-mixed: (key word 0 + (P<<<3)) rotated left by
% (P<<<3) & 31, done below as a rotate right by 32 minus that amount.
	add4i(v_Li,v_L0,eval(rotli(k_P,3)))
% Build-time guard: if that rotate amount is below 28 (so the right
% rotate would exceed 4), the stray text below lands in the output,
% presumably making the assembler reject it.
ifelse(eval(eval(rotli(k_P,3)&31) >= 28),0,
       fixme rotate amount needs tweaking )dnl
	rotr4(v_Li,eval(32-(rotli(k_P,3)&31)))
% Key bytes 4..7 are copied unchanged.
	set4(v_Li+4,v_L0+4)
	rts

% Return the key schedule to its initial state, ready for the
% next run.
% Both callers arrive with v_round at 2*NUMROUNDS+3.  v_ssetup still
% selects how many mixing levels the extra step performs.
% Clobbers at least A, X and v_t.
rc6_ks_final:
% One past the last subkey; the step below only uses v_round modulo 4.
	inc	v_round

% Load P<<<3 (second word of const_initialS) into the running word
% v_S+0, then enter the schedule step past its addition of Q so that
% the word is used as is.
	ldx	#3
ks_final_copy_loop:
	lda	const_initialS+4,X
	sta	v_S,X
	decx
	bpl	ks_final_copy_loop

	jsr	rc6_sched_step_final
	
% Move the saved state up by one level.  X runs from 7 down to 0 and
% each source byte is read before the same pass overwrites it:
%   v_S+4..11    -> v_S+8..15
%   v_Li+16..31  -> v_Li+32..47
%   v_Li+0..15   -> v_Li+16..31
	ldx	#7
ks_final_loop:
	lda	v_S+4,X
	sta	v_S+8,X
	lda	v_Li+16,X
	sta	v_Li+32,X
	lda	v_Li+16+8,X
	sta	v_Li+32+8,X
	lda	v_Li,X
	sta	v_Li+16,X
	lda	v_Li+8,X
	sta	v_Li+16+8,X
	decx
	bpl	ks_final_loop
	rts

% The rc6 encryption
% Input: block to encrypt in v_A, key schedule in v_S
%   (more precisely the parts of v_S and v_Li that rc6_ks_setup does not
%   rewrite: v_S+8..15 and v_Li+16..47, as left by rc6_schedule or by a
%   previous call of this routine; the key itself is read from v_L0).
% Output: encrypted block in v_A.
% Subkeys are generated on the fly, one schedule step per subkey, and the
% schedule is put back into its starting state before returning.
% Clobbers at least A, X, v_t, v_round, v_crotate and v_ssetup.
rc6:
% v_ssetup = 0 makes every schedule step run all three mixing levels.
	clr	v_ssetup
	jsr	rc6_ks_setup

% Do the initial key addition
% B += subkey 0, which is already in v_S+12; then step and D += subkey 1.
	clr	v_round
	add4(v_B,v_B,v_S+eval(4*3))
	inc	v_round
	jsr	rc6_sched_step
	add4(v_D,v_D,v_S+eval(4*3))
	inc	v_round

round_loop:
% v_t = 2 B B + B; A ^= v_t <<< 5; X = low byte of the rotated value,
% saved as the rotate count for C.
	quad(v_t, v_B)
	rot5andxor(v_A,v_t)
	stx	v_crotate
% Same with D and C; this time X is the rotate count for A.
	quad(v_t, v_D)
	rot5andxor(v_C,v_t)
	txa
% A is rotated left by the count derived from D, C by the one from B.
	ldx	#v_A
	jsr	do_rotate
	ldx	#v_C
	lda	v_crotate
	jsr	do_rotate
	
% A += next subkey, C += the one after it.
	jsr	rc6_sched_step
	add4(v_A,v_A,v_S+eval(4*3))
	inc	v_round
	jsr	rc6_sched_step
	add4(v_C,v_C,v_S+eval(4*3))
	inc	v_round

% (A,B,C,D) = (B,C,D,A), one byte lane per pass; the first byte of v_t
% is the scratch that carries the A byte over to D.
	ldx	#3
rotate_loop:
	lda	v_A,X
	sta	v_t
	lda	v_B,X
	sta	v_A,X
	lda	v_C,X
	sta	v_B,X
	lda	v_D,X
	sta	v_C,X
	lda	v_t
	sta	v_D,X
	decx
	bpl	rotate_loop

% v_round starts the loop at 2 and grows by 2 per round, so it equals
% 2*NUMROUNDS+2 once the last round is done.
	lda	v_round
	cmp	#eval(NUMROUNDS*2+2)
	bne	round_continues

% Final key addition: A and C each get one more subkey, then the
% schedule is restored for the next run.
	jsr	rc6_sched_step
	add4(v_A,v_A,v_S+eval(4*3))
	inc	v_round
	jsr	rc6_sched_step
	add4(v_C,v_C,v_S+eval(4*3))
	jsr	rc6_ks_final
	rts
% The loop top is reached with jmp, presumably because it is out of
% range for a relative branch from here.
round_continues:
	jmp	round_loop

%  Rotate the variable at X left by A.
%  The body is supplied by a macro from macros.m4 and is not visible
%  here.  Callers in this file pass the address of a 4-byte word in X and
%  an unmasked count byte in A, so the helper presumably reduces the
%  count modulo 32 -- confirm against macros.m4.
do_rotate: variable_rotate

%  key schedule for rc6
% Input: key in v_L0 (that is where rc6_ks_setup reads it from).
% Output: key schedule state in v_S and v_Li, ready for rc6.
% NOTE(review): this header used to say the key is in v_A and that v_A
% is trashed; nothing visible in this routine or the routines it calls
% reads or writes v_A -- confirm and drop this note.
% Clobbers at least A, X, v_t, v_round and v_ssetup (left at 0).
rc6_schedule:
% Two passes.  With v_ssetup = 2 each step stops after mixing level 1,
% with v_ssetup = 1 after level 2; the third level runs during rc6.
	lda	#2
	sta	v_ssetup
ks_outer_loop:
	jsr	rc6_ks_setup
	clr	v_round
% One step for each subkey number from 1 to 2*NUMROUNDS+3.
ks_inner_loop:
	inc	v_round
	jsr	rc6_sched_step
	lda	v_round
	cmp	#eval(NUMROUNDS*2+3)
	bne	ks_inner_loop

% Shift the finished level up so the next pass, or the encryption,
% starts from it.
	jsr	rc6_ks_final
	
	dec	v_ssetup
	bne	ks_outer_loop	

	rts

% Instantiate the test harness from macros.m4 (not visible here).  The
% arguments name the test RAM, a 16-byte area at v_L0, a 16-byte area at
% v_A given twice, and two calls -- presumably key area, input area,
% output area, key set-up call and encrypt call; confirm against
% macros.m4.
test_program(test_ram,v_L0,16,v_A,v_A,16,jsr rc6_schedule,jsr rc6)

% Note that the implementation here has its endianness reversed.
% Each vector below is three 32-digit hex strings: key, plaintext and
% expected ciphertext (they match the published RC6 test vectors).  The
% helper defined next emits them as 48 data bytes, picking byte number
% XOR 3 (assuming ^ is exclusive-or, as in GNU m4) so that every 4-byte
% group comes out in reverse order.  33 is the distance between strings,
% counting the separating blank.
test_data:
define(t,`dnl
forloop(`i',0,2,`dnl
	forloop(`j',0,15,`byte 0x`'substr($1,eval(i*33+(j^3)*2),2);')
')')dnl
t(00000000000000000000000000000000 00000000000000000000000000000000 dnl
8fc3a53656b1f778c129df4e9848a41e)
t(0123456789abcdef0112233445566778 02132435465768798a9bacbdcedfe0f1 dnl
524e192f4715c6231f51f6367ea43f18)
test_data_end:
