dnl -*- mode: m4; comment-start: "%" -*-
include(`macros.m4')divert(-1)
% $Id: mars-2.m4,v 1.3 1999/04/15 03:01:06 geoffk Exp $

% Key schedule-related data.
% This is the RAM map for the scheduler and the cipher.  Several areas
% share addresses: the cipher temporaries further down reuse the bytes
% that the scheduler uses for v_maxj, v_schcnt, v_schrnd and v_T0i.
define(v_k,0x80)		The key
define(v_T,0x90)		The scheduled key, T_j[0] (6*9 bits packed
%				into 6+1 bytes)
% Note that outside the schedule, 0x96 is free.
define(v_T_hi,0x97)		The high bits of v_Tj0, stored in bits 6-0
				for T_0[0] through T_6[0] (note order is
				reversed!)
define(KEYLEN,4)		Key length in words

define(v_K,0x98)		The next 10 words of key data
%	To be precise, if x=((v_round+2)/4)*8, then this contains words
%       x, x+1, x+2, ..., x+7, x+10, x+8
%	Only the LSBs of the last two words are stored 
% Eight full words take 32 bytes, so the two LSBs sit at v_K+32 and
% v_K+33 (0xB8 and 0xB9), directly below v_round.

define(v_round,0xBA)		The current round number

define(v_maxj,0xBB)		For the scheduler; specifies that v_T
%				is only partially valid (up to T_maxj).
define(v_schcnt,0xBC)		The scheduled word we're looking at
define(v_schrnd,0xBD)		The round of scheduling we're in


% v_T0i is 7 words = 28 bytes, 0xC0 through 0xDB; v_Tij follows it.
define(v_T0i,0xC0)		The expanded key material polynomial (7 words)
define(v_Tij,0xDC)		The current T_j[i] (word) 

define(v_D,0xE0)		The block to encrypt, MSB-first

% The names from here on are cipher-side temporaries.  The first three
% have the same addresses as v_maxj, v_schcnt and v_schrnd above.
% NOTE(review): v_sch_str and v_sch_cnt are not referenced by any code
% visible in this file -- confirm whether they are still needed.
define(v_sch_seenbits,0xBB)	Number of bits the same as this one seen.
define(v_sch_str,0xBC)		The number of stirring rounds to go
define(v_sch_bit,0xBD)		Counter of number of bits in 
%				search for weak key
define(v_sch_cnt,0xBE)		Counter for the scheduler
define(v_sch_j,0xBF)		Low byte of K[i] in pseudo-code
% v_Kt through v_rotvar all start inside the v_T0i area (0xC0-0xDB), so
% their contents do not survive a call to the scheduler.
define(v_Kt,0xC0)		A temporary key word used by the schedule.
define(v_Kt2,0xC4)		Another temporary.

define(v_L,0xC8)		Output words of the E-function
define(v_M,0xCC)
define(v_R,0xD0)
define(v_tmp,0xD4)		Temporary, 4 bytes

define(v_trotate,0xD8)		Temporaries for the rotater.
define(v_rotvar,0xDA)
define(test_ram,0xF0)		Some RAM for the test program.

divert`'dnl
include(`mars-sbox-out.m4')

% Table of v_T0i slot addresses used by the scheduler: entry n, for
% n = 0..45, is v_T0i + ((n+1) mod 7).  The scheduler loads X from this
% table (at offsets v_schcnt and v_schcnt+5) and then reaches the word
% through X.
% Comment recognition is switched off inside the loop body because the
% percent sign is needed there as the modulus operator.
mars_sch_polytable:
forloop(`i',0,45,`dnl`'changecom(`')
	byte	v_T0i+eval((i+1)`%'7)
changecom(`%')')dnl

% Table read with v_schcnt as the offset in mars_sch_loop, where the
% entry is subtracted from v_round+16 to decide whether the current
% schedule word has to be captured into v_K.
% Entry 0 is a 0xFF placeholder; entry n, for n = 1..40, is
% ((7*n) mod 40)*4 + 1.
% Comment recognition is switched off inside the loop body because the
% percent sign is needed there as the modulus operator.
mars_sch_times7table:
	byte	0xFF	% First value is never useful.
forloop(`i',1,40,`dnl`'changecom(`')
	byte	eval(((i*7)`%'40)*4)+1
changecom(`%')')dnl

% MARS key schedule, plus assume v_T is fully valid
% Entry point used from inside the cipher (mars calls it).  It sets
% v_maxj to 6, which marks every entry of v_T as valid, and then runs
% straight on into mars_sch_generic; it has no return of its own.
% Inputs and outputs are those of mars_sch_generic, except that any
% previous value of v_maxj is overwritten.
mars_sch_incipher:
	lda	#6
	sta	v_maxj
% and fall through...

% MARS key-schedule-as-you-go
% Inputs: The key in v_k; a (possibly partial) key schedule in v_T;
%	  the number of valid values in the key schedule in v_maxj;
%	  the round number in v_round (only if the key schedule is
%	  fully valid).
% Output: Either the key schedule, with one more valid value, in v_T;
%	  or the next 10 words of key data in v_K.
% note: v_T is modified and restored during the execution of this routine.
% Scratch: v_T0i, v_Tij, v_schcnt and v_schrnd are written by this
% routine.  The code up to v_T0_done only prepares the working state; the
% per-word loop starts at mars_sch_loop and contains the rts.
mars_sch_generic:

% Initialise v_T0i to be the first few words of the S-box.
% X counts down from 6 and the loop exits when it reaches 0, so words
% 6..1 are copied; word 0 is filled in separately just below.
% NOTE(review): the 0*7 and 0*512 prefixes presumably let the 4-byte
% macros step the leading factor through 0..3, i.e. byte k of a word is at
% base+k*7 in v_T0i and at base+k*512 in the S-box -- confirm in macros.m4.
	ldx	#6
sch_sbox_copy_loop:
	set4(`0*7+v_T0i,X',`0*512+mars_sbox,X')
	decx
	bne	sch_sbox_copy_loop

% Set v_T0i[0].
% NOTE(review): presumably the first key word at v_k XORed with the
% assembly-time constant rotl(S[0]^S[5],3)^0 -- depends on the eor4i and
% rotli macros, which are not visible here.
	eor4i(`0*7+v_T0i',v_k,rotli(mars_sbox_0 ^ mars_sbox_5, 3) ^ 0)

% Copy v_T forward (the big loop will set it back)
% Moves v_T[0..5] up to v_T[1..6], highest entry first so that nothing is
% overwritten before it has been read.
	ldx	#5
keysched_setup_loop:
	lda	v_T,X
	sta	v_T+1,X
	decx
	bpl	keysched_setup_loop
% Set up v_T[0]
% v_T[0] takes the low byte of v_T0i[0] (byte 3).  v_T_hi is moved right
% one place and its bit 7 is then forced to equal bit 0 of byte 2 of
% v_T0i[0], so v_T[0] and that bit together hold the low 9 bits.
% The second bra jumps to the very next instruction; presumably it is
% there so that both arms take the same time.
	lda	3*7+v_T0i
	sta	v_T
	ror	v_T_hi
	brset0	2*7+v_T0i,v_T0_1
	bclr7	v_T_hi
	bra	v_T0_done
v_T0_1:	bset7	v_T_hi
	bra	v_T0_done
v_T0_done:

% Now loop through the key schedule.  Loop through the whole schedule,
% because this saves RAM (we get v_T back at the end).
% v_schcnt numbers the schedule steps from 0; the routine returns from
% inside this loop once v_schcnt has gone past 39.
	clr	v_schcnt
mars_sch_loop:
% At this point, a key word corresponding to v_round%40 will be in v_Tij
% unless either v_maxj != 6, or v_round == 0.
% Words are only captured into v_K when v_maxj is 6.
	lda	v_maxj
	cmp	#6
	bne	this_keyitem_not_important

% We may need this item if it's between (v_round+16)/4 and
% (v_round+16)/4+10, inclusive, and items after
% (v_round+16)/4+7 are treated specially.
% A = v_round + 16 - mars_sch_times7table[v_schcnt]; the carry tests and
% adjustments that follow classify that difference.
	lda	v_round
	add	#16
	ldx	v_schcnt
	sub	mars_sch_times7table,X
	bcc	this_keyitem_not_important
	add	#4*10+1
	bcc	this_keyitem_not_important
	sub	#4*3
	bcc	normal_keyitem
	add	#4*2
	beq	this_keyitem_not_important
% Special item: X becomes the carry left by the add above (0 or 1), and
% only the low byte of the word is kept, at v_K+32 or v_K+33.
	ldx	#0
	rolx
	lda	v_Tij+3
	sta	v_K+32,X
	bra	this_keyitem_not_important % any more
normal_keyitem:
% Ordinary item: A ^ 0x1C is used as the byte offset of the word in v_K.
	eor	#0x1C
	tax
	set4(`v_K,X',v_Tij)

this_keyitem_not_important:
% Choose how the next word is formed, from v_schcnt:
%   below 38 -> normal_keysched_poly
%   38       -> keysched_poly_keylen
%   39       -> keysched_poly_last
%   above 39 -> this pass over the schedule is finished; return.
	lda	v_schcnt
	cmp	#38
	bmi	normal_keysched_poly
	beq	keysched_poly_keylen
	cmp	#39
	beq	keysched_poly_last
	rts

keysched_poly_last:
% The same key-and-constant expression that initialised v_T0i[0] in the
% setup code, this time written to v_Tij.
	eor4i(v_Tij,v_k,rotli(mars_sbox_0 ^ mars_sbox_5, 3) ^ 0)
	jmp	done_keysched_poly
keysched_poly_keylen:
% v_Tij = KEYLEN as a 32-bit value, most significant byte first.
	clr	v_Tij
	clr	v_Tij+1
	clr	v_Tij+2
	lda	#KEYLEN
	sta	v_Tij+3
	bra	done_keysched_poly
normal_keysched_poly:
% v_Tij = slot[v_schcnt] ^ slot[v_schcnt+5], where slot[n] is the v_T0i
% word whose address is entry n of mars_sch_polytable; the result is then
% passed through rotl4 with a count of 3 (presumably a left rotate).
	ldx	v_schcnt
	ldx	mars_sch_polytable,X
	set4(v_Tij,`0*7,X')
	ldx	v_schcnt
	ldx	mars_sch_polytable+5,X
	eor4(v_Tij,v_Tij,`0*7,X')
	rotl4(v_Tij,3)
% XOR v_schcnt+1 into the low byte.
	lda	v_schcnt
	add	#1
	eor	v_Tij+3
	sta	v_Tij+3
% XOR in key word number (v_schcnt+1) mod KEYLEN; two left shifts turn
% the word number into a byte offset from v_k.
	lda	v_schcnt
	add	#1
	and	#ifelse(KEYLEN,4,3,KEYLEN,8,7,oops!)
	lsla
	lsla
	tax
	eor4(v_Tij,v_Tij,`v_k,X')
done_keysched_poly:
% Store the new word into its v_T0i slot (the slot read first above).
	ldx	v_schcnt
	ldx	mars_sch_polytable,X
	set4(`0*7,X',v_Tij)
	
% Stirring rounds 0..v_maxj for this word; X is the round number.
	ldx	#0
keysched_round_loop:
	cpx	v_maxj
	bls	keysched_round_continues

% All rounds done.  Each round moved v_T_hi left one place; shift it
% left once more for every X from here up to 7 so that it has moved
% eight places in total, then go on to the next schedule step.
keysched_rotfix_loop:
	lsl	v_T_hi
	incx
	cpx	#8
	bne	keysched_rotfix_loop

	inc	v_schcnt
	jmp	mars_sch_loop

keysched_round_continues:
	stx	v_schrnd
% X = v_T[round]; bit 7 of v_T_hi chooses between the first and second
% 256 S-box entries, so the two together act as a 9-bit S-box number.
% The selected entry is added to v_Tij.
	ldx	v_T,X
	brset7	v_T_hi,keysched_sbox1
	add4(v_Tij,v_Tij,`0*512+mars_sbox,X')
	bra	keysched_sbox_done
keysched_sbox1:
	add4(v_Tij,v_Tij,`0*512+mars_sbox+256,X')
	bra	keysched_sbox_done   % for timing independence
keysched_sbox_done:
% NOTE(review): rotbl4 followed by the lslx/rol chain presumably rotates
% v_Tij left by 9 bits in total (one byte, then one bit taken from X) --
% confirm against macros.m4.
	rotbl4(v_Tij,1,`,X')
	lslx
	rol	3+v_Tij
	rol	2+v_Tij
	rol	1+v_Tij
	rol	v_Tij

% Save the low 9 bits of the result: the low byte goes to v_T[round] and
% bit 0 of the next byte goes, through the carry, into bit 0 of v_T_hi
% (which also pushes out the bit 7 that was just consumed).
	ldx	v_schrnd
	lda	v_Tij+3
	sta	v_T,X
	lda	v_Tij+2
	rora
	rol	v_T_hi

	incx
	jmp	keysched_round_loop


% MARS encryption
% Input: The key in v_k and its packed schedule in v_T (both are read by
%	 mars_sch_incipher, which this routine calls to fill v_K);
%	 the block to encrypt in v_D
% Output: The encrypted block in v_D
% Scratch: v_round, v_K and the cipher temporaries are overwritten.
mars:
% v_round = -16, so that v_round+16 is 0 inside the scheduler call.
	lda	#0-16
	sta	v_round
	jsr	mars_sch_incipher

% Phase 1, forwards mixing
% First add subkeys to data
% X = 12, 8, 4, 0 selects each word of v_D and the word at the same
% offset in v_K; the loop ends when the subtraction takes X below zero.
	ldx	#3*4
subkey_add_loop:
	add4(`v_D,X',`v_D,X',`v_K,X')
	txa
	sub	#4
	tax
	bpl	subkey_add_loop

% Then do eight rounds of forward mixing
% v_round counts down from 7 to 0 here.
	lda	#7
	sta	v_round
forwards_mix_loop:
% Each byte of v_D[0] selects an S-box entry (from mars_sbox or from
% mars_sbox+256) that is XORed or added into v_D[1], v_D[2] or v_D[3].
% The bytes are also saved into v_tmp one position towards the MSB, with
% the top byte wrapping round to v_tmp+3.
	ldx	v_D+3
	stx	v_tmp+2
	eor4(`v_D+4',`v_D+4',`0*512+mars_sbox,X')
	ldx	v_D+2
	stx	v_tmp+1
	add4(`v_D+4',`v_D+4',`0*512+mars_sbox+256,X')
	ldx	v_D+1
	stx	v_tmp+0
	add4(`v_D+8',`v_D+8',`0*512+mars_sbox,X')
	ldx	v_D
	stx	v_tmp+3
	eor4(`v_D+12',`v_D+12',`0*512+mars_sbox+256,X')
% Round-dependent extra step on v_tmp: when (v_round & 3) is 2 add
% v_D[1], when it is 3 add v_D[3], otherwise do nothing.
	lda	v_round
	and	#3
	cmp	#2
	bmi	forwards_rounddep_done
	bne	forwards_not_15
	add4(v_tmp,v_tmp,`v_D+4')
	bra	forwards_rounddep_done
forwards_not_15:
	add4(v_tmp,v_tmp,`v_D+12')
forwards_rounddep_done:
	
% Move the words down one place: v_D[0..2] = v_D[1..3], v_D[3] = v_tmp.
	ldx	#3
forwards_shift_loop:
	lda	v_D+4,X
	sta	v_D,X
	lda	v_D+8,X
	sta	v_D+4,X
	lda	v_D+12,X
	sta	v_D+8,X
	lda	v_tmp,X
	sta	v_D+12,X
	decx
	bpl	forwards_shift_loop

	dec	v_round
	bmi	forwards_mix_loop_done
	jmp	forwards_mix_loop
forwards_mix_loop_done:

% Do 16 rounds of keyed transformation
% v_round holds 8 times the round number in this phase: it starts at 0,
% goes up by 8 per round, and the loop ends when it reaches 128 (tested
% with bmi at the bottom).
	clr	v_round
keyed_transform_loop:

% Fetch the next batch of key words whenever (v_round+16) & 0x18 is zero,
% which happens on every fourth round (v_round = 16, 48, 80, 112).
	lda	v_round
	add	#16
	bit	#0x18
	bne	no_sched_needed_now
	jsr	mars_sch_incipher
no_sched_needed_now:

% Compute the E-function
% X = (v_round+16) & 0x18 is the byte offset in v_K of this round's pair
% of key words: the first is added to v_D[0] to give v_M, the second is
% copied to v_L.
	lda	v_round
	add	#16
	and	#0x18
	tax
	add4(v_M,v_D,`v_K,X')
	set4(v_L,`v_K+4,X')

% Now, `fix' the word in v_L, by removing sequences of 10+ 0s or 1s.
% First, set the two least-significant-bits to 1
% The unmodified low byte is kept in v_sch_j, and v_Kt receives a copy of
% the word with the two bits set.
forloop(`i',0,3,`dnl
	lda	v_L+i
ifelse(i,3,`dnl
	sta	v_sch_j
	ora	#3
	sta	v_L+i
')dnl
	sta	v_Kt+i
')dnl

% Set v_Kt to be v_Kt ^ (v_Kt >> 1)
% A 0 bit in the result therefore marks two equal neighbouring bits.
forloop(`i',0,3,`dnl
	lda	v_Kt+i
ifelse(i,0,`dnl
	lsr	v_Kt+i
',`dnl
	ror	v_Kt+i
')dnl
	eor	v_Kt+i
	sta	v_Kt+i
')dnl

% Set a bit in v_Kt2 if the corresponding bit in v_Kt is part of a sequence
% of 10 or more consecutive 0 bits.
% v_sch_bit counts the 32 bit positions; v_sch_seenbits is the length of
% the current run of 0 bits.  The loop uses no branches on the data bits.
	lda	#32
	sta	v_sch_bit
	clr	v_sch_seenbits
sch_bit_counting_loop:
	shr4(v_Kt2,1)
	shr4(v_Kt,1)	% also copies the low bit into the carry flag.
	lda	#0
	adc	#0xFF	% A is now 0xFF if the bit was 0, 0x00 if it was 1.
	and	v_sch_seenbits
	sbc	#0xFF	% Add 1 only if the bit was 0.
	sta	v_sch_seenbits
	cmp	#9
	lda	#0
	adc	#0xFF	% A is now 0xFF if 9 or more 0 bits have been seen.
	ora	v_Kt2
	sta	v_Kt2
	dec	v_sch_bit
	bne	sch_bit_counting_loop

% Bit 1 of the low byte of the mask is always cleared.
	bclr1	v_Kt2+3
	
% Generate the masked word and eor it with the schedule item
% v_Kt = S-box word number 265 + (low two bits of the saved key byte).
	lda	v_sch_j
	and	#3
	tax
	set4(v_Kt,`0*512+mars_sbox+265,X')
% Work out the address of the byte that holds the rotation count.
% brset4 also copies bit 4 of v_round into the carry.  With bit 4 set the
% byte is at v_K+16+3 or v_K+24+3, chosen by v_round & 8.  With bit 4
% clear it is one of the two low-byte-only entries: the sbc leaves a
% borrow only when v_round & 8 is 0, giving v_K+33 in that case and
% v_K+32 otherwise.
	lda	v_round
	and	#0x8
	brset4	v_round,no_annoying_rotcount
	sbc	#8
	lda	#v_K+32
	adc	#0
	bra	rotcount_found
no_annoying_rotcount:
	add	#v_K+16+3
rotcount_found:
% Rotate v_Kt left by the count just located.
	tax
	lda	,X
	ldx	#v_Kt
	jsr	dorotate

% v_L ^= v_Kt & v_Kt2
forloop(`i',0,3,`dnl
	lda	v_Kt2+i
	and	v_Kt+i
	eor	v_L+i
	sta	v_L+i
')dnl

% Compute v_D[0] = v_D[0] << 13
% NOTE(review): presumably done as a 2-byte rotate (rotbl4) followed by
% three single-bit right rotates through X -- confirm against macros.m4.
	rotbl4(v_D,2,`,X')
forloop(`i',0,2,`dnl
	lsrx
	ror	v_D+1
	ror	v_D+2
	ror	v_D+3
	ror	v_D
')dnl

% v_R = v_D[0] * v_L (mul4; presumably the low 32 bits of the product).
	mul4(v_R,v_D,v_L)
% v_L = the S-box entry chosen by the low 9 bits of v_M: v_M+3 is the
% offset and bit 0 of v_M+2 picks mars_sbox or mars_sbox+256.  Exactly
% one of the two copies below is executed.
	ldx	v_M+3
	brset0	v_M+2,e_use_s1
	set4(v_L,`0*512+mars_sbox,X')
e_use_s1:
	brclr0	v_M+2,e_used_s0
	set4(v_L,`0*512+256+mars_sbox,X')
e_used_s0:
	rotl4(v_R,5)
% Rotate v_M left by the amount in the low byte of v_R.
	lda	v_R+3
	ldx	#v_M
	jsr	dorotate
	eor4(v_L,v_L,v_R)
	rotbl4(v_R,1,`,X')
	rotr4(v_R,3)
	eor4(v_L,v_L,v_R)
% Rotate v_L left by the amount in the low byte of v_R.
	lda	v_R+3
	ldx	#v_L
	jsr	dorotate

% Combine the three outputs with the data words.  Bit 6 of v_round is set
% from round 8 onwards, which swaps the roles of v_L and v_R.  In both
% cases v_L ends up holding the new value derived from v_D[1].
	add4(v_D+8,v_D+8,v_M)
	brset6	v_round,backward_mode
	add4(v_L,v_D+4,v_L)
	eor4(v_D+12,v_D+12,v_R)
	bra	done_addLR
backward_mode:
	add4(v_D+12,v_D+12,v_L)
	eor4(v_L,v_D+4,v_R)
done_addLR:
% Rotate the words: v_D[1] = v_D[2], v_D[2] = v_D[3], v_D[3] = v_D[0],
% v_D[0] = v_L.
	ldx	#3
keyed_shift_loop:
	lda	v_D+8,X
	sta	v_D+4,X
	lda	v_D+12,X
	sta	v_D+8,X
	lda	v_D,X
	sta	v_D+12,X
	lda	v_L,X
	sta	v_D,X
	decx
	bpl	keyed_shift_loop

	lda	v_round
	add	#8
	sta	v_round
	bmi	keyed_transform_loop_done
	jmp	keyed_transform_loop
keyed_transform_loop_done:

% Do eight rounds of backward mixing
% v_round counts up from 0 to 7 here.
	clr	v_round
backwards_mix_loop:
% Round-dependent first step on v_D[0]: when (v_round & 3) is 2 subtract
% v_D[3], when it is 3 subtract v_D[1], otherwise do nothing.
	lda	v_round
	and	#3
	cmp	#2
	bmi	backwards_rounddep_done
	bne	backwards_not_26
	sub4(v_D,v_D,v_D+12)
	bra	backwards_rounddep_done
backwards_not_26:
	sub4(v_D,v_D,v_D+4)
backwards_rounddep_done:
	
% Each byte of v_D[0] selects an S-box entry (from mars_sbox or from
% mars_sbox+256) that is XORed into or subtracted from v_D[1], v_D[2] or
% v_D[3].  The bytes are also saved into v_tmp one position towards the
% LSB, with the low byte wrapping round to v_tmp+0.
	ldx	v_D+3
	stx	v_tmp+0
	eor4(`v_D+4',`v_D+4',`0*512+mars_sbox+256,X')
	ldx	v_D
	stx	v_tmp+1
	sub4(`v_D+8',`v_D+8',`0*512+mars_sbox,X')
	ldx	v_D+1
	stx	v_tmp+2
	sub4(`v_D+12',`v_D+12',`0*512+mars_sbox+256,X')
	ldx	v_D+2
	stx	v_tmp+3
	eor4(`v_D+12',`v_D+12',`0*512+mars_sbox,X')

% Move the words down one place: v_D[0..2] = v_D[1..3], v_D[3] = v_tmp.
	ldx	#3
backwards_shift_loop:
	lda	v_D+4,X
	sta	v_D,X
	lda	v_D+8,X
	sta	v_D+4,X
	lda	v_D+12,X
	sta	v_D+8,X
	lda	v_tmp,X
	sta	v_D+12,X
	decx
	bpl	backwards_shift_loop

	inc	v_round
	lda	v_round
	cmp	#8
	beq	backwards_mix_loop_done
	jmp	backwards_mix_loop
backwards_mix_loop_done:
	
% Then subtract subkeys from data
% X = 12, 8, 4, 0 selects each word of v_D; the matching key words are
% the four that start at v_K+16.  The loop ends when the subtraction
% takes X below zero.
	ldx	#3*4
subkey_sub_loop:
	sub4(`v_D,X',`v_D,X',`v_K+eval(4*4),X')
	txa
	sub	#4
	tax
	bpl	subkey_sub_loop
	
% We're done!
% This rts is the return from the mars routine; the result is in v_D.
	rts

%  Rotate the variable at X left by A.
% Called with jsr from the E-function code in mars, each time with the
% address of a 4-byte variable (v_Kt, v_M or v_L) in X and the count in A.
% The body is produced by the variable_rotate macro, which is not defined
% in the code visible here.
% NOTE(review): v_trotate and v_rotvar are presumably its scratch bytes,
% and only the low bits of A presumably count -- confirm in macros.m4.
dorotate: variable_rotate

% The MARS key schedule
% Input: The key (128 bits) in v_k
% Output: The key scheduled into v_T
% Runs the generic scheduler six times with v_maxj stepping from 0 to 5,
% each pass making one more entry of v_T valid.  v_maxj is 6 on return.
mars_schedule:
	clr	v_maxj
mars_schedule_loop:
	jsr	mars_sch_generic
% Bump the pass counter through the accumulator and stop after pass 5.
	lda	v_maxj
	inca
	sta	v_maxj
	cmp	#6
	bne	mars_schedule_loop
	rts

% Test harness, generated by the test_program macro (not defined in the
% code visible here).
% NOTE(review): the arguments presumably mean: scratch RAM at test_ram,
% a 16-byte key loaded to v_k, a 16-byte input loaded to v_D, a 16-byte
% result read back from v_D, then the setup call and the call under test
% -- confirm against macros.m4.
test_program(test_ram,v_k,16,v_D,v_D,16,jsr mars_schedule,jsr mars)

% Test vectors, three 16-byte fields each; presumably key, plaintext and
% expected ciphertext, matching the sizes given to test_program.
% The vectors are chained: the third field of each of the first two
% vectors is the second field of the next, the third field of vector 3
% is the key of vectors 4 and 5, and the third field of vector 4 is the
% second field of vector 5.
test_data:
xbytes(00000000000000000000000000000000
 00000000000000000000000000000000  deb35132 83c296de 39069e6b 994c2438)
xbytes(00000000000000000000000000000000
 deb35132 83c296de 39069e6b 994c2438  64fc8e9c b429181f 72141f4b bf87af3b)
xbytes(00000000000000000000000000000000
 64fc8e9c b429181f 72141f4b bf87af3b  c97823ec f4435929 4a2e679f 38174e5b)
xbytes(c97823ec f4435929 4a2e679f 38174e5b
 00000000000000000000000000000000  31fd9f10 284745f2 77b4d619 e92474b5)
xbytes(c97823ec f4435929 4a2e679f 38174e5b
 31fd9f10 284745f2 77b4d619 e92474b5  b694cfd6 22349ffa 834f2121 35c92a84)
test_data_end:
