;
; aes_sse41.asm
;
; Author: Pekka Riikonen <priikone@silcnet.org>
;
; Copyright (C) 2008 Pekka Riikonen
;
; This program is free software; you can redistribute it and/or modify
; it under the terms of the GNU General Public License as published by
; the Free Software Foundation; version 2 of the License.
;
; This program is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; GNU General Public License for more details.
;

; SSE4.1 AES implementation.  This includes only the encryption and
; decryption functions.  This file is included from aes_x86_64.asm when
; an SSE4.1-capable CPU was detected during configuration.
;
; Destroys rax, rcx, rdx, r8, r9, r10, xmm6, xmm7, xmm8

; round, %1 = input/output, %2 = index, xmm6 = temp, xmm7 = temp
%macro sse41_rnd 2
    pextrb  rsi, %1, 0
    pextrb  rdi, %1, 1
    pextrb  r9, %1, 2
    pextrb  r10, %1, 3

    pinsrd  xmm6, t_ref(0, rsi), 0
    pinsrd  xmm6, t_ref(1, rdi), 3
    pinsrd  xmm6, t_ref(2, r9), 2
    pinsrd  xmm6, t_ref(3, r10), 1
    pxor    xmm6, fk_ref(%2,0)

    pextrb  rsi, %1, 4
    pextrb  rdi, %1, 5
    pextrb  r9, %1, 6
    pextrb  r10, %1, 7

    pinsrd  xmm7, t_ref(0, rsi), 1
    pinsrd  xmm7, t_ref(1, rdi), 0
    pinsrd  xmm7, t_ref(2, r9), 3
    pinsrd  xmm7, t_ref(3, r10), 2
    pxor    xmm7, xmm6

    pextrb  rsi, %1, 8
    pextrb  rdi, %1, 9
    pextrb  r9, %1, 10
    pextrb  r10, %1, 11

    pinsrd  xmm6, t_ref(0, rsi), 2
    pinsrd  xmm6, t_ref(1, rdi), 1
    pinsrd  xmm6, t_ref(2, r9), 0
    pinsrd  xmm6, t_ref(3, r10), 3
    pxor    xmm7, xmm6

    pextrb  rsi, %1, 12
    pextrb  rdi, %1, 13
    pextrb  r9, %1, 14
    pextrb  r10, %1, 15

    pinsrd  %1, t_ref(0, rsi), 3
    pinsrd  %1, t_ref(1, rdi), 2
    pinsrd  %1, t_ref(2, r9), 1
    pinsrd  %1, t_ref(3, r10), 0
    pxor    %1, xmm7
%endmacro

    section .text align=16
    align   16
aes_encrypt:

%ifndef WIN32
    sub     rsp, 3*8        ; gnu/linux binary interface
    mov     [rsp+0*8], rsi  ; output pointer
    mov     r8, rdx         ; key
%else
    sub     rsp, 5*8        ; windows binary interface
    mov     [rsp+3*8], rsi
    mov     [rsp+4*8], rdi
    mov     [rsp+0*8], rdx  ; output pointer
    mov     rdi, rcx        ; input pointer
%endif
    mov     [rsp+1*8], rbx
    mov     [rsp+2*8], rbp

    movzx   esi, byte [kptr+4*KS_LENGTH]
    lea     tptr,[enc_tab wrt rip]
    sub     kptr, fofs

    movntdqa xmm8, [rdi+0*4]
    pxor     xmm8, [kptr+fofs]

    lea     kptr,[kptr+rsi]
    cmp     esi, 10*16
    je      .3
    cmp     esi, 12*16
    je      .2
    cmp     esi, 14*16
    je      .1
    mov     rax, -1
    jmp     .4

.1: sse41_rnd  xmm8, 13
    sse41_rnd  xmm8, 12
.2: sse41_rnd  xmm8, 11
    sse41_rnd  xmm8, 10
.3: sse41_rnd  xmm8, 9
    sse41_rnd  xmm8, 8
    sse41_rnd  xmm8, 7
    sse41_rnd  xmm8, 6
    sse41_rnd  xmm8, 5
    sse41_rnd  xmm8, 4
    sse41_rnd  xmm8, 3
    sse41_rnd  xmm8, 2
    sse41_rnd  xmm8, 1
    add        tptr, 2048
    sse41_rnd  xmm8, 0

    mov     rbx, [rsp]
    movdqa  [rbx], xmm8

    xor     rax, rax
.4:
    mov     rbx, [rsp+1*8]
    mov     rbp, [rsp+2*8]
%ifndef WIN32
    add     rsp, 3*8
%else
    mov     rsi, [rsp+3*8]
    mov     rdi, [rsp+4*8]
    add     rsp, 5*8
%endif
    ret
