godot/thirdparty/libvpx/vp8/common/x86/subpixel_mmx.asm

;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"
extern sym(vp8_bilinear_filters_x86_8)


%define BLOCK_HEIGHT_WIDTH 4
%define vp8_filter_weight 128
%define VP8_FILTER_SHIFT  7


;void vp8_filter_block1d_h6_mmx
;(
;    unsigned char   *src_ptr,
;    unsigned short  *output_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned int    pixel_step,
;    unsigned int    output_height,
;    unsigned int    output_width,
;    short           * vp8_filter
;)
global sym(vp8_filter_block1d_h6_mmx) PRIVATE
sym(vp8_filter_block1d_h6_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        mov         rdx,    arg(6) ;vp8_filter

        movq        mm1,    [rdx + 16]             ; do both the negative taps first!!!
        movq        mm2,    [rdx + 32]         ;
        movq        mm6,    [rdx + 48]        ;
        movq        mm7,    [rdx + 64]        ;

        mov         rdi,    arg(1) ;output_ptr
        mov         rsi,    arg(0) ;src_ptr
        movsxd      rcx,    dword ptr arg(4) ;output_height
        movsxd      rax,    dword ptr arg(5) ;output_width      ; destination pitch?
        pxor        mm0,    mm0              ; mm0 = 00000000

.nextrow:
        movq        mm3,    [rsi-2]          ; mm3 = p-2..p5
        movq        mm4,    mm3              ; mm4 = p-2..p5
        psrlq       mm3,    8                ; mm3 = p-1..p5
        punpcklbw   mm3,    mm0              ; mm3 = p-1..p2
        pmullw      mm3,    mm1              ; mm3 *= kernel 1 modifiers.

        movq        mm5,    mm4              ; mm5 = p-2..p5
        punpckhbw   mm4,    mm0              ; mm5 = p2..p5
        pmullw      mm4,    mm7              ; mm5 *= kernel 4 modifiers
        paddsw      mm3,    mm4              ; mm3 += mm5

        movq        mm4,    mm5              ; mm4 = p-2..p5;
        psrlq       mm5,    16               ; mm5 = p0..p5;
        punpcklbw   mm5,    mm0              ; mm5 = p0..p3
        pmullw      mm5,    mm2              ; mm5 *= kernel 2 modifiers
        paddsw      mm3,    mm5              ; mm3 += mm5

        movq        mm5,    mm4              ; mm5 = p-2..p5
        psrlq       mm4,    24               ; mm4 = p1..p5
        punpcklbw   mm4,    mm0              ; mm4 = p1..p4
        pmullw      mm4,    mm6              ; mm5 *= kernel 3 modifiers
        paddsw      mm3,    mm4              ; mm3 += mm5

        ; do outer positive taps
        movd        mm4,    [rsi+3]
        punpcklbw   mm4,    mm0              ; mm5 = p3..p6
        pmullw      mm4,    [rdx+80]         ; mm5 *= kernel 0 modifiers
        paddsw      mm3,    mm4              ; mm3 += mm5

        punpcklbw   mm5,    mm0              ; mm5 = p-2..p1
        pmullw      mm5,    [rdx]            ; mm5 *= kernel 5 modifiers
        paddsw      mm3,    mm5              ; mm3 += mm5

        paddsw      mm3,    [GLOBAL(rd)]              ; mm3 += round value
        psraw       mm3,    VP8_FILTER_SHIFT     ; mm3 /= 128
        packuswb    mm3,    mm0              ; pack and unpack to saturate
        punpcklbw   mm3,    mm0              ;

        movq        [rdi],  mm3              ; store the results in the destination

%if ABI_IS_32BIT
        add         rsi,    dword ptr arg(2) ;src_pixels_per_line ; next line
        add         rdi,    rax;
%else
        movsxd      r8,     dword ptr arg(2) ;src_pixels_per_line
        add         rdi,    rax;

        add         rsi,    r8               ; next line
%endif

        dec         rcx                      ; decrement count
        jnz         .nextrow                 ; next row

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_filter_block1dc_v6_mmx
;(
;   short *src_ptr,
;   unsigned char *output_ptr,
;    int output_pitch,
;   unsigned int pixels_per_line,
;   unsigned int pixel_step,
;   unsigned int output_height,
;   unsigned int output_width,
;   short * vp8_filter
;)
global sym(vp8_filter_block1dc_v6_mmx) PRIVATE
sym(vp8_filter_block1dc_v6_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        movq      mm5, [GLOBAL(rd)]
        push        rbx
        mov         rbx, arg(7) ;vp8_filter
        movq      mm1, [rbx + 16]             ; do both the negative taps first!!!
        movq      mm2, [rbx + 32]         ;
        movq      mm6, [rbx + 48]        ;
        movq      mm7, [rbx + 64]        ;

        movsxd      rdx, dword ptr arg(3) ;pixels_per_line
        mov         rdi, arg(1) ;output_ptr
        mov         rsi, arg(0) ;src_ptr
        sub         rsi, rdx
        sub         rsi, rdx
        movsxd      rcx, DWORD PTR arg(5) ;output_height
        movsxd      rax, DWORD PTR arg(2) ;output_pitch      ; destination pitch?
        pxor        mm0, mm0              ; mm0 = 00000000


.nextrow_cv:
        movq        mm3, [rsi+rdx]        ; mm3 = p0..p8  = row -1
        pmullw      mm3, mm1              ; mm3 *= kernel 1 modifiers.


        movq        mm4, [rsi + 4*rdx]      ; mm4 = p0..p3  = row 2
        pmullw      mm4, mm7              ; mm4 *= kernel 4 modifiers.
        paddsw      mm3, mm4              ; mm3 += mm4

        movq        mm4, [rsi + 2*rdx]           ; mm4 = p0..p3  = row 0
        pmullw      mm4, mm2              ; mm4 *= kernel 2 modifiers.
        paddsw      mm3, mm4              ; mm3 += mm4

        movq        mm4, [rsi]            ; mm4 = p0..p3  = row -2
        pmullw      mm4, [rbx]            ; mm4 *= kernel 0 modifiers.
        paddsw      mm3, mm4              ; mm3 += mm4


        add         rsi, rdx              ; move source forward 1 line to avoid 3 * pitch
        movq        mm4, [rsi + 2*rdx]     ; mm4 = p0..p3  = row 1
        pmullw      mm4, mm6              ; mm4 *= kernel 3 modifiers.
        paddsw      mm3, mm4              ; mm3 += mm4

        movq        mm4, [rsi + 4*rdx]    ; mm4 = p0..p3  = row 3
        pmullw      mm4, [rbx +80]        ; mm4 *= kernel 3 modifiers.
        paddsw      mm3, mm4              ; mm3 += mm4


        paddsw      mm3, mm5               ; mm3 += round value
        psraw       mm3, VP8_FILTER_SHIFT     ; mm3 /= 128
        packuswb    mm3, mm0              ; pack and saturate

        movd        [rdi],mm3             ; store the results in the destination
        ; the subsequent iterations repeat 3 out of 4 of these reads.  Since the
        ; recon block should be in cache this shouldn't cost much.  Its obviously
        ; avoidable!!!.
        lea         rdi,  [rdi+rax] ;
        dec         rcx                   ; decrement count
        jnz         .nextrow_cv           ; next row

        pop         rbx

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;void bilinear_predict8x8_mmx
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int  xoffset,
;    int  yoffset,
;   unsigned char *dst_ptr,
;    int dst_pitch
;)
global sym(vp8_bilinear_predict8x8_mmx) PRIVATE
sym(vp8_bilinear_predict8x8_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];

        movsxd      rax,        dword ptr arg(2) ;xoffset
        mov         rdi,        arg(4) ;dst_ptr           ;

        shl         rax,        5 ; offset * 32
        lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]

        add         rax,        rcx ; HFilter
        mov         rsi,        arg(0) ;src_ptr              ;

        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
        movq        mm1,        [rax]               ;

        movq        mm2,        [rax+16]            ;
        movsxd      rax,        dword ptr arg(3) ;yoffset

        pxor        mm0,        mm0                 ;

        shl         rax,        5 ; offset*32
        add         rax,        rcx ; VFilter

        lea         rcx,        [rdi+rdx*8]          ;
        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line    ;


        ; get the first horizontal line done       ;
        movq        mm3,        [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
        movq        mm4,        mm3                 ; make a copy of current line

        punpcklbw   mm3,        mm0                 ; xx 00 01 02 03 04 05 06
        punpckhbw   mm4,        mm0                 ;

        pmullw      mm3,        mm1                 ;
        pmullw      mm4,        mm1                 ;

        movq        mm5,        [rsi+1]             ;
        movq        mm6,        mm5                 ;

        punpcklbw   mm5,        mm0                 ;
        punpckhbw   mm6,        mm0                 ;

        pmullw      mm5,        mm2                 ;
        pmullw      mm6,        mm2                 ;

        paddw       mm3,        mm5                 ;
        paddw       mm4,        mm6                 ;

        paddw       mm3,        [GLOBAL(rd)]                 ; xmm3 += round value
        psraw       mm3,        VP8_FILTER_SHIFT        ; xmm3 /= 128

        paddw       mm4,        [GLOBAL(rd)]                 ;
        psraw       mm4,        VP8_FILTER_SHIFT        ;

        movq        mm7,        mm3                 ;
        packuswb    mm7,        mm4                 ;

        add         rsi,        rdx                 ; next line
.next_row_8x8:
        movq        mm3,        [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
        movq        mm4,        mm3                 ; make a copy of current line

        punpcklbw   mm3,        mm0                 ; xx 00 01 02 03 04 05 06
        punpckhbw   mm4,        mm0                 ;

        pmullw      mm3,        mm1                 ;
        pmullw      mm4,        mm1                 ;

        movq        mm5,        [rsi+1]             ;
        movq        mm6,        mm5                 ;

        punpcklbw   mm5,        mm0                 ;
        punpckhbw   mm6,        mm0                 ;

        pmullw      mm5,        mm2                 ;
        pmullw      mm6,        mm2                 ;

        paddw       mm3,        mm5                 ;
        paddw       mm4,        mm6                 ;

        movq        mm5,        mm7                 ;
        movq        mm6,        mm7                 ;

        punpcklbw   mm5,        mm0                 ;
        punpckhbw   mm6,        mm0

        pmullw      mm5,        [rax]               ;
        pmullw      mm6,        [rax]               ;

        paddw       mm3,        [GLOBAL(rd)]                 ; xmm3 += round value
        psraw       mm3,        VP8_FILTER_SHIFT        ; xmm3 /= 128

        paddw       mm4,        [GLOBAL(rd)]                 ;
        psraw       mm4,        VP8_FILTER_SHIFT        ;

        movq        mm7,        mm3                 ;
        packuswb    mm7,        mm4                 ;


        pmullw      mm3,        [rax+16]            ;
        pmullw      mm4,        [rax+16]            ;

        paddw       mm3,        mm5                 ;
        paddw       mm4,        mm6                 ;


        paddw       mm3,        [GLOBAL(rd)]                 ; xmm3 += round value
        psraw       mm3,        VP8_FILTER_SHIFT        ; xmm3 /= 128

        paddw       mm4,        [GLOBAL(rd)]                 ;
        psraw       mm4,        VP8_FILTER_SHIFT        ;

        packuswb    mm3,        mm4

        movq        [rdi],      mm3                 ; store the results in the destination

%if ABI_IS_32BIT
        add         rsi,        rdx                 ; next line
        add         rdi,        dword ptr arg(5) ;dst_pitch                   ;
%else
        movsxd      r8,         dword ptr arg(5) ;dst_pitch
        add         rsi,        rdx                 ; next line
        add         rdi,        r8                  ;dst_pitch
%endif
        cmp         rdi,        rcx                 ;
        jne         .next_row_8x8

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;void bilinear_predict8x4_mmx
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int  xoffset,
;    int  yoffset,
;    unsigned char *dst_ptr,
;    int dst_pitch
;)
global sym(vp8_bilinear_predict8x4_mmx) PRIVATE
sym(vp8_bilinear_predict8x4_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];

        movsxd      rax,        dword ptr arg(2) ;xoffset
        mov         rdi,        arg(4) ;dst_ptr           ;

        lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
        shl         rax,        5

        mov         rsi,        arg(0) ;src_ptr              ;
        add         rax,        rcx

        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
        movq        mm1,        [rax]               ;

        movq        mm2,        [rax+16]            ;
        movsxd      rax,        dword ptr arg(3) ;yoffset

        pxor        mm0,        mm0                 ;
        shl         rax,        5

        add         rax,        rcx
        lea         rcx,        [rdi+rdx*4]          ;

        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line    ;

        ; get the first horizontal line done       ;
        movq        mm3,        [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
        movq        mm4,        mm3                 ; make a copy of current line

        punpcklbw   mm3,        mm0                 ; xx 00 01 02 03 04 05 06
        punpckhbw   mm4,        mm0                 ;

        pmullw      mm3,        mm1                 ;
        pmullw      mm4,        mm1                 ;

        movq        mm5,        [rsi+1]             ;
        movq        mm6,        mm5                 ;

        punpcklbw   mm5,        mm0                 ;
        punpckhbw   mm6,        mm0                 ;

        pmullw      mm5,        mm2                 ;
        pmullw      mm6,        mm2                 ;

        paddw       mm3,        mm5                 ;
        paddw       mm4,        mm6                 ;

        paddw       mm3,        [GLOBAL(rd)]                 ; xmm3 += round value
        psraw       mm3,        VP8_FILTER_SHIFT        ; xmm3 /= 128

        paddw       mm4,        [GLOBAL(rd)]                 ;
        psraw       mm4,        VP8_FILTER_SHIFT        ;

        movq        mm7,        mm3                 ;
        packuswb    mm7,        mm4                 ;

        add         rsi,        rdx                 ; next line
.next_row_8x4:
        movq        mm3,        [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
        movq        mm4,        mm3                 ; make a copy of current line

        punpcklbw   mm3,        mm0                 ; xx 00 01 02 03 04 05 06
        punpckhbw   mm4,        mm0                 ;

        pmullw      mm3,        mm1                 ;
        pmullw      mm4,        mm1                 ;

        movq        mm5,        [rsi+1]             ;
        movq        mm6,        mm5                 ;

        punpcklbw   mm5,        mm0                 ;
        punpckhbw   mm6,        mm0                 ;

        pmullw      mm5,        mm2                 ;
        pmullw      mm6,        mm2                 ;

        paddw       mm3,        mm5                 ;
        paddw       mm4,        mm6                 ;

        movq        mm5,        mm7                 ;
        movq        mm6,        mm7                 ;

        punpcklbw   mm5,        mm0                 ;
        punpckhbw   mm6,        mm0

        pmullw      mm5,        [rax]               ;
        pmullw      mm6,        [rax]               ;

        paddw       mm3,        [GLOBAL(rd)]                 ; xmm3 += round value
        psraw       mm3,        VP8_FILTER_SHIFT        ; xmm3 /= 128

        paddw       mm4,        [GLOBAL(rd)]                 ;
        psraw       mm4,        VP8_FILTER_SHIFT        ;

        movq        mm7,        mm3                 ;
        packuswb    mm7,        mm4                 ;


        pmullw      mm3,        [rax+16]            ;
        pmullw      mm4,        [rax+16]            ;

        paddw       mm3,        mm5                 ;
        paddw       mm4,        mm6                 ;


        paddw       mm3,        [GLOBAL(rd)]                 ; xmm3 += round value
        psraw       mm3,        VP8_FILTER_SHIFT        ; xmm3 /= 128

        paddw       mm4,        [GLOBAL(rd)]                 ;
        psraw       mm4,        VP8_FILTER_SHIFT        ;

        packuswb    mm3,        mm4

        movq        [rdi],      mm3                 ; store the results in the destination

%if ABI_IS_32BIT
        add         rsi,        rdx                 ; next line
        add         rdi,        dword ptr arg(5) ;dst_pitch                   ;
%else
        movsxd      r8,         dword ptr arg(5) ;dst_pitch
        add         rsi,        rdx                 ; next line
        add         rdi,        r8
%endif
        cmp         rdi,        rcx                 ;
        jne         .next_row_8x4

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;void bilinear_predict4x4_mmx
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int  xoffset,
;    int  yoffset,
;    unsigned char *dst_ptr,
;    int dst_pitch
;)
global sym(vp8_bilinear_predict4x4_mmx) PRIVATE
sym(vp8_bilinear_predict4x4_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];

        movsxd      rax,        dword ptr arg(2) ;xoffset
        mov         rdi,        arg(4) ;dst_ptr           ;

        lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
        shl         rax,        5

        add         rax,        rcx ; HFilter
        mov         rsi,        arg(0) ;src_ptr              ;

        movsxd      rdx,        dword ptr arg(5) ;ldst_pitch
        movq        mm1,        [rax]               ;

        movq        mm2,        [rax+16]            ;
        movsxd      rax,        dword ptr arg(3) ;yoffset

        pxor        mm0,        mm0                 ;
        shl         rax,        5

        add         rax,        rcx
        lea         rcx,        [rdi+rdx*4]          ;

        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line    ;

        ; get the first horizontal line done       ;
        movd        mm3,        [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
        punpcklbw   mm3,        mm0                 ; xx 00 01 02 03 04 05 06

        pmullw      mm3,        mm1                 ;
        movd        mm5,        [rsi+1]             ;

        punpcklbw   mm5,        mm0                 ;
        pmullw      mm5,        mm2                 ;

        paddw       mm3,        mm5                 ;
        paddw       mm3,        [GLOBAL(rd)]                 ; xmm3 += round value

        psraw       mm3,        VP8_FILTER_SHIFT        ; xmm3 /= 128

        movq        mm7,        mm3                 ;
        packuswb    mm7,        mm0                 ;

        add         rsi,        rdx                 ; next line
.next_row_4x4:
        movd        mm3,        [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
        punpcklbw   mm3,        mm0                 ; xx 00 01 02 03 04 05 06

        pmullw      mm3,        mm1                 ;
        movd        mm5,        [rsi+1]             ;

        punpcklbw   mm5,        mm0                 ;
        pmullw      mm5,        mm2                 ;

        paddw       mm3,        mm5                 ;

        movq        mm5,        mm7                 ;
        punpcklbw   mm5,        mm0                 ;

        pmullw      mm5,        [rax]               ;
        paddw       mm3,        [GLOBAL(rd)]                 ; xmm3 += round value

        psraw       mm3,        VP8_FILTER_SHIFT        ; xmm3 /= 128
        movq        mm7,        mm3                 ;

        packuswb    mm7,        mm0                 ;

        pmullw      mm3,        [rax+16]            ;
        paddw       mm3,        mm5                 ;


        paddw       mm3,        [GLOBAL(rd)]                 ; xmm3 += round value
        psraw       mm3,        VP8_FILTER_SHIFT        ; xmm3 /= 128

        packuswb    mm3,        mm0
        movd        [rdi],      mm3                 ; store the results in the destination

%if ABI_IS_32BIT
        add         rsi,        rdx                 ; next line
        add         rdi,        dword ptr arg(5) ;dst_pitch                   ;
%else
        movsxd      r8,         dword ptr arg(5) ;dst_pitch                   ;
        add         rsi,        rdx                 ; next line
        add         rdi,        r8
%endif

        cmp         rdi,        rcx                 ;
        jne         .next_row_4x4

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


SECTION_RODATA
align 16
rd:
    times 4 dw 0x40

align 16
global HIDDEN_DATA(sym(vp8_six_tap_mmx))
sym(vp8_six_tap_mmx):
    times 8 dw 0
    times 8 dw 0
    times 8 dw 128
    times 8 dw 0
    times 8 dw 0
    times 8 dw 0

    times 8 dw 0
    times 8 dw -6
    times 8 dw 123
    times 8 dw 12
    times 8 dw -1
    times 8 dw 0

    times 8 dw 2
    times 8 dw -11
    times 8 dw 108
    times 8 dw 36
    times 8 dw -8
    times 8 dw 1

    times 8 dw 0
    times 8 dw -9
    times 8 dw 93
    times 8 dw 50
    times 8 dw -6
    times 8 dw 0

    times 8 dw 3
    times 8 dw -16
    times 8 dw 77
    times 8 dw 77
    times 8 dw -16
    times 8 dw 3

    times 8 dw 0
    times 8 dw -6
    times 8 dw 50
    times 8 dw 93
    times 8 dw -9
    times 8 dw 0

    times 8 dw 1
    times 8 dw -8
    times 8 dw 36
    times 8 dw 108
    times 8 dw -11
    times 8 dw 2

    times 8 dw 0
    times 8 dw -1
    times 8 dw 12
    times 8 dw 123
    times 8 dw -6
    times 8 dw 0
Add libvpx thirdparty library Only necessary files 2016-09-14 20:10:03 +00:00			`;`
			`; Copyright (c) 2010 The WebM project authors. All Rights Reserved.`
			`;`
			`; Use of this source code is governed by a BSD-style license`
			`; that can be found in the LICENSE file in the root of the source`
			`; tree. An additional intellectual property rights grant can be found`
			`; in the file PATENTS. All contributing project authors may`
			`; be found in the AUTHORS file in the root of the source tree.`
			`;`


			`%include "vpx_ports/x86_abi_support.asm"`
			`extern sym(vp8_bilinear_filters_x86_8)`


			`%define BLOCK_HEIGHT_WIDTH 4`
			`%define vp8_filter_weight 128`
			`%define VP8_FILTER_SHIFT 7`


			`;void vp8_filter_block1d_h6_mmx`
			`;(`
			`; unsigned char *src_ptr,`
			`; unsigned short *output_ptr,`
			`; unsigned int src_pixels_per_line,`
			`; unsigned int pixel_step,`
			`; unsigned int output_height,`
			`; unsigned int output_width,`
			`; short * vp8_filter`
			`;)`
			`global sym(vp8_filter_block1d_h6_mmx) PRIVATE`
			`sym(vp8_filter_block1d_h6_mmx):`
			`push rbp`
			`mov rbp, rsp`
			`SHADOW_ARGS_TO_STACK 7`
			`GET_GOT rbx`
			`push rsi`
			`push rdi`
			`; end prolog`

			`mov rdx, arg(6) ;vp8_filter`

			`movq mm1, [rdx + 16] ; do both the negative taps first!!!`
			`movq mm2, [rdx + 32] ;`
			`movq mm6, [rdx + 48] ;`
			`movq mm7, [rdx + 64] ;`

			`mov rdi, arg(1) ;output_ptr`
			`mov rsi, arg(0) ;src_ptr`
			`movsxd rcx, dword ptr arg(4) ;output_height`
			`movsxd rax, dword ptr arg(5) ;output_width ; destination pitch?`
			`pxor mm0, mm0 ; mm0 = 00000000`

			`.nextrow:`
			`movq mm3, [rsi-2] ; mm3 = p-2..p5`
			`movq mm4, mm3 ; mm4 = p-2..p5`
			`psrlq mm3, 8 ; mm3 = p-1..p5`
			`punpcklbw mm3, mm0 ; mm3 = p-1..p2`
			`pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers.`

			`movq mm5, mm4 ; mm5 = p-2..p5`
			`punpckhbw mm4, mm0 ; mm5 = p2..p5`
			`pmullw mm4, mm7 ; mm5 *= kernel 4 modifiers`
			`paddsw mm3, mm4 ; mm3 += mm5`

			`movq mm4, mm5 ; mm4 = p-2..p5;`
			`psrlq mm5, 16 ; mm5 = p0..p5;`
			`punpcklbw mm5, mm0 ; mm5 = p0..p3`
			`pmullw mm5, mm2 ; mm5 *= kernel 2 modifiers`
			`paddsw mm3, mm5 ; mm3 += mm5`

			`movq mm5, mm4 ; mm5 = p-2..p5`
			`psrlq mm4, 24 ; mm4 = p1..p5`
			`punpcklbw mm4, mm0 ; mm4 = p1..p4`
			`pmullw mm4, mm6 ; mm5 *= kernel 3 modifiers`
			`paddsw mm3, mm4 ; mm3 += mm5`

			`; do outer positive taps`
			`movd mm4, [rsi+3]`
			`punpcklbw mm4, mm0 ; mm5 = p3..p6`
			`pmullw mm4, [rdx+80] ; mm5 *= kernel 0 modifiers`
			`paddsw mm3, mm4 ; mm3 += mm5`

			`punpcklbw mm5, mm0 ; mm5 = p-2..p1`
			`pmullw mm5, [rdx] ; mm5 *= kernel 5 modifiers`
			`paddsw mm3, mm5 ; mm3 += mm5`

			`paddsw mm3, [GLOBAL(rd)] ; mm3 += round value`
			`psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128`
			`packuswb mm3, mm0 ; pack and unpack to saturate`
			`punpcklbw mm3, mm0 ;`

			`movq [rdi], mm3 ; store the results in the destination`

			`%if ABI_IS_32BIT`
			`add rsi, dword ptr arg(2) ;src_pixels_per_line ; next line`
			`add rdi, rax;`
			`%else`
			`movsxd r8, dword ptr arg(2) ;src_pixels_per_line`
			`add rdi, rax;`

			`add rsi, r8 ; next line`
			`%endif`

			`dec rcx ; decrement count`
			`jnz .nextrow ; next row`

			`; begin epilog`
			`pop rdi`
			`pop rsi`
			`RESTORE_GOT`
			`UNSHADOW_ARGS`
			`pop rbp`
			`ret`


			`;void vp8_filter_block1dc_v6_mmx`
			`;(`
			`; short *src_ptr,`
			`; unsigned char *output_ptr,`
			`; int output_pitch,`
			`; unsigned int pixels_per_line,`
			`; unsigned int pixel_step,`
			`; unsigned int output_height,`
			`; unsigned int output_width,`
			`; short * vp8_filter`
			`;)`
			`global sym(vp8_filter_block1dc_v6_mmx) PRIVATE`
			`sym(vp8_filter_block1dc_v6_mmx):`
			`push rbp`
			`mov rbp, rsp`
			`SHADOW_ARGS_TO_STACK 8`
			`GET_GOT rbx`
			`push rsi`
			`push rdi`
			`; end prolog`

			`movq mm5, [GLOBAL(rd)]`
			`push rbx`
			`mov rbx, arg(7) ;vp8_filter`
			`movq mm1, [rbx + 16] ; do both the negative taps first!!!`
			`movq mm2, [rbx + 32] ;`
			`movq mm6, [rbx + 48] ;`
			`movq mm7, [rbx + 64] ;`

			`movsxd rdx, dword ptr arg(3) ;pixels_per_line`
			`mov rdi, arg(1) ;output_ptr`
			`mov rsi, arg(0) ;src_ptr`
			`sub rsi, rdx`
			`sub rsi, rdx`
			`movsxd rcx, DWORD PTR arg(5) ;output_height`
			`movsxd rax, DWORD PTR arg(2) ;output_pitch ; destination pitch?`
			`pxor mm0, mm0 ; mm0 = 00000000`


			`.nextrow_cv:`
			`movq mm3, [rsi+rdx] ; mm3 = p0..p8 = row -1`
			`pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers.`


			`movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 2`
			`pmullw mm4, mm7 ; mm4 *= kernel 4 modifiers.`
			`paddsw mm3, mm4 ; mm3 += mm4`

			`movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 0`
			`pmullw mm4, mm2 ; mm4 *= kernel 2 modifiers.`
			`paddsw mm3, mm4 ; mm3 += mm4`

			`movq mm4, [rsi] ; mm4 = p0..p3 = row -2`
			`pmullw mm4, [rbx] ; mm4 *= kernel 0 modifiers.`
			`paddsw mm3, mm4 ; mm3 += mm4`


			`add rsi, rdx ; move source forward 1 line to avoid 3 * pitch`
			`movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 1`
			`pmullw mm4, mm6 ; mm4 *= kernel 3 modifiers.`
			`paddsw mm3, mm4 ; mm3 += mm4`

			`movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 3`
			`pmullw mm4, [rbx +80] ; mm4 *= kernel 3 modifiers.`
			`paddsw mm3, mm4 ; mm3 += mm4`


			`paddsw mm3, mm5 ; mm3 += round value`
			`psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128`
			`packuswb mm3, mm0 ; pack and saturate`

			`movd [rdi],mm3 ; store the results in the destination`
			`; the subsequent iterations repeat 3 out of 4 of these reads. Since the`
			`; recon block should be in cache this shouldn't cost much. Its obviously`
			`; avoidable!!!.`
			`lea rdi, [rdi+rax] ;`
			`dec rcx ; decrement count`
			`jnz .nextrow_cv ; next row`

			`pop rbx`

			`; begin epilog`
			`pop rdi`
			`pop rsi`
			`RESTORE_GOT`
			`UNSHADOW_ARGS`
			`pop rbp`
			`ret`


			`;void bilinear_predict8x8_mmx`
			`;(`
			`; unsigned char *src_ptr,`
			`; int src_pixels_per_line,`
			`; int xoffset,`
			`; int yoffset,`
			`; unsigned char *dst_ptr,`
			`; int dst_pitch`
			`;)`
			`global sym(vp8_bilinear_predict8x8_mmx) PRIVATE`
			`sym(vp8_bilinear_predict8x8_mmx):`
			`push rbp`
			`mov rbp, rsp`
			`SHADOW_ARGS_TO_STACK 6`
			`GET_GOT rbx`
			`push rsi`
			`push rdi`
			`; end prolog`

			`;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];`
			`;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];`

			`movsxd rax, dword ptr arg(2) ;xoffset`
			`mov rdi, arg(4) ;dst_ptr ;`

			`shl rax, 5 ; offset * 32`
			`lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]`

			`add rax, rcx ; HFilter`
			`mov rsi, arg(0) ;src_ptr ;`

			`movsxd rdx, dword ptr arg(5) ;dst_pitch`
			`movq mm1, [rax] ;`

			`movq mm2, [rax+16] ;`
			`movsxd rax, dword ptr arg(3) ;yoffset`

			`pxor mm0, mm0 ;`

			`shl rax, 5 ; offset*32`
			`add rax, rcx ; VFilter`

			`lea rcx, [rdi+rdx*8] ;`
			`movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ;`



			`; get the first horizontal line done ;`
			`movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14`
			`movq mm4, mm3 ; make a copy of current line`

			`punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06`
			`punpckhbw mm4, mm0 ;`

			`pmullw mm3, mm1 ;`
			`pmullw mm4, mm1 ;`

			`movq mm5, [rsi+1] ;`
			`movq mm6, mm5 ;`

			`punpcklbw mm5, mm0 ;`
			`punpckhbw mm6, mm0 ;`

			`pmullw mm5, mm2 ;`
			`pmullw mm6, mm2 ;`

			`paddw mm3, mm5 ;`
			`paddw mm4, mm6 ;`

			`paddw mm3, [GLOBAL(rd)] ; xmm3 += round value`
			`psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128`

			`paddw mm4, [GLOBAL(rd)] ;`
			`psraw mm4, VP8_FILTER_SHIFT ;`

			`movq mm7, mm3 ;`
			`packuswb mm7, mm4 ;`

			`add rsi, rdx ; next line`
			`.next_row_8x8:`
			`movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14`
			`movq mm4, mm3 ; make a copy of current line`

			`punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06`
			`punpckhbw mm4, mm0 ;`

			`pmullw mm3, mm1 ;`
			`pmullw mm4, mm1 ;`

			`movq mm5, [rsi+1] ;`
			`movq mm6, mm5 ;`

			`punpcklbw mm5, mm0 ;`
			`punpckhbw mm6, mm0 ;`

			`pmullw mm5, mm2 ;`
			`pmullw mm6, mm2 ;`

			`paddw mm3, mm5 ;`
			`paddw mm4, mm6 ;`

			`movq mm5, mm7 ;`
			`movq mm6, mm7 ;`

			`punpcklbw mm5, mm0 ;`
			`punpckhbw mm6, mm0`

			`pmullw mm5, [rax] ;`
			`pmullw mm6, [rax] ;`

			`paddw mm3, [GLOBAL(rd)] ; xmm3 += round value`
			`psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128`

			`paddw mm4, [GLOBAL(rd)] ;`
			`psraw mm4, VP8_FILTER_SHIFT ;`

			`movq mm7, mm3 ;`
			`packuswb mm7, mm4 ;`


			`pmullw mm3, [rax+16] ;`
			`pmullw mm4, [rax+16] ;`

			`paddw mm3, mm5 ;`
			`paddw mm4, mm6 ;`


			`paddw mm3, [GLOBAL(rd)] ; xmm3 += round value`
			`psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128`

			`paddw mm4, [GLOBAL(rd)] ;`
			`psraw mm4, VP8_FILTER_SHIFT ;`

			`packuswb mm3, mm4`

			`movq [rdi], mm3 ; store the results in the destination`

			`%if ABI_IS_32BIT`
			`add rsi, rdx ; next line`
			`add rdi, dword ptr arg(5) ;dst_pitch ;`
			`%else`
			`movsxd r8, dword ptr arg(5) ;dst_pitch`
			`add rsi, rdx ; next line`
			`add rdi, r8 ;dst_pitch`
			`%endif`
			`cmp rdi, rcx ;`
			`jne .next_row_8x8`

			`; begin epilog`
			`pop rdi`
			`pop rsi`
			`RESTORE_GOT`
			`UNSHADOW_ARGS`
			`pop rbp`
			`ret`


			`;void bilinear_predict8x4_mmx`
			`;(`
			`; unsigned char *src_ptr,`
			`; int src_pixels_per_line,`
			`; int xoffset,`
			`; int yoffset,`
			`; unsigned char *dst_ptr,`
			`; int dst_pitch`
			`;)`
			`global sym(vp8_bilinear_predict8x4_mmx) PRIVATE`
			`sym(vp8_bilinear_predict8x4_mmx):`
			`push rbp`
			`mov rbp, rsp`
			`SHADOW_ARGS_TO_STACK 6`
			`GET_GOT rbx`
			`push rsi`
			`push rdi`
			`; end prolog`

			`;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];`
			`;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];`

			`movsxd rax, dword ptr arg(2) ;xoffset`
			`mov rdi, arg(4) ;dst_ptr ;`

			`lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]`
			`shl rax, 5`

			`mov rsi, arg(0) ;src_ptr ;`
			`add rax, rcx`

			`movsxd rdx, dword ptr arg(5) ;dst_pitch`
			`movq mm1, [rax] ;`

			`movq mm2, [rax+16] ;`
			`movsxd rax, dword ptr arg(3) ;yoffset`

			`pxor mm0, mm0 ;`
			`shl rax, 5`

			`add rax, rcx`
			`lea rcx, [rdi+rdx*4] ;`

			`movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ;`

			`; get the first horizontal line done ;`
			`movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14`
			`movq mm4, mm3 ; make a copy of current line`

			`punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06`
			`punpckhbw mm4, mm0 ;`

			`pmullw mm3, mm1 ;`
			`pmullw mm4, mm1 ;`

			`movq mm5, [rsi+1] ;`
			`movq mm6, mm5 ;`

			`punpcklbw mm5, mm0 ;`
			`punpckhbw mm6, mm0 ;`

			`pmullw mm5, mm2 ;`
			`pmullw mm6, mm2 ;`

			`paddw mm3, mm5 ;`
			`paddw mm4, mm6 ;`

			`paddw mm3, [GLOBAL(rd)] ; xmm3 += round value`
			`psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128`

			`paddw mm4, [GLOBAL(rd)] ;`
			`psraw mm4, VP8_FILTER_SHIFT ;`

			`movq mm7, mm3 ;`
			`packuswb mm7, mm4 ;`

			`add rsi, rdx ; next line`
			`.next_row_8x4:`
			`movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14`
			`movq mm4, mm3 ; make a copy of current line`

			`punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06`
			`punpckhbw mm4, mm0 ;`

			`pmullw mm3, mm1 ;`
			`pmullw mm4, mm1 ;`

			`movq mm5, [rsi+1] ;`
			`movq mm6, mm5 ;`

			`punpcklbw mm5, mm0 ;`
			`punpckhbw mm6, mm0 ;`

			`pmullw mm5, mm2 ;`
			`pmullw mm6, mm2 ;`

			`paddw mm3, mm5 ;`
			`paddw mm4, mm6 ;`

			`movq mm5, mm7 ;`
			`movq mm6, mm7 ;`

			`punpcklbw mm5, mm0 ;`
			`punpckhbw mm6, mm0`

			`pmullw mm5, [rax] ;`
			`pmullw mm6, [rax] ;`

			`paddw mm3, [GLOBAL(rd)] ; xmm3 += round value`
			`psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128`

			`paddw mm4, [GLOBAL(rd)] ;`
			`psraw mm4, VP8_FILTER_SHIFT ;`

			`movq mm7, mm3 ;`
			`packuswb mm7, mm4 ;`


			`pmullw mm3, [rax+16] ;`
			`pmullw mm4, [rax+16] ;`

			`paddw mm3, mm5 ;`
			`paddw mm4, mm6 ;`


			`paddw mm3, [GLOBAL(rd)] ; xmm3 += round value`
			`psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128`

			`paddw mm4, [GLOBAL(rd)] ;`
			`psraw mm4, VP8_FILTER_SHIFT ;`

			`packuswb mm3, mm4`

			`movq [rdi], mm3 ; store the results in the destination`

			`%if ABI_IS_32BIT`
			`add rsi, rdx ; next line`
			`add rdi, dword ptr arg(5) ;dst_pitch ;`
			`%else`
			`movsxd r8, dword ptr arg(5) ;dst_pitch`
			`add rsi, rdx ; next line`
			`add rdi, r8`
			`%endif`
			`cmp rdi, rcx ;`
			`jne .next_row_8x4`

			`; begin epilog`
			`pop rdi`
			`pop rsi`
			`RESTORE_GOT`
			`UNSHADOW_ARGS`
			`pop rbp`
			`ret`


			`;void bilinear_predict4x4_mmx`
			`;(`
			`; unsigned char *src_ptr,`
			`; int src_pixels_per_line,`
			`; int xoffset,`
			`; int yoffset,`
			`; unsigned char *dst_ptr,`
			`; int dst_pitch`
			`;)`
			`global sym(vp8_bilinear_predict4x4_mmx) PRIVATE`
			`sym(vp8_bilinear_predict4x4_mmx):`
			`push rbp`
			`mov rbp, rsp`
			`SHADOW_ARGS_TO_STACK 6`
			`GET_GOT rbx`
			`push rsi`
			`push rdi`
			`; end prolog`

			`;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];`
			`;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];`

			`movsxd rax, dword ptr arg(2) ;xoffset`
			`mov rdi, arg(4) ;dst_ptr ;`

			`lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]`
			`shl rax, 5`

			`add rax, rcx ; HFilter`
			`mov rsi, arg(0) ;src_ptr ;`

			`movsxd rdx, dword ptr arg(5) ;ldst_pitch`
			`movq mm1, [rax] ;`

			`movq mm2, [rax+16] ;`
			`movsxd rax, dword ptr arg(3) ;yoffset`

			`pxor mm0, mm0 ;`
			`shl rax, 5`

			`add rax, rcx`
			`lea rcx, [rdi+rdx*4] ;`

			`movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ;`

			`; get the first horizontal line done ;`
			`movd mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14`
			`punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06`

			`pmullw mm3, mm1 ;`
			`movd mm5, [rsi+1] ;`

			`punpcklbw mm5, mm0 ;`
			`pmullw mm5, mm2 ;`

			`paddw mm3, mm5 ;`
			`paddw mm3, [GLOBAL(rd)] ; xmm3 += round value`

			`psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128`

			`movq mm7, mm3 ;`
			`packuswb mm7, mm0 ;`

			`add rsi, rdx ; next line`
			`.next_row_4x4:`
			`movd mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14`
			`punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06`

			`pmullw mm3, mm1 ;`
			`movd mm5, [rsi+1] ;`

			`punpcklbw mm5, mm0 ;`
			`pmullw mm5, mm2 ;`

			`paddw mm3, mm5 ;`

			`movq mm5, mm7 ;`
			`punpcklbw mm5, mm0 ;`

			`pmullw mm5, [rax] ;`
			`paddw mm3, [GLOBAL(rd)] ; xmm3 += round value`

			`psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128`
			`movq mm7, mm3 ;`

			`packuswb mm7, mm0 ;`

			`pmullw mm3, [rax+16] ;`
			`paddw mm3, mm5 ;`


			`paddw mm3, [GLOBAL(rd)] ; xmm3 += round value`
			`psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128`

			`packuswb mm3, mm0`
			`movd [rdi], mm3 ; store the results in the destination`

			`%if ABI_IS_32BIT`
			`add rsi, rdx ; next line`
			`add rdi, dword ptr arg(5) ;dst_pitch ;`
			`%else`
			`movsxd r8, dword ptr arg(5) ;dst_pitch ;`
			`add rsi, rdx ; next line`
			`add rdi, r8`
			`%endif`

			`cmp rdi, rcx ;`
			`jne .next_row_4x4`

			`; begin epilog`
			`pop rdi`
			`pop rsi`
			`RESTORE_GOT`
			`UNSHADOW_ARGS`
			`pop rbp`
			`ret`



			`SECTION_RODATA`
			`align 16`
			`rd:`
			`times 4 dw 0x40`

			`align 16`
			`global HIDDEN_DATA(sym(vp8_six_tap_mmx))`
			`sym(vp8_six_tap_mmx):`
			`times 8 dw 0`
			`times 8 dw 0`
			`times 8 dw 128`
			`times 8 dw 0`
			`times 8 dw 0`
			`times 8 dw 0`

			`times 8 dw 0`
			`times 8 dw -6`
			`times 8 dw 123`
			`times 8 dw 12`
			`times 8 dw -1`
			`times 8 dw 0`

			`times 8 dw 2`
			`times 8 dw -11`
			`times 8 dw 108`
			`times 8 dw 36`
			`times 8 dw -8`
			`times 8 dw 1`

			`times 8 dw 0`
			`times 8 dw -9`
			`times 8 dw 93`
			`times 8 dw 50`
			`times 8 dw -6`
			`times 8 dw 0`

			`times 8 dw 3`
			`times 8 dw -16`
			`times 8 dw 77`
			`times 8 dw 77`
			`times 8 dw -16`
			`times 8 dw 3`

			`times 8 dw 0`
			`times 8 dw -6`
			`times 8 dw 50`
			`times 8 dw 93`
			`times 8 dw -9`
			`times 8 dw 0`

			`times 8 dw 1`
			`times 8 dw -8`
			`times 8 dw 36`
			`times 8 dw 108`
			`times 8 dw -11`
			`times 8 dw 2`

			`times 8 dw 0`
			`times 8 dw -1`
			`times 8 dw 12`
			`times 8 dw 123`
			`times 8 dw -6`
			`times 8 dw 0`