| /* |
| * Author: Anton Blanchard <anton@au.ibm.com> |
| * Copyright 2015 IBM Corporation. |
| * |
| * This program is free software; you can redistribute it and/or |
| * modify it under the terms of the GNU General Public License |
| * as published by the Free Software Foundation; either version |
| * 2 of the License, or (at your option) any later version. |
| */ |
| #include <asm/ppc_asm.h> |
| #include <asm/export.h> |
| #include <asm/ppc-opcode.h> |
| |
| #define off8 r6 |
| #define off16 r7 |
| #define off24 r8 |
| |
| #define rA r9 |
| #define rB r10 |
| #define rC r11 |
| #define rD r27 |
| #define rE r28 |
| #define rF r29 |
| #define rG r30 |
| #define rH r31 |
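/*
 * rA-rC are in volatile registers; rD-rH map to the non-volatile
 * r27-r31, which the 32-byte .Llong loop saves to and restores from
 * the stack.
 */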
| |
| #ifdef __LITTLE_ENDIAN__ |
| #define LH lhbrx |
| #define LW lwbrx |
| #define LD ldbrx |
| #define LVS lvsr |
| #define VPERM(_VRT,_VRA,_VRB,_VRC) \ |
| vperm _VRT,_VRB,_VRA,_VRC |
| #else |
| #define LH lhzx |
| #define LW lwzx |
| #define LD ldx |
| #define LVS lvsl |
| #define VPERM(_VRT,_VRA,_VRB,_VRC) \ |
| vperm _VRT,_VRA,_VRB,_VRC |
| #endif |
| |
| #define VMX_THRESH 4096 |
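/*
 * ENTER_VMX_OPS/EXIT_VMX_OPS preserve the memcmp arguments (r3, r4, r5)
 * and LR across the calls to enter_vmx_ops()/exit_vmx_ops().
 * ENTER_VMX_OPS also compares the enter_vmx_ops() return value against
 * zero in cr1: cr1.eq means VMX cannot be used and the caller should
 * fall back to the scalar path.
 */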
| #define ENTER_VMX_OPS \ |
| mflr r0; \ |
| std r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \ |
| std r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \ |
| std r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \ |
| std r0,16(r1); \ |
| stdu r1,-STACKFRAMESIZE(r1); \ |
| bl enter_vmx_ops; \ |
| cmpwi cr1,r3,0; \ |
| ld r0,STACKFRAMESIZE+16(r1); \ |
| ld r3,STK_REG(R31)(r1); \ |
| ld r4,STK_REG(R30)(r1); \ |
| ld r5,STK_REG(R29)(r1); \ |
| addi r1,r1,STACKFRAMESIZE; \ |
| mtlr r0 |
| |
| #define EXIT_VMX_OPS \ |
| mflr r0; \ |
| std r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \ |
| std r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \ |
| std r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \ |
| std r0,16(r1); \ |
| stdu r1,-STACKFRAMESIZE(r1); \ |
| bl exit_vmx_ops; \ |
| ld r0,STACKFRAMESIZE+16(r1); \ |
| ld r3,STK_REG(R31)(r1); \ |
| ld r4,STK_REG(R30)(r1); \ |
| ld r5,STK_REG(R29)(r1); \ |
| addi r1,r1,STACKFRAMESIZE; \ |
| mtlr r0 |
| |
| /* |
 * LD_VSR_CROSS16B loads the 2nd 16 bytes for _vaddr, which is not aligned to
 * a 16-byte boundary, and permutes the result with the 1st 16 bytes.
 *
 * | y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
 *    ^                                ^                                ^
 *    0xbbbb10                         0xbbbb20                         0xbbbb30
 *                             ^
 *                             _vaddr
 *
 *
 * _vmask is the mask generated by LVS
 * _v1st_qw is the 1st aligned QW of the current addr, which is already loaded.
 *   for example: 0xyyyyyyyyyyyyy012 for big endian
 * _v2nd_qw is the 2nd aligned QW of the current _vaddr, to be loaded.
 *   for example: 0x3456789abcdefzzz for big endian
| * The permute result is saved in _v_res. |
| * for example: 0x0123456789abcdef for big endian. |
| */ |
| #define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \ |
| lvx _v2nd_qw,_vaddr,off16; \ |
| VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask) |
| |
| /* |
 * memcmp handles two cases:
 * 1) src/dst have the same offset from an 8-byte boundary. The handlers
 *    are named .Lsameoffset_xxxx.
 * 2) src/dst have different offsets from an 8-byte boundary. The handlers
 *    are named .Ldiffoffset_xxxx.
| */ |
| _GLOBAL_TOC(memcmp) |
| cmpdi cr1,r5,0 |
| |
	/* Check whether the src/dst addresses have the same offset from an
	 * 8-byte boundary; the result (cr0) is consumed at .Lno_short below.
	 */
| xor r6,r3,r4 |
| andi. r6,r6,7 |
| |
	/* Fall back to the byte-by-byte short loop if there are fewer
	 * than 8 bytes to compare.
	 */
| cmpdi cr6,r5,7 |
| |
| beq cr1,.Lzero |
| bgt cr6,.Lno_short |
| |
| .Lshort: |
| mtctr r5 |
| 1: lbz rA,0(r3) |
| lbz rB,0(r4) |
| subf. rC,rB,rA |
| bne .Lnon_zero |
| bdz .Lzero |
| |
| lbz rA,1(r3) |
| lbz rB,1(r4) |
| subf. rC,rB,rA |
| bne .Lnon_zero |
| bdz .Lzero |
| |
| lbz rA,2(r3) |
| lbz rB,2(r4) |
| subf. rC,rB,rA |
| bne .Lnon_zero |
| bdz .Lzero |
| |
| lbz rA,3(r3) |
| lbz rB,3(r4) |
| subf. rC,rB,rA |
| bne .Lnon_zero |
| |
| addi r3,r3,4 |
| addi r4,r4,4 |
| |
| bdnz 1b |
| |
| .Lzero: |
| li r3,0 |
| blr |
| |
| .Lno_short: |
| dcbt 0,r3 |
| dcbt 0,r4 |
| bne .Ldiffoffset_8bytes_make_align_start |
| |
| |
| .Lsameoffset_8bytes_make_align_start: |
	/* Compare the leading bytes up to the next 8-byte boundary so that
	 * the rest of the comparison can run on 8-byte aligned addresses.
	 */
| andi. r6,r3,7 |
| |
	/* Try to compare the first double word, which is not 8-byte aligned:
	 * load the double word at (src & ~7UL) and shift left by the
	 * appropriate number of bits before the comparison.
	 */
| rlwinm r6,r3,3,26,28 |
| beq .Lsameoffset_8bytes_aligned |
| clrrdi r3,r3,3 |
| clrrdi r4,r4,3 |
| LD rA,0,r3 |
| LD rB,0,r4 |
| sld rA,rA,r6 |
| sld rB,rB,r6 |
| cmpld cr0,rA,rB |
| srwi r6,r6,3 |
| bne cr0,.LcmpAB_lightweight |
| subfic r6,r6,8 |
| subf. r5,r6,r5 |
| addi r3,r3,8 |
| addi r4,r4,8 |
| beq .Lzero |
| |
| .Lsameoffset_8bytes_aligned: |
	/* Now we are 8-byte aligned.
	 * Use the .Llong loop if 32 or more bytes remain to be compared.
	 */
| cmpdi cr6,r5,31 |
| bgt cr6,.Llong |
| |
| .Lcmp_lt32bytes: |
	/* compare 1 ~ 31 bytes; at least the r3 address is 8-byte aligned now */
| cmpdi cr5,r5,7 |
| srdi r0,r5,3 |
| ble cr5,.Lcmp_rest_lt8bytes |
| |
	/* handle 8 ~ 31 bytes: loop over len/8 double words (ctr),
	 * keeping len%8 in r5 for the tail
	 */
| clrldi r5,r5,61 |
| mtctr r0 |
| 2: |
| LD rA,0,r3 |
| LD rB,0,r4 |
| cmpld cr0,rA,rB |
| addi r3,r3,8 |
| addi r4,r4,8 |
| bne cr0,.LcmpAB_lightweight |
| bdnz 2b |
| |
| cmpwi r5,0 |
| beq .Lzero |
| |
| .Lcmp_rest_lt8bytes: |
	/*
	 * Here we have less than 8 bytes to compare. At least s1 is aligned to
	 * 8 bytes, but s2 may not be. We must make sure s2 + 7 doesn't cross a
	 * page boundary, otherwise we might read past the end of the buffer and
	 * trigger a page fault. We use 4K as the conservative minimum page
	 * size. If we detect that case we go to the byte-by-byte loop.
	 *
	 * Otherwise the next double word is loaded from s1 and s2, and shifted
	 * right to compare the appropriate bits.
	 */
	clrldi	r6,r4,(64-12)	// r6 = r4 & 0xfff
	cmpdi	r6,0xff8
	bgt	.Lshort

| subfic r6,r5,8 |
| slwi r6,r6,3 |
| LD rA,0,r3 |
| LD rB,0,r4 |
| srd rA,rA,r6 |
| srd rB,rB,r6 |
| cmpld cr0,rA,rB |
| bne cr0,.LcmpAB_lightweight |
| b .Lzero |
| |
| .Lnon_zero: |
| mr r3,rC |
| blr |
| |
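/*
 * .Llong compares 32 bytes per iteration using four double-word pairs
 * (rA/rB, rC/rD, rE/rF, rG/rH).  The loads and compares are software
 * pipelined across cr0/cr1/cr6/cr7, so a mismatch in any pair branches
 * to its .LcmpXY handler.  r0 = len/32 iterations are run and r5 keeps
 * len%32 for the .Lshort tail.  r27-r31 are saved because rD-rH live
 * in non-volatile registers.
 */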
| .Llong: |
| #ifdef CONFIG_ALTIVEC |
| BEGIN_FTR_SECTION |
	/* Use the vmx loop if the length is 4K or more */
| cmpldi cr6,r5,VMX_THRESH |
| bge cr6,.Lsameoffset_vmx_cmp |
| END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) |
| |
| .Llong_novmx_cmp: |
| #endif |
| /* At least s1 addr is aligned with 8 bytes */ |
| li off8,8 |
| li off16,16 |
| li off24,24 |
| |
| std r31,-8(r1) |
| std r30,-16(r1) |
| std r29,-24(r1) |
| std r28,-32(r1) |
| std r27,-40(r1) |
| |
| srdi r0,r5,5 |
| mtctr r0 |
| andi. r5,r5,31 |
| |
| LD rA,0,r3 |
| LD rB,0,r4 |
| |
| LD rC,off8,r3 |
| LD rD,off8,r4 |
| |
| LD rE,off16,r3 |
| LD rF,off16,r4 |
| |
| LD rG,off24,r3 |
| LD rH,off24,r4 |
| cmpld cr0,rA,rB |
| |
| addi r3,r3,32 |
| addi r4,r4,32 |
| |
| bdz .Lfirst32 |
| |
| LD rA,0,r3 |
| LD rB,0,r4 |
| cmpld cr1,rC,rD |
| |
| LD rC,off8,r3 |
| LD rD,off8,r4 |
| cmpld cr6,rE,rF |
| |
| LD rE,off16,r3 |
| LD rF,off16,r4 |
| cmpld cr7,rG,rH |
| bne cr0,.LcmpAB |
| |
| LD rG,off24,r3 |
| LD rH,off24,r4 |
| cmpld cr0,rA,rB |
| bne cr1,.LcmpCD |
| |
| addi r3,r3,32 |
| addi r4,r4,32 |
| |
| bdz .Lsecond32 |
| |
| .balign 16 |
| |
| 1: LD rA,0,r3 |
| LD rB,0,r4 |
| cmpld cr1,rC,rD |
| bne cr6,.LcmpEF |
| |
| LD rC,off8,r3 |
| LD rD,off8,r4 |
| cmpld cr6,rE,rF |
| bne cr7,.LcmpGH |
| |
| LD rE,off16,r3 |
| LD rF,off16,r4 |
| cmpld cr7,rG,rH |
| bne cr0,.LcmpAB |
| |
| LD rG,off24,r3 |
| LD rH,off24,r4 |
| cmpld cr0,rA,rB |
| bne cr1,.LcmpCD |
| |
| addi r3,r3,32 |
| addi r4,r4,32 |
| |
| bdnz 1b |
| |
| .Lsecond32: |
| cmpld cr1,rC,rD |
| bne cr6,.LcmpEF |
| |
| cmpld cr6,rE,rF |
| bne cr7,.LcmpGH |
| |
| cmpld cr7,rG,rH |
| bne cr0,.LcmpAB |
| |
| bne cr1,.LcmpCD |
| bne cr6,.LcmpEF |
| bne cr7,.LcmpGH |
| |
| .Ltail: |
| ld r31,-8(r1) |
| ld r30,-16(r1) |
| ld r29,-24(r1) |
| ld r28,-32(r1) |
| ld r27,-40(r1) |
| |
| cmpdi r5,0 |
| beq .Lzero |
| b .Lshort |
| |
| .Lfirst32: |
| cmpld cr1,rC,rD |
| cmpld cr6,rE,rF |
| cmpld cr7,rG,rH |
| |
| bne cr0,.LcmpAB |
| bne cr1,.LcmpCD |
| bne cr6,.LcmpEF |
| bne cr7,.LcmpGH |
| |
| b .Ltail |
| |
| .LcmpAB: |
| li r3,1 |
| bgt cr0,.Lout |
| li r3,-1 |
| b .Lout |
| |
| .LcmpCD: |
| li r3,1 |
| bgt cr1,.Lout |
| li r3,-1 |
| b .Lout |
| |
| .LcmpEF: |
| li r3,1 |
| bgt cr6,.Lout |
| li r3,-1 |
| b .Lout |
| |
| .LcmpGH: |
| li r3,1 |
| bgt cr7,.Lout |
| li r3,-1 |
| |
| .Lout: |
| ld r31,-8(r1) |
| ld r30,-16(r1) |
| ld r29,-24(r1) |
| ld r28,-32(r1) |
| ld r27,-40(r1) |
| blr |
| |
.LcmpAB_lightweight:	/* skip non-volatile GPR restore */
| li r3,1 |
| bgtlr |
| li r3,-1 |
| blr |
| |
| #ifdef CONFIG_ALTIVEC |
| .Lsameoffset_vmx_cmp: |
	/* Entered with src/dst addresses that have the same offset from an
	 * 8-byte boundary.
	 *
	 * There is an optimization based on the following observation:
	 * memcmp() tends to find a difference within the first 32 bytes.
	 * Before using VMX instructions, which incur a 32 x 128-bit VMX
	 * register save/restore penalty, compare the first 32 bytes with
	 * scalar loads so that the ~80% of calls which differ early avoid
	 * that cost.
	 */
| |
| li r0,4 |
| mtctr r0 |
| .Lsameoffset_prechk_32B_loop: |
| LD rA,0,r3 |
| LD rB,0,r4 |
| cmpld cr0,rA,rB |
| addi r3,r3,8 |
| addi r4,r4,8 |
| bne cr0,.LcmpAB_lightweight |
| addi r5,r5,-8 |
| bdnz .Lsameoffset_prechk_32B_loop |
| |
| ENTER_VMX_OPS |
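	/* cr1 was set by ENTER_VMX_OPS from the enter_vmx_ops() return
	 * value; zero means VMX cannot be used here, so fall back to the
	 * scalar loop.
	 */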
| beq cr1,.Llong_novmx_cmp |
| |
| 3: |
	/* Check whether r4 has the same offset as r3 from a 16-byte
	 * boundary.
	 */
| xor r0,r3,r4 |
| andi. r0,r0,0xf |
| bne .Ldiffoffset_vmx_cmp_start |
| |
	/* len is at least 4KB (less the 32-byte pre-check). Align the
	 * addresses further, to a 16-byte boundary.
	 */
| andi. rA,r3,8 |
| LD rA,0,r3 |
| beq 4f |
| LD rB,0,r4 |
| cmpld cr0,rA,rB |
| addi r3,r3,8 |
| addi r4,r4,8 |
| addi r5,r5,-8 |
| |
| beq cr0,4f |
| /* save and restore cr0 */ |
| mfocrf r5,128 |
| EXIT_VMX_OPS |
| mtocrf 128,r5 |
| b .LcmpAB_lightweight |
| |
| 4: |
	/* compare 32 bytes per loop iteration */
| srdi r0,r5,5 |
| mtctr r0 |
| clrldi r5,r5,59 |
| li off16,16 |
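	/* vcmpequd. sets cr6; the "all elements equal" result is in the LT
	 * position, so bnl cr6 (LT clear) branches out as soon as a 16-byte
	 * chunk differs.  r5 keeps len%32 for the scalar tail at
	 * .Lcmp_lt32bytes.
	 */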
| |
| .balign 16 |
| 5: |
| lvx v0,0,r3 |
| lvx v1,0,r4 |
| VCMPEQUD_RC(v0,v0,v1) |
| bnl cr6,7f |
| lvx v0,off16,r3 |
| lvx v1,off16,r4 |
| VCMPEQUD_RC(v0,v0,v1) |
| bnl cr6,6f |
| addi r3,r3,32 |
| addi r4,r4,32 |
| bdnz 5b |
| |
| EXIT_VMX_OPS |
| cmpdi r5,0 |
| beq .Lzero |
| b .Lcmp_lt32bytes |
| |
| 6: |
| addi r3,r3,16 |
| addi r4,r4,16 |
| |
| 7: |
	/* find the differing double word within the last 16 bytes */
| EXIT_VMX_OPS |
| LD rA,0,r3 |
| LD rB,0,r4 |
| cmpld cr0,rA,rB |
| li off8,8 |
| bne cr0,.LcmpAB_lightweight |
| |
| LD rA,off8,r3 |
| LD rB,off8,r4 |
| cmpld cr0,rA,rB |
| bne cr0,.LcmpAB_lightweight |
| b .Lzero |
| #endif |
| |
| .Ldiffoffset_8bytes_make_align_start: |
| /* now try to align s1 with 8 bytes */ |
| rlwinm r6,r3,3,26,28 |
| beq .Ldiffoffset_align_s1_8bytes |
| |
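	/* Compare the leading 8 - (s1 & 7) bytes: rA is the aligned double
	 * word at s1 with the bytes that precede s1 masked off (sld then
	 * srd), rB is an unaligned double word load at s2 shifted right by
	 * the same amount so that the two line up.  Afterwards s1 is 8-byte
	 * aligned and s2 has advanced by the same number of bytes.
	 */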
| clrrdi r3,r3,3 |
| LD rA,0,r3 |
| LD rB,0,r4 /* unaligned load */ |
| sld rA,rA,r6 |
| srd rA,rA,r6 |
| srd rB,rB,r6 |
| cmpld cr0,rA,rB |
| srwi r6,r6,3 |
| bne cr0,.LcmpAB_lightweight |
| |
| subfic r6,r6,8 |
| subf. r5,r6,r5 |
| addi r3,r3,8 |
| add r4,r4,r6 |
| |
| beq .Lzero |
| |
| .Ldiffoffset_align_s1_8bytes: |
| /* now s1 is aligned with 8 bytes. */ |
| #ifdef CONFIG_ALTIVEC |
| BEGIN_FTR_SECTION |
	/* only use vmx ops when the size is 4K bytes or more */
| cmpdi cr5,r5,VMX_THRESH |
| bge cr5,.Ldiffoffset_vmx_cmp |
| END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) |
| |
| .Ldiffoffset_novmx_cmp: |
| #endif |
| |
| |
| cmpdi cr5,r5,31 |
| ble cr5,.Lcmp_lt32bytes |
| |
| #ifdef CONFIG_ALTIVEC |
| b .Llong_novmx_cmp |
| #else |
| b .Llong |
| #endif |
| |
| #ifdef CONFIG_ALTIVEC |
| .Ldiffoffset_vmx_cmp: |
	/* perform a 32-byte pre-check before
	 * enabling VMX operations.
	 */
| li r0,4 |
| mtctr r0 |
| .Ldiffoffset_prechk_32B_loop: |
| LD rA,0,r3 |
| LD rB,0,r4 |
| cmpld cr0,rA,rB |
| addi r3,r3,8 |
| addi r4,r4,8 |
| bne cr0,.LcmpAB_lightweight |
| addi r5,r5,-8 |
| bdnz .Ldiffoffset_prechk_32B_loop |
| |
| ENTER_VMX_OPS |
| beq cr1,.Ldiffoffset_novmx_cmp |
| |
| .Ldiffoffset_vmx_cmp_start: |
	/* First try to align r3 to a 16-byte boundary */
| andi. r6,r3,0xf |
| li off16,16 |
| beq .Ldiffoffset_vmx_s1_16bytes_align |
| |
| LVS v3,0,r3 |
| LVS v4,0,r4 |
| |
| lvx v5,0,r3 |
| lvx v6,0,r4 |
| LD_VSR_CROSS16B(r3,v3,v5,v7,v9) |
| LD_VSR_CROSS16B(r4,v4,v6,v8,v10) |
| |
| VCMPEQUB_RC(v7,v9,v10) |
| bnl cr6,.Ldiffoffset_vmx_diff_found |
| |
| subfic r6,r6,16 |
| subf r5,r6,r5 |
| add r3,r3,r6 |
| add r4,r4,r6 |
| |
| .Ldiffoffset_vmx_s1_16bytes_align: |
| /* now s1 is aligned with 16 bytes */ |
| lvx v6,0,r4 |
| LVS v4,0,r4 |
| srdi r6,r5,5 /* loop for 32 bytes each */ |
| clrldi r5,r5,59 |
| mtctr r6 |
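	/* Main loop: 32 bytes per iteration.  s1 is read with aligned lvx
	 * loads; s2 is read via LD_VSR_CROSS16B, which merges two aligned
	 * quadwords with vperm and carries the trailing quadword forward in
	 * v6, so each 16-byte step needs only one new load from s2.
	 */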
| |
| .balign 16 |
| .Ldiffoffset_vmx_32bytesloop: |
| /* the first qw of r4 was saved in v6 */ |
| lvx v9,0,r3 |
| LD_VSR_CROSS16B(r4,v4,v6,v8,v10) |
| VCMPEQUB_RC(v7,v9,v10) |
| vor v6,v8,v8 |
| bnl cr6,.Ldiffoffset_vmx_diff_found |
| |
| addi r3,r3,16 |
| addi r4,r4,16 |
| |
| lvx v9,0,r3 |
| LD_VSR_CROSS16B(r4,v4,v6,v8,v10) |
| VCMPEQUB_RC(v7,v9,v10) |
| vor v6,v8,v8 |
| bnl cr6,.Ldiffoffset_vmx_diff_found |
| |
| addi r3,r3,16 |
| addi r4,r4,16 |
| |
| bdnz .Ldiffoffset_vmx_32bytesloop |
| |
| EXIT_VMX_OPS |
| |
| cmpdi r5,0 |
| beq .Lzero |
| b .Lcmp_lt32bytes |
| |
| .Ldiffoffset_vmx_diff_found: |
| EXIT_VMX_OPS |
	/* in any case, the difference lies within the next 16 bytes */
| li r5,16 |
| b .Lcmp_lt32bytes |
| |
| #endif |
| EXPORT_SYMBOL(memcmp) |