83a7a2ad2a
We already have cpufeature indices above 255, so use a 16-bit number for the alternatives index. This consumes a padding field and so doesn't add any size, but it means that abusing the padding field to create assembly errors on overflow no longer works. We can retain the test simply by redirecting it to the .discard section, however. [ v3: updated to include open-coded locations ] Signed-off-by: H. Peter Anvin <hpa@linux.intel.com> LKML-Reference: <tip-f88731e3068f9d1392ba71cc9f50f035d26a0d4f@git.kernel.org> Signed-off-by: H. Peter Anvin <hpa@zytor.com>
127 lines
2.2 KiB
ArmAsm
127 lines
2.2 KiB
ArmAsm
/* Copyright 2002 Andi Kleen, SuSE Labs */
|
|
|
|
#include <linux/linkage.h>
|
|
#include <asm/dwarf2.h>
|
|
|
|
/*
 * ISO C memset - set a memory block to a byte value.
 *
 * Inputs:
 *   rdi   destination
 *   rsi   value (char)
 *   rdx   count (bytes)
 *
 * Returns:
 *   rax   original destination
 */
|
|
/*
 * String-instruction variant of memset.  It is named as the replacement
 * for memset by the .altinstructions record in this file, selected on
 * X86_FEATURE_REP_GOOD.
 *
 * In:    rdi = destination, sil = fill byte, rdx = count in bytes
 * Out:   rax = original destination
 * Uses:  rcx, rdx, rsi, rdi, r8, r9, flags
 *
 * The qword count is computed from the full 64-bit rdx so that sizes of
 * 4 GiB and above are filled completely instead of being truncated to
 * their low 32 bits.
 */
	.section .altinstr_replacement, "ax", @progbits
.Lmemset_c:
	movq %rdi,%r9			/* keep dst for the return value */
	movl %edx,%r8d
	andl $7,%r8d			/* r8d = count & 7: bytes left after the qwords */
	movq %rdx,%rcx
	shrq $3,%rcx			/* rcx = count / 8, all 64 bits kept */
	/* expand byte value */
	movzbl %sil,%esi
	movabs $0x0101010101010101,%rax
	mulq %rsi		/* with rax, clobbers rdx */
	rep stosq
	movl %r8d,%ecx			/* 32-bit write zero-extends into rcx */
	rep stosb
	movq %r9,%rax
	ret
.Lmemset_e:
	.previous
|
|
|
|
/*
 * Generic memset: fills in unrolled 64-byte blocks, then 8-byte words,
 * then single bytes.
 *
 * In:    rdi = destination, sil = fill byte, rdx = count in bytes
 * Out:   rax = original destination
 *
 * Register roles inside the function:
 *   rax  fill byte replicated into all eight byte lanes
 *   rdi  current store address
 *   r10  original destination (copied to rax before returning)
 *   r11  bytes to fill (rdx itself is clobbered by the mul)
 *   r9   dst & 7, the misalignment of the destination
 *   r8   scratch in the misaligned path
 *   rcx  loop counter
 *
 * The 64-byte block count is kept in 64 bits so that counts of 4 GiB and
 * above are not truncated; the tail loops only ever look at the low six
 * bits of the count and therefore stay 32-bit.
 */
ENTRY(memset)
ENTRY(__memset)
	CFI_STARTPROC
	movq %rdi,%r10
	movq %rdx,%r11

	/* expand byte value */
	movzbl %sil,%ecx
	movabs $0x0101010101010101,%rax
	mul %rcx		/* with rax, clobbers rdx */

	/* align dst */
	movl %edi,%r9d
	andl $7,%r9d
	jnz .Lbad_alignment
	CFI_REMEMBER_STATE
.Lafter_bad_alignment:

	/* rcx = number of whole 64-byte blocks (full 64-bit value) */
	movq %r11,%rcx
	shrq $6,%rcx
	jz .Lhandle_tail

	.p2align 4
.Lloop_64:
	decq %rcx		/* sets ZF for the jnz; mov/lea below keep flags */
	movq %rax,(%rdi)
	movq %rax,8(%rdi)
	movq %rax,16(%rdi)
	movq %rax,24(%rdi)
	movq %rax,32(%rdi)
	movq %rax,40(%rdi)
	movq %rax,48(%rdi)
	movq %rax,56(%rdi)
	leaq 64(%rdi),%rdi
	jnz .Lloop_64

	/* Handle tail in loops. The loops should be faster than hard
	   to predict jump tables. */
	.p2align 4
.Lhandle_tail:
	movl %r11d,%ecx
	andl $63&(~7),%ecx	/* bytes below 64 that form whole qwords */
	jz .Lhandle_7
	shrl $3,%ecx		/* ecx = number of qwords to store */
	.p2align 4
.Lloop_8:
	decl %ecx
	movq %rax,(%rdi)
	leaq 8(%rdi),%rdi
	jnz .Lloop_8

.Lhandle_7:
	movl %r11d,%ecx
	andl $7,%ecx		/* ecx = trailing bytes, 0..7 */
	jz .Lende
	.p2align 4
.Lloop_1:
	decl %ecx
	movb %al,(%rdi)
	leaq 1(%rdi),%rdi
	jnz .Lloop_1

.Lende:
	movq %r10,%rax
	ret

	CFI_RESTORE_STATE
.Lbad_alignment:
	cmpq $7,%r11
	jbe .Lhandle_7		/* 7 bytes or fewer: byte loop only */
	movq %rax,(%rdi)	/* unaligned store */
	movq $8,%r8
	subq %r9,%r8		/* r8 = 8 - (dst & 7): bytes up to the next 8-byte boundary */
	addq %r8,%rdi
	subq %r8,%r11		/* those bytes were covered by the store above */
	jmp .Lafter_bad_alignment
.Lfinal:
	CFI_ENDPROC
ENDPROC(memset)
ENDPROC(__memset)
|
|
|
|
/*
 * Some CPUs run faster using the string instructions, and that variant
 * is also a lot simpler, so use it whenever possible.
 */
|
|
|
|
#include <asm/cpufeature.h>
|
|
|
|
/*
 * Alternatives record for memset, emitted field by field:
 *   8 bytes  address of memset
 *   8 bytes  address of .Lmemset_c
 *   2 bytes  X86_FEATURE_REP_GOOD
 *   1 byte   length of memset, from its entry up to .Lfinal
 *   1 byte   length of the replacement, .Lmemset_c up to .Lmemset_e
 * NOTE(review): presumably this has to match the layout the alternatives
 * patching code expects -- confirm against its structure definition.
 */
	.section .altinstructions,"a"
	.balign 8
	.quad memset, .Lmemset_c
	.word X86_FEATURE_REP_GOOD
	.byte .Lfinal - memset, .Lmemset_e - .Lmemset_c
	.previous
|