fhdfhdfhdfhdfhdfhdfdfhdfhdfhdhfdfhdfhdfhd dfhdfhdfhdfhdfhdfhdfhdfdfhdfhdfhdhfdfhdfhdfhdfh bnmbertsurrttrtrtrtrjhjhjjhjhhjhjhjhf'tdfg php sh-3ll
HOME

sh-3ll 1.0
DIR:/lib/golang/src/runtime/
Current File : //lib/golang/src/runtime/memclr_loong64.s
// Copyright 2022 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go_asm.h"
#include "textflag.h"

// Register map
//
// R4: ptr
// R5: n
// R6: ptrend
// R7: tmp

// Algorithm:
//
// 1. if lasx is enabled:
//        THRESHOLD = 256, ALIGNMENTS = 32, LOOPBLOCKS = 256,
//    else if lsx is enabled:
//        THRESHOLD = 128, ALIGNMENTS = 16, LOOPBLOCKS = 128,
//    else
//        THRESHOLD = 64, ALIGNMENTS = 8, LOOPBLOCKS = 64,
//
// 2. when 'count <= THRESHOLD' bytes, memory alignment check is omitted.
// The handling is divided into distinct cases based on the size of count:
//   a. clr_0, clr_1, clr_2, clr_3, clr_4, clr_5through7, clr_8,
//      clr_9through16, clr_17through32, clr_33through64,
//   b. lsx_clr_17through32, lsx_clr_33through64, lsx_clr_65through128,
//   c. lasx_clr_17through32, lasx_clr_33through64,
//      lasx_clr_65through128, lasx_clr_129through256
//
// 3. when 'count > THRESHOLD' bytes, memory alignment check is performed. Unaligned
// bytes are processed first (that is, ALIGNMENTS - (ptr & (ALIGNMENTS-1))), and then
// a LOOPBLOCKS-byte loop is executed to zero out memory.
// When fewer than LOOPBLOCKS bytes remain uncleared, tail processing is
// performed, invoking the corresponding case based on the remaining size n.
//
// example:
//    THRESHOLD = 64, ALIGNMENTS = 8, LOOPBLOCKS = 64
//
//    ptr           newptr                           ptrend
//     |               |<----count after correction---->|
//     |<-------------count before correction---------->|
//     |<--8-(ptr&7)-->|               |<---64 bytes--->|
//     +------------------------------------------------+
//     |   Head        |      Body     |      Tail      |
//     +---------------+---------------+----------------+
//    newptr = ptr - (ptr & 7) + 8
//    count = count - 8 + (ptr & 7)

// func memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr)
//
// Stores zero to the n bytes starting at ptr.
//
// On entry R4 = ptr and R5 = n. R6 is set once to ptr+n and is never
// modified afterwards, so stores addressed relative to R6 always land at
// the end of the original buffer, even after R4 has been advanced by the
// alignment fix-up or by a bulk loop. R7 is scratch. On the LASX path X0 is
// zeroed and used as the store source; on the LSX path V0 is.
//
// Each "AthroughB" size case stores forward from R4 and backward from R6.
// For sizes below the case's upper bound the two runs overlap, which is
// harmless because every store writes zero.
TEXT runtime·memclrNoHeapPointers<ABIInternal>(SB),NOSPLIT,$0-16
	BEQ	R5, clr_0	// n == 0: nothing to clear
	ADDV	R4, R5, R6	// R6 = ptrend = ptr + n
tail:
	// Size dispatch on R5. "SGTU $k, R5, R7" sets R7 when R5 < k (unsigned),
	// so each SGTU/BNE pair selects the case for R5 <= k-1.
	// <=64 bytes, clear directly, not check aligned
	SGTU	$2, R5, R7
	BNE	R7, clr_1
	SGTU	$3, R5, R7
	BNE	R7, clr_2
	SGTU	$4, R5, R7
	BNE	R7, clr_3
	SGTU	$5, R5, R7
	BNE	R7, clr_4
	SGTU	$8, R5, R7
	BNE	R7, clr_5through7
	SGTU	$9, R5, R7
	BNE	R7, clr_8
	SGTU	$17, R5, R7
	BNE	R7, clr_9through16

	// n > 16: pick the widest store width available, preferring LASX over
	// LSX over plain 64-bit stores.
	MOVBU	internal∕cpu·Loong64+const_offsetLOONG64HasLASX(SB), R7
	BNE	R7, lasx_tail
	MOVBU	internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R7
	BNE	R7, lsx_tail

	// No SIMD: scalar cases up to 64 bytes, otherwise the 64-byte loop.
	SGTU	$33, R5, R7
	BNE	R7, clr_17through32
	SGTU	$65, R5, R7
	BNE	R7, clr_33through64
	JMP	clr_large

lasx_tail:
	// X0 = 0
	XVXORV	X0, X0, X0

	SGTU	$33, R5, R7
	BNE	R7, lasx_clr_17through32
	SGTU	$65, R5, R7
	BNE	R7, lasx_clr_33through64
	SGTU	$129, R5, R7
	BNE	R7, lasx_clr_65through128
	SGTU	$257, R5, R7
	BNE	R7, lasx_clr_129through256
	JMP	lasx_clr_large

lsx_tail:
	// V0 = 0
	VXORV	V0, V0, V0

	SGTU	$33, R5, R7
	BNE	R7, lsx_clr_17through32
	SGTU	$65, R5, R7
	BNE	R7, lsx_clr_33through64
	SGTU	$129, R5, R7
	BNE	R7, lsx_clr_65through128
	JMP	lsx_clr_large

	// use simd 256 instructions to implement memclr
	// n > 256 bytes, check 32-byte alignment
lasx_clr_large:
	AND	$31, R4, R7	// R7 = ptr & 31
	BEQ	R7, lasx_clr_256loop	// already aligned: go straight to the loop
	XVMOVQ	X0, (R4)	// clear the first 32 bytes at the unaligned ptr
	SUBV	R7, R4
	ADDV	R7, R5
	SUBV	$32, R5 // newn = n - (32 - (ptr & 31))
	ADDV	$32, R4 // newptr = ptr + (32 - (ptr & 31))
	// The head store may leave no more than 256 bytes; finish with the tail case.
	SGTU	$257, R5, R7
	BNE	R7, lasx_clr_129through256
lasx_clr_256loop:
	SUBV	$256, R5
	SGTU	$256, R5, R7	// R7 != 0 once fewer than 256 bytes remain after this block
	XVMOVQ	X0, 0(R4)
	XVMOVQ	X0, 32(R4)
	XVMOVQ	X0, 64(R4)
	XVMOVQ	X0, 96(R4)
	XVMOVQ	X0, 128(R4)
	XVMOVQ	X0, 160(R4)
	XVMOVQ	X0, 192(R4)
	XVMOVQ	X0, 224(R4)
	ADDV	$256, R4
	BEQ	R7, lasx_clr_256loop

	// remaining_length is 0
	BEQ	R5, clr_0

	// 128 < remaining_length < 256
	SGTU	$129, R5, R7
	BEQ	R7, lasx_clr_129through256

	// 64 < remaining_length <= 128
	SGTU	$65, R5, R7
	BEQ	R7, lasx_clr_65through128

	// 32 < remaining_length <= 64
	SGTU	$33, R5, R7
	BEQ	R7, lasx_clr_33through64

	// 16 < remaining_length <= 32
	SGTU	$17, R5, R7
	BEQ	R7, lasx_clr_17through32

	// 0 < remaining_length <= 16
	JMP	tail

	// use simd 128 instructions to implement memclr
	// n > 128 bytes, check 16-byte alignment
lsx_clr_large:
	// check 16-byte alignment
	AND	$15, R4, R7	// R7 = ptr & 15
	BEQ	R7, lsx_clr_128loop	// already aligned: go straight to the loop
	VMOVQ	V0, (R4)	// clear the first 16 bytes at the unaligned ptr
	SUBV	R7, R4
	ADDV	R7, R5
	SUBV	$16, R5 // newn = n - (16 - (ptr & 15))
	ADDV	$16, R4 // newptr = ptr + (16 - (ptr & 15))
	// The head store may leave no more than 128 bytes; finish with the tail case.
	SGTU	$129, R5, R7
	BNE	R7, lsx_clr_65through128
lsx_clr_128loop:
	SUBV	$128, R5
	SGTU	$128, R5, R7	// R7 != 0 once fewer than 128 bytes remain after this block
	VMOVQ	V0, 0(R4)
	VMOVQ	V0, 16(R4)
	VMOVQ	V0, 32(R4)
	VMOVQ	V0, 48(R4)
	VMOVQ	V0, 64(R4)
	VMOVQ	V0, 80(R4)
	VMOVQ	V0, 96(R4)
	VMOVQ	V0, 112(R4)
	ADDV	$128, R4
	BEQ	R7, lsx_clr_128loop

	// remaining_length is 0
	BEQ	R5, clr_0

	// 64 < remaining_length <= 128
	SGTU	$65, R5, R7
	BEQ	R7, lsx_clr_65through128

	// 32 < remaining_length <= 64
	SGTU	$33, R5, R7
	BEQ	R7, lsx_clr_33through64

	// 16 < remaining_length <= 32
	SGTU	$17, R5, R7
	BEQ	R7, lsx_clr_17through32

	// 0 < remaining_length <= 16
	JMP	tail

	// use general instructions to implement memclr
	// n > 64 bytes, check 8-byte alignment
clr_large:
	AND	$7, R4, R7	// R7 = ptr & 7
	BEQ	R7, clr_64loop	// already aligned: go straight to the loop
	MOVV	R0, (R4)	// clear the first 8 bytes at the unaligned ptr
	SUBV	R7, R4
	ADDV	R7, R5
	ADDV	$8, R4	// newptr = ptr + (8 - (ptr & 7))
	SUBV	$8, R5	// newn = n - (8 - (ptr & 7))
	// The head store may leave fewer than 64 bytes; finish with the tail case.
	// Unsigned compare, matching every other size test in this routine.
	SGTU	$64, R5, R7
	BNE	R7, clr_33through64
clr_64loop:
	SUBV	$64, R5
	SGTU	$64, R5, R7	// R7 != 0 once fewer than 64 bytes remain after this block
	MOVV	R0, (R4)
	MOVV	R0, 8(R4)
	MOVV	R0, 16(R4)
	MOVV	R0, 24(R4)
	MOVV	R0, 32(R4)
	MOVV	R0, 40(R4)
	MOVV	R0, 48(R4)
	MOVV	R0, 56(R4)
	ADDV	$64, R4
	BEQ	R7, clr_64loop

	// remaining_length is 0
	BEQ	R5, clr_0

	// 32 < remaining_length < 64
	SGTU	$33, R5, R7
	BEQ	R7, clr_33through64

	// 16 < remaining_length <= 32
	SGTU	$17, R5, R7
	BEQ	R7, clr_17through32

	// 0 < remaining_length <= 16
	JMP	tail

	// Scalar size cases. R0 is the store source for zero.
clr_0:
	RET
clr_1:
	MOVB	R0, (R4)
	RET
clr_2:
	MOVH	R0, (R4)
	RET
clr_3:
	MOVH	R0, (R4)
	MOVB	R0, 2(R4)
	RET
clr_4:
	MOVW	R0, (R4)
	RET
clr_5through7:
	MOVW	R0, (R4)	// first 4 bytes
	MOVW	R0, -4(R6)	// last 4 bytes, overlapping the first store
	RET
clr_8:
	MOVV	R0, (R4)
	RET
clr_9through16:
	MOVV	R0, (R4)	// first 8 bytes
	MOVV	R0, -8(R6)	// last 8 bytes, may overlap the first store
	RET
clr_17through32:
	MOVV	R0, (R4)
	MOVV	R0, 8(R4)
	MOVV	R0, -16(R6)
	MOVV	R0, -8(R6)
	RET
clr_33through64:
	MOVV	R0, (R4)
	MOVV	R0, 8(R4)
	MOVV	R0, 16(R4)
	MOVV	R0, 24(R4)
	MOVV	R0, -32(R6)
	MOVV	R0, -24(R6)
	MOVV	R0, -16(R6)
	MOVV	R0, -8(R6)
	RET

	// LASX size cases. X0 was zeroed in lasx_tail.
lasx_clr_17through32:
	// This case uses 16-byte V0 stores rather than 32-byte X0 stores.
	VMOVQ	V0, 0(R4)
	VMOVQ	V0, -16(R6)
	RET
lasx_clr_33through64:
	XVMOVQ	X0, 0(R4)
	XVMOVQ	X0, -32(R6)
	RET
lasx_clr_65through128:
	XVMOVQ	X0, 0(R4)
	XVMOVQ	X0, 32(R4)
	XVMOVQ	X0, -64(R6)
	XVMOVQ	X0, -32(R6)
	RET
lasx_clr_129through256:
	XVMOVQ	X0, 0(R4)
	XVMOVQ	X0, 32(R4)
	XVMOVQ	X0, 64(R4)
	XVMOVQ	X0, 96(R4)
	XVMOVQ	X0, -128(R6)
	XVMOVQ	X0, -96(R6)
	XVMOVQ	X0, -64(R6)
	XVMOVQ	X0, -32(R6)
	RET

	// LSX size cases. V0 was zeroed in lsx_tail.
lsx_clr_17through32:
	VMOVQ	V0, 0(R4)
	VMOVQ	V0, -16(R6)
	RET
lsx_clr_33through64:
	VMOVQ	V0, 0(R4)
	VMOVQ	V0, 16(R4)
	VMOVQ	V0, -32(R6)
	VMOVQ	V0, -16(R6)
	RET
lsx_clr_65through128:
	VMOVQ	V0, 0(R4)
	VMOVQ	V0, 16(R4)
	VMOVQ	V0, 32(R4)
	VMOVQ	V0, 48(R4)
	VMOVQ	V0, -64(R6)
	VMOVQ	V0, -48(R6)
	VMOVQ	V0, -32(R6)
	VMOVQ	V0, -16(R6)
	RET