The following routine is the block-allocation procedure of a fixed-size allocator that I am writing for use as a library. It is designed to be called from C++ (the unmangled symbol name is void * Superblock::alloc_block()). The performance of this routine is critical to the library, and since I have almost no prior experience with assembly, I am looking for suggestions about performance and for any holes in the logic. It seems to work correctly right now, but I'm worried about edge cases.
Execution context: written for the x86-64 System V ABI; the full allocator may be found here: https://github.com/cmura81/experimental-allocator.
alloc_sb_alloc_block.s:
.globl __ZN10Superblock11alloc_blockEv

// -----------------------------------------------------------------------
// void *Superblock::alloc_block()
//
// Pops one block off a free list of the superblock and returns it. The
// private free list is tried first; the public free list is consulted only
// when the private one is empty. free_blocks is decremented only when a
// block is actually handed out.
//
// ABI:    System V AMD64, AT&T syntax. Leaf routine, no stack usage.
// In:     %rdi = this
// Out:    %rax = address of the allocated block, or NULL when both free
//                lists are empty
// Clobb:  %rcx, %rdx, %rsi, flags
//
// Superblock layout (offset from this, size in bytes):
//   this->beginning                     0   8
//   this->last_privately_freed_block    8   8
//   this->last_publicly_freed_block    16   8
//   this->block_size                   24   2   (not used here)
//   this->free_blocks                  26   2
//   this->max_blocks                   28   2   (not used here)
//
// Free-list encoding as consumed here: the first two bytes of a free block
// hold next_block_index; the value 0xFFFF marks the end of the list.
//
// NOTE(review): next_block_index is added to this->beginning unscaled, i.e.
// it is treated as a byte offset. If the free path stores a block *number*
// it must be multiplied by this->block_size first - confirm against the
// code that writes the index.
//
// NOTE(review): both list heads are accessed with plain loads and stores.
// If other threads can push onto the public list concurrently, the public
// pop needs an atomic update (e.g. lock cmpxchg) - confirm against the
// free path.
// -----------------------------------------------------------------------
.equ SB_BEGINNING,     0
.equ SB_LAST_PRIVATE,  8
.equ SB_LAST_PUBLIC,   16
.equ SB_FREE_BLOCKS,   26
.equ NO_NEXT_BLOCK,    0xFFFF

__ZN10Superblock11alloc_blockEv:
        // rcx = &this->last_privately_freed_block
        // rax = allocated_block = head of the private list
        leaq    SB_LAST_PRIVATE(%rdi), %rcx
        movq    (%rcx), %rax
        testq   %rax, %rax
        jz      __ZN10Superblock11alloc_blockEv.i_pubchalloc

        // Shared pop sequence.
        // Invariant on entry: rax = non-NULL head block being returned,
        //                     rcx = address of the list-head field it came from.
__ZN10Superblock11alloc_blockEv.i_pop:
        // this->free_blocks--
        decw    SB_FREE_BLOCKS(%rdi)
        // rdx = next_block_index; the 32-bit destination zero-extends to 64 bits
        movzwl  (%rax), %edx
        // rsi = new head, NULL unless the list continues
        xorl    %esi, %esi
        cmpl    $NO_NEXT_BLOCK, %edx
        je      __ZN10Superblock11alloc_blockEv.i_store
        // new head = this->beginning + next_block_index
        movq    SB_BEGINNING(%rdi), %rsi
        addq    %rdx, %rsi
__ZN10Superblock11alloc_blockEv.i_store:
        // Write the new head back to the SAME list the block was taken from,
        // so an exhausted list always ends up with a NULL head.
        movq    %rsi, (%rcx)
        ret

        // Private list is empty: fall back to the public list.
__ZN10Superblock11alloc_blockEv.i_pubchalloc:
        // rcx = &this->last_publicly_freed_block
        leaq    SB_LAST_PUBLIC(%rdi), %rcx
        movq    (%rcx), %rax
        testq   %rax, %rax
        jnz     __ZN10Superblock11alloc_blockEv.i_pop
        // Both lists are empty; rax is already NULL (it is the value just tested).
        ret