Auto merge of #118264 - lukas-code:optimized-draining, r=the8472
Optimize `VecDeque::drain` for (half-)open ranges
The most common use cases of `VecDeque::drain` consume either the entire queue or elements from the front or back.[^1] This PR makes these operations faster by optimizing the generated code of the destructor of the drain:
* `.drain(..)` is now the same as `.clear()`.
* `.drain(n..)` is now (almost[^2]) the same as `.truncate(n)`.
* `.drain(..n)` is now an efficient "advance" function. This operation is not provided by a dedicated function and optimizing it is my main motivation for this PR.
Previously, all of these cases generated a function call to the destructor of the `DropGuard`, emitting a lot of unused machine code as well as unnecessary branches and loads/stores of stack variables.
There are no algorithmic changes in this PR, but it simplifies the code enough to allow LLVM to recognize the special cases and optimize accordingly. Most notably, it allows elimination of the rather large [`wrap_copy`] function.
Some [rudimentary microbenchmarks][benches] show a performance improvement of **~3x-4x** on my machine for the special cases and roughly equal performance for the general case.
Best reviewed commit by commit.
[^1]: source: GitHub code search: [full range `drain(..)` = 7.5k results][full], [from front `drain(..n)` = 3.2k results][front], [from back `drain(n..)` = 1.6k results][back], [from middle `drain(n..m)` = <500 results][middle]
[^2]: `.drain(0..)` and `.clear()` reset the head to 0, but `.truncate(0)` does not.
[full]: https://github.com/search?type=code&q=%2FVecDeque%28.%7C%5Cn%29%2B%5C.drain%5C%280%3F%5C.%5C.%5C%29%2F+lang%3ARust
[front]: https://github.com/search?type=code&q=%2FVecDeque%28.%7C%5Cn%29%2B%5C.drain%5C%280%3F%5C.%5C.%5B%5E%29%5D.*%5C%29%2F+lang%3ARust
[back]: https://github.com/search?type=code&q=%2FVecDeque%28.%7C%5Cn%29%2B%5C.drain%5C%28%5B%5E0%5D.*%5C.%5C.%5C%29%2F+lang%3ARust
[middle]: https://github.com/search?type=code&q=%2FVecDeque%28.%7C%5Cn%29%2B%5C.drain%5C%28%5B%5E0%5D.*%5C.%5C.%5B%5E%29%5D.*%5C%29%2F+lang%3ARust
[`wrap_copy`]: https://github.com/rust-lang/rust/blob/4fd68eb47b/library/alloc/src/collections/vec_deque/mod.rs#L262-L391
[benches]: https://gist.github.com/lukas-code/c97bd707d074c4cc31f241edbc7fd2a2
<details>
<summary>generated assembly</summary>
before:
```asm
clear:
sub rsp, 40
mov rax, qword ptr [rdi + 24]
mov qword ptr [rdi + 24], 0
mov qword ptr [rsp], rdi
mov qword ptr [rsp + 8], rax
xorps xmm0, xmm0
movups xmmword ptr [rsp + 16], xmm0
mov qword ptr [rsp + 32], rax
test rax, rax
je .LBB1_2
mov rcx, qword ptr [rdi]
mov rdx, qword ptr [rdi + 16]
xor esi, esi
cmp rdx, rcx
cmovae rsi, rcx
sub rdx, rsi
mov rsi, rcx
sub rsi, rdx
lea rdi, [rdx + rax]
cmp rsi, rax
cmovb rdi, rcx
sub rdi, rdx
mov qword ptr [rsp + 16], rdi
mov qword ptr [rsp + 32], 0
.LBB1_2:
mov rdi, rsp
call core::ptr::drop_in_place<<alloc::collections::vec_deque::drain::Drain<T,A> as core::ops::drop::Drop>::drop::DropGuard<i32,alloc::alloc::Global>>
add rsp, 40
ret
truncate:
mov rax, qword ptr [rdi + 24]
sub rax, rsi
jbe .LBB2_2
sub rsp, 40
mov qword ptr [rdi + 24], rsi
mov qword ptr [rsp], rdi
mov qword ptr [rsp + 8], rax
mov rcx, qword ptr [rdi]
mov rdx, qword ptr [rdi + 16]
add rdx, rsi
xor edi, edi
cmp rdx, rcx
cmovae rdi, rcx
mov qword ptr [rsp + 24], 0
sub rdx, rdi
mov rdi, rcx
sub rdi, rdx
lea r8, [rdx + rax]
cmp rdi, rax
cmovb r8, rcx
sub rsi, rdx
add rsi, r8
mov qword ptr [rsp + 16], rsi
mov qword ptr [rsp + 32], 0
mov rdi, rsp
call core::ptr::drop_in_place<<alloc::collections::vec_deque::drain::Drain<T,A> as core::ops::drop::Drop>::drop::DropGuard<i32,alloc::alloc::Global>>
add rsp, 40
.LBB2_2:
ret
advance:
mov rcx, qword ptr [rdi + 24]
mov rax, rcx
sub rax, rsi
jbe .LBB3_1
sub rsp, 40
mov qword ptr [rdi + 24], 0
mov qword ptr [rsp], rdi
mov qword ptr [rsp + 8], rsi
mov qword ptr [rsp + 16], 0
mov qword ptr [rsp + 24], rax
mov qword ptr [rsp + 32], rsi
test rsi, rsi
je .LBB3_6
mov rax, qword ptr [rdi]
mov rcx, qword ptr [rdi + 16]
xor edx, edx
cmp rcx, rax
cmovae rdx, rax
sub rcx, rdx
mov rdx, rax
sub rdx, rcx
lea rdi, [rcx + rsi]
cmp rdx, rsi
cmovb rdi, rax
sub rdi, rcx
mov qword ptr [rsp + 16], rdi
mov qword ptr [rsp + 32], 0
.LBB3_6:
mov rdi, rsp
call core::ptr::drop_in_place<<alloc::collections::vec_deque::drain::Drain<T,A> as core::ops::drop::Drop>::drop::DropGuard<i32,alloc::alloc::Global>>
add rsp, 40
ret
.LBB3_1:
test rcx, rcx
je .LBB3_3
mov qword ptr [rdi + 24], 0
.LBB3_3:
mov qword ptr [rdi + 16], 0
ret
remove:
sub rsp, 40
cmp rdx, rsi
jb .LBB4_5
mov rax, qword ptr [rdi + 24]
mov rcx, rax
sub rcx, rdx
jb .LBB4_6
mov qword ptr [rdi + 24], rsi
mov qword ptr [rsp], rdi
sub rdx, rsi
mov qword ptr [rsp + 8], rdx
mov qword ptr [rsp + 16], rsi
mov qword ptr [rsp + 24], rcx
mov qword ptr [rsp + 32], rdx
je .LBB4_4
mov rax, qword ptr [rdi]
mov rcx, qword ptr [rdi + 16]
add rcx, rsi
xor edi, edi
cmp rcx, rax
cmovae rdi, rax
sub rcx, rdi
mov rdi, rax
sub rdi, rcx
lea r8, [rcx + rdx]
cmp rdi, rdx
cmovb r8, rax
sub rsi, rcx
add rsi, r8
mov qword ptr [rsp + 16], rsi
mov qword ptr [rsp + 32], 0
.LBB4_4:
mov rdi, rsp
call core::ptr::drop_in_place<<alloc::collections::vec_deque::drain::Drain<T,A> as core::ops::drop::Drop>::drop::DropGuard<i32,alloc::alloc::Global>>
add rsp, 40
ret
.LBB4_5:
lea rax, [rip + .L__unnamed_2]
mov rdi, rsi
mov rsi, rdx
mov rdx, rax
call qword ptr [rip + core::slice::index::slice_index_order_fail@GOTPCREL]
.LBB4_6:
lea rcx, [rip + .L__unnamed_2]
mov rdi, rdx
mov rsi, rax
mov rdx, rcx
call qword ptr [rip + core::slice::index::slice_end_index_len_fail@GOTPCREL]
core::ptr::drop_in_place<<alloc::collections::vec_deque::drain::Drain<T,A> as core::ops::drop::Drop>::drop::DropGuard<i32,alloc::alloc::Global>>:
push rbp
push r15
push r14
push r13
push r12
push rbx
sub rsp, 24
mov rsi, qword ptr [rdi + 32]
test rsi, rsi
je .LBB0_2
mov rax, qword ptr [rdi + 16]
add rsi, rax
jb .LBB0_45
.LBB0_2:
mov r13, qword ptr [rdi]
mov rbp, qword ptr [rdi + 8]
mov rbx, qword ptr [r13 + 24]
lea r12, [rbx + rbp]
mov r15, qword ptr [rdi + 24]
lea rsi, [r15 + r12]
test rbx, rbx
je .LBB0_10
test r15, r15
je .LBB0_42
cmp rbx, r15
jbe .LBB0_12
mov r14, qword ptr [r13]
mov rax, qword ptr [r13 + 16]
add r12, rax
xor ecx, ecx
cmp r12, r14
mov rdx, r14
cmovb rdx, rcx
sub r12, rdx
add rbx, rax
cmp rbx, r14
cmovae rcx, r14
sub rbx, rcx
mov rcx, rbx
sub rcx, r12
je .LBB0_42
mov rdi, qword ptr [r13 + 8]
mov rax, rcx
add rax, r14
cmovae rax, rcx
mov r8, r14
sub r8, r12
mov rcx, r14
sub rcx, rbx
mov rdx, r15
sub rdx, r8
mov qword ptr [rsp + 16], rsi
jbe .LBB0_18
cmp rax, r15
jae .LBB0_24
mov rdx, r15
sub rdx, r8
shl rdx, 2
cmp r15, rcx
jbe .LBB0_30
sub r8, rcx
mov qword ptr [rsp], rdi
mov rax, qword ptr [rsp]
lea rdi, [rax + 4*r8]
mov rsi, qword ptr [rsp]
mov qword ptr [rsp + 8], rcx
mov r15, r8
call qword ptr [rip + memmove@GOTPCREL]
sub r14, r15
mov rax, qword ptr [rsp]
lea rsi, [rax + 4*r14]
shl r15, 2
mov rdi, qword ptr [rsp]
mov rdx, r15
call qword ptr [rip + memmove@GOTPCREL]
mov rdi, qword ptr [rsp]
lea rsi, [rdi + 4*r12]
lea rdi, [rdi + 4*rbx]
mov r15, qword ptr [rsp + 8]
jmp .LBB0_36
.LBB0_10:
test r15, r15
je .LBB0_17
mov rax, qword ptr [r13]
sub rsi, rbp
add rbp, qword ptr [r13 + 16]
xor ecx, ecx
cmp rbp, rax
cmovae rcx, rax
sub rbp, rcx
mov qword ptr [r13 + 16], rbp
jmp .LBB0_43
.LBB0_12:
mov rdx, qword ptr [r13 + 16]
mov r15, qword ptr [r13]
lea rax, [rdx + rbp]
xor ecx, ecx
cmp rax, r15
cmovae rcx, r15
mov r12, rax
sub r12, rcx
mov rcx, r12
sub rcx, rdx
je .LBB0_41
mov rdi, qword ptr [r13 + 8]
mov rax, rcx
add rax, r15
cmovae rax, rcx
mov r8, r15
sub r8, rdx
mov rcx, r15
sub rcx, r12
mov r14, rbx
sub r14, r8
mov qword ptr [rsp + 16], rsi
jbe .LBB0_21
cmp rax, rbx
jae .LBB0_26
mov qword ptr [rsp], rdx
mov rdx, rbx
sub rdx, r8
shl rdx, 2
cmp rbx, rcx
jbe .LBB0_32
sub r8, rcx
mov rbx, rdi
lea rdi, [rdi + 4*r8]
mov rsi, rbx
mov qword ptr [rsp + 8], rcx
mov r14, r8
call qword ptr [rip + memmove@GOTPCREL]
sub r15, r14
lea rsi, [rbx + 4*r15]
shl r14, 2
mov rdi, rbx
mov rdx, r14
call qword ptr [rip + memmove@GOTPCREL]
mov rdi, rbx
mov rax, qword ptr [rsp]
lea rsi, [rbx + 4*rax]
lea rdi, [rbx + 4*r12]
mov rbx, qword ptr [rsp + 8]
jmp .LBB0_40
.LBB0_17:
xorps xmm0, xmm0
movups xmmword ptr [r13 + 16], xmm0
jmp .LBB0_44
.LBB0_18:
mov r14, r15
sub r14, rcx
jbe .LBB0_28
cmp rax, r15
jae .LBB0_33
lea rax, [rcx + r12]
sub r15, rcx
lea rsi, [rdi + 4*rax]
shl r15, 2
mov r14, rdi
mov rdx, r15
mov r15, rcx
jmp .LBB0_31
.LBB0_21:
mov r14, rbx
sub r14, rcx
jbe .LBB0_29
cmp rax, rbx
jae .LBB0_34
lea rax, [rcx + rdx]
sub rbx, rcx
lea rsi, [rdi + 4*rax]
shl rbx, 2
mov r14, rdi
mov r15, rdx
mov rdx, rbx
mov rbx, rcx
call qword ptr [rip + memmove@GOTPCREL]
mov rdi, r14
lea rsi, [r14 + 4*r15]
lea rdi, [r14 + 4*r12]
jmp .LBB0_40
.LBB0_24:
sub r15, rcx
jbe .LBB0_35
sub rcx, r8
mov qword ptr [rsp + 8], rcx
lea rsi, [rdi + 4*r12]
mov r12, rdi
lea rdi, [rdi + 4*rbx]
lea rdx, [4*r8]
mov r14, r8
call qword ptr [rip + memmove@GOTPCREL]
add r14, rbx
lea rdi, [r12 + 4*r14]
mov rbx, qword ptr [rsp + 8]
lea rdx, [4*rbx]
mov rsi, r12
call qword ptr [rip + memmove@GOTPCREL]
mov rdi, r12
lea rsi, [r12 + 4*rbx]
jmp .LBB0_36
.LBB0_26:
sub rbx, rcx
jbe .LBB0_37
sub rcx, r8
lea rsi, [rdi + 4*rdx]
mov r15, rdi
lea rdi, [rdi + 4*r12]
lea rdx, [4*r8]
mov r14, rcx
mov qword ptr [rsp], r8
call qword ptr [rip + memmove@GOTPCREL]
add r12, qword ptr [rsp]
lea rdi, [r15 + 4*r12]
lea rdx, [4*r14]
mov rsi, r15
call qword ptr [rip + memmove@GOTPCREL]
mov rdi, r15
lea rsi, [r15 + 4*r14]
jmp .LBB0_40
.LBB0_28:
lea rsi, [rdi + 4*r12]
lea rdi, [rdi + 4*rbx]
jmp .LBB0_36
.LBB0_29:
lea rsi, [rdi + 4*rdx]
lea rdi, [rdi + 4*r12]
jmp .LBB0_40
.LBB0_30:
lea rax, [r8 + rbx]
mov r14, rdi
lea rdi, [rdi + 4*rax]
mov rsi, r14
mov r15, r8
.LBB0_31:
call qword ptr [rip + memmove@GOTPCREL]
mov rdi, r14
lea rsi, [r14 + 4*r12]
lea rdi, [r14 + 4*rbx]
jmp .LBB0_36
.LBB0_32:
lea rax, [r12 + r8]
mov rbx, rdi
lea rdi, [rdi + 4*rax]
mov rsi, rbx
mov r14, r8
call qword ptr [rip + memmove@GOTPCREL]
mov rdi, rbx
mov rax, qword ptr [rsp]
lea rsi, [rbx + 4*rax]
jmp .LBB0_38
.LBB0_33:
lea rsi, [rdi + 4*r12]
mov r15, rdi
lea rdi, [rdi + 4*rbx]
lea rdx, [4*rcx]
mov rbx, rcx
call qword ptr [rip + memmove@GOTPCREL]
mov rdi, r15
add rbx, r12
lea rsi, [r15 + 4*rbx]
mov r15, r14
jmp .LBB0_36
.LBB0_34:
lea rsi, [rdi + 4*rdx]
mov rbx, rdi
lea rdi, [rdi + 4*r12]
mov r15, rdx
lea rdx, [4*rcx]
mov r12, rcx
call qword ptr [rip + memmove@GOTPCREL]
mov rdi, rbx
add r12, r15
lea rsi, [rbx + 4*r12]
jmp .LBB0_39
.LBB0_35:
lea rsi, [rdi + 4*r12]
mov r14, rdi
lea rdi, [rdi + 4*rbx]
mov r12, rdx
lea rdx, [4*r8]
mov r15, r8
call qword ptr [rip + memmove@GOTPCREL]
add r15, rbx
mov rsi, r14
lea rdi, [r14 + 4*r15]
mov r15, r12
.LBB0_36:
shl r15, 2
mov rdx, r15
call qword ptr [rip + memmove@GOTPCREL]
mov rsi, qword ptr [rsp + 16]
jmp .LBB0_42
.LBB0_37:
lea rsi, [rdi + 4*rdx]
mov rbx, rdi
lea rdi, [rdi + 4*r12]
lea rdx, [4*r8]
mov r15, r8
call qword ptr [rip + memmove@GOTPCREL]
add r12, r15
mov rsi, rbx
.LBB0_38:
lea rdi, [rbx + 4*r12]
.LBB0_39:
mov rbx, r14
.LBB0_40:
shl rbx, 2
mov rdx, rbx
call qword ptr [rip + memmove@GOTPCREL]
mov r15, qword ptr [r13]
mov rax, qword ptr [r13 + 16]
add rax, rbp
mov rsi, qword ptr [rsp + 16]
.LBB0_41:
xor ecx, ecx
cmp rax, r15
cmovae rcx, r15
sub rax, rcx
mov qword ptr [r13 + 16], rax
.LBB0_42:
sub rsi, rbp
.LBB0_43:
mov qword ptr [r13 + 24], rsi
.LBB0_44:
add rsp, 24
pop rbx
pop r12
pop r13
pop r14
pop r15
pop rbp
ret
.LBB0_45:
lea rdx, [rip + .L__unnamed_1]
mov rdi, rax
call qword ptr [rip + core::slice::index::slice_index_order_fail@GOTPCREL]
```
after:
```asm
clear:
xorps xmm0, xmm0
movups xmmword ptr [rdi + 16], xmm0
ret
truncate:
cmp qword ptr [rdi + 24], rsi
jbe .LBB2_4
test rsi, rsi
jne .LBB2_3
mov qword ptr [rdi + 16], 0
.LBB2_3:
mov qword ptr [rdi + 24], rsi
.LBB2_4:
ret
advance:
mov rcx, qword ptr [rdi + 24]
mov rax, rcx
sub rax, rsi
jbe .LBB3_1
mov rcx, qword ptr [rdi]
add rsi, qword ptr [rdi + 16]
xor edx, edx
cmp rsi, rcx
cmovae rdx, rcx
sub rsi, rdx
mov qword ptr [rdi + 16], rsi
mov qword ptr [rdi + 24], rax
ret
.LBB3_1:
test rcx, rcx
je .LBB3_3
mov qword ptr [rdi + 24], 0
.LBB3_3:
mov qword ptr [rdi + 16], 0
ret
remove:
push rbp
push r15
push r14
push r13
push r12
push rbx
push rax
mov r15, rsi
mov r14, rdx
sub r14, rsi
jb .LBB4_9
mov rbx, rdi
mov r12, qword ptr [rdi + 24]
mov r13, r12
sub r13, rdx
jb .LBB4_10
mov qword ptr [rbx + 24], r15
mov rbp, r12
sub rbp, r14
test r15, r15
je .LBB4_4
cmp rbp, r15
jne .LBB4_11
.LBB4_4:
cmp r12, r14
jne .LBB4_6
.LBB4_5:
mov qword ptr [rbx + 16], 0
jmp .LBB4_8
.LBB4_11:
mov rdi, rbx
mov rsi, r14
mov rdx, r15
mov rcx, r13
call <<alloc::collections::vec_deque::drain::Drain<T,A> as core::ops::drop::Drop>::drop::DropGuard<T,A> as core::ops::drop::Drop>::drop::copy_data
cmp r12, r14
je .LBB4_5
.LBB4_6:
cmp r13, r15
jbe .LBB4_8
mov rax, qword ptr [rbx]
add r14, qword ptr [rbx + 16]
xor ecx, ecx
cmp r14, rax
cmovae rcx, rax
sub r14, rcx
mov qword ptr [rbx + 16], r14
.LBB4_8:
mov qword ptr [rbx + 24], rbp
add rsp, 8
pop rbx
pop r12
pop r13
pop r14
pop r15
pop rbp
ret
.LBB4_9:
lea rax, [rip + .L__unnamed_1]
mov rdi, r15
mov rsi, rdx
mov rdx, rax
call qword ptr [rip + core::slice::index::slice_index_order_fail@GOTPCREL]
.LBB4_10:
lea rax, [rip + .L__unnamed_1]
mov rdi, rdx
mov rsi, r12
mov rdx, rax
call qword ptr [rip + core::slice::index::slice_end_index_len_fail@GOTPCREL]
<<alloc::collections::vec_deque::drain::Drain<T,A> as core::ops::drop::Drop>::drop::DropGuard<T,A> as core::ops::drop::Drop>::drop::copy_data:
push rbp
push r15
push r14
push r13
push r12
push rbx
push rax
mov r14, rsi
cmp rdx, rcx
jae .LBB0_1
mov r12, qword ptr [rdi]
mov rax, qword ptr [rdi + 16]
add r14, rax
xor ecx, ecx
cmp r14, r12
cmovae rcx, r12
sub r14, rcx
mov r15, rdx
mov r13, r14
mov r14, rax
mov rcx, r13
sub rcx, r14
je .LBB0_18
.LBB0_4:
mov rdi, qword ptr [rdi + 8]
mov rax, rcx
add rax, r12
cmovae rax, rcx
mov rbx, r12
sub rbx, r14
mov rcx, r12
sub rcx, r13
mov rbp, r15
sub rbp, rbx
jbe .LBB0_5
cmp rax, r15
jae .LBB0_12
mov rdx, r15
sub rdx, rbx
shl rdx, 2
cmp r15, rcx
jbe .LBB0_16
sub rbx, rcx
mov rbp, rdi
lea rdi, [rdi + 4*rbx]
mov r15, qword ptr [rip + memmove@GOTPCREL]
mov rsi, rbp
mov qword ptr [rsp], rcx
call r15
sub r12, rbx
lea rsi, [4*r12]
add rsi, rbp
shl rbx, 2
mov rdi, rbp
mov rdx, rbx
call r15
mov rdi, rbp
lea rsi, [4*r14]
add rsi, rbp
lea rdi, [4*r13]
add rdi, rbp
mov r15, qword ptr [rsp]
jmp .LBB0_7
.LBB0_1:
mov r15, rcx
add r14, rdx
mov r12, qword ptr [rdi]
mov r13, qword ptr [rdi + 16]
add r14, r13
xor eax, eax
cmp r14, r12
mov rcx, r12
cmovb rcx, rax
sub r14, rcx
add r13, rdx
cmp r13, r12
cmovae rax, r12
sub r13, rax
mov rcx, r13
sub rcx, r14
jne .LBB0_4
.LBB0_18:
add rsp, 8
pop rbx
pop r12
pop r13
pop r14
pop r15
pop rbp
ret
.LBB0_5:
mov rbx, r15
sub rbx, rcx
jbe .LBB0_6
cmp rax, r15
jae .LBB0_9
lea rax, [rcx + r14]
sub r15, rcx
lea rsi, [rdi + 4*rax]
shl r15, 2
mov rbx, rdi
mov rdx, r15
mov r15, rcx
call qword ptr [rip + memmove@GOTPCREL]
mov rdi, rbx
lea rsi, [rbx + 4*r14]
lea rdi, [rbx + 4*r13]
jmp .LBB0_7
.LBB0_12:
sub r15, rcx
jbe .LBB0_13
sub rcx, rbx
lea rsi, [rdi + 4*r14]
mov r12, rdi
lea rdi, [rdi + 4*r13]
lea rdx, [4*rbx]
mov r14, qword ptr [rip + memmove@GOTPCREL]
mov rbp, rcx
call r14
add rbx, r13
lea rdi, [r12 + 4*rbx]
lea rdx, [4*rbp]
mov rsi, r12
call r14
mov rdi, r12
lea rsi, [r12 + 4*rbp]
jmp .LBB0_7
.LBB0_6:
lea rsi, [rdi + 4*r14]
lea rdi, [rdi + 4*r13]
jmp .LBB0_7
.LBB0_16:
lea rax, [rbx + r13]
mov r15, rdi
lea rdi, [rdi + 4*rax]
mov rsi, r15
call qword ptr [rip + memmove@GOTPCREL]
mov rdi, r15
lea rsi, [r15 + 4*r14]
lea rdi, [r15 + 4*r13]
mov r15, rbx
jmp .LBB0_7
.LBB0_9:
lea rsi, [rdi + 4*r14]
mov r15, rdi
lea rdi, [rdi + 4*r13]
lea rdx, [4*rcx]
mov r12, rcx
call qword ptr [rip + memmove@GOTPCREL]
mov rdi, r15
add r12, r14
lea rsi, [r15 + 4*r12]
mov r15, rbx
jmp .LBB0_7
.LBB0_13:
lea rsi, [rdi + 4*r14]
mov r14, rdi
lea rdi, [rdi + 4*r13]
lea rdx, [4*rbx]
call qword ptr [rip + memmove@GOTPCREL]
add rbx, r13
mov rsi, r14
lea rdi, [r14 + 4*rbx]
mov r15, rbp
.LBB0_7:
shl r15, 2
mov rdx, r15
add rsp, 8
pop rbx
pop r12
pop r13
pop r14
pop r15
pop rbp
jmp qword ptr [rip + memmove@GOTPCREL]
```
</details>
This commit is contained in:
commit
158f00a1c5
@ -27,8 +27,8 @@ pub struct Drain<
|
||||
drain_len: usize,
|
||||
// index into the logical array, not the physical one (always lies in [0..deque.len))
|
||||
idx: usize,
|
||||
// number of elements after the drain range
|
||||
tail_len: usize,
|
||||
// number of elements remaining after dropping the drain
|
||||
new_len: usize,
|
||||
remaining: usize,
|
||||
// Needed to make Drain covariant over T
|
||||
_marker: PhantomData<&'a T>,
|
||||
@ -41,12 +41,12 @@ pub(super) unsafe fn new(
|
||||
drain_len: usize,
|
||||
) -> Self {
|
||||
let orig_len = mem::replace(&mut deque.len, drain_start);
|
||||
let tail_len = orig_len - drain_start - drain_len;
|
||||
let new_len = orig_len - drain_len;
|
||||
Drain {
|
||||
deque: NonNull::from(deque),
|
||||
drain_len,
|
||||
idx: drain_start,
|
||||
tail_len,
|
||||
new_len,
|
||||
remaining: drain_len,
|
||||
_marker: PhantomData,
|
||||
}
|
||||
@ -79,7 +79,7 @@ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
f.debug_tuple("Drain")
|
||||
.field(&self.drain_len)
|
||||
.field(&self.idx)
|
||||
.field(&self.tail_len)
|
||||
.field(&self.new_len)
|
||||
.field(&self.remaining)
|
||||
.finish()
|
||||
}
|
||||
@ -95,70 +95,9 @@ impl<T, A: Allocator> Drop for Drain<'_, T, A> {
|
||||
fn drop(&mut self) {
|
||||
struct DropGuard<'r, 'a, T, A: Allocator>(&'r mut Drain<'a, T, A>);
|
||||
|
||||
impl<'r, 'a, T, A: Allocator> Drop for DropGuard<'r, 'a, T, A> {
|
||||
fn drop(&mut self) {
|
||||
if self.0.remaining != 0 {
|
||||
unsafe {
|
||||
// SAFETY: We just checked that `self.remaining != 0`.
|
||||
let (front, back) = self.0.as_slices();
|
||||
ptr::drop_in_place(front);
|
||||
ptr::drop_in_place(back);
|
||||
}
|
||||
}
|
||||
|
||||
let source_deque = unsafe { self.0.deque.as_mut() };
|
||||
|
||||
let drain_start = source_deque.len();
|
||||
let drain_len = self.0.drain_len;
|
||||
let drain_end = drain_start + drain_len;
|
||||
|
||||
let orig_len = self.0.tail_len + drain_end;
|
||||
|
||||
if T::IS_ZST {
|
||||
// no need to copy around any memory if T is a ZST
|
||||
source_deque.len = orig_len - drain_len;
|
||||
return;
|
||||
}
|
||||
|
||||
let head_len = drain_start;
|
||||
let tail_len = self.0.tail_len;
|
||||
|
||||
match (head_len, tail_len) {
|
||||
(0, 0) => {
|
||||
source_deque.head = 0;
|
||||
source_deque.len = 0;
|
||||
}
|
||||
(0, _) => {
|
||||
source_deque.head = source_deque.to_physical_idx(drain_len);
|
||||
source_deque.len = orig_len - drain_len;
|
||||
}
|
||||
(_, 0) => {
|
||||
source_deque.len = orig_len - drain_len;
|
||||
}
|
||||
_ => unsafe {
|
||||
if head_len <= tail_len {
|
||||
source_deque.wrap_copy(
|
||||
source_deque.head,
|
||||
source_deque.to_physical_idx(drain_len),
|
||||
head_len,
|
||||
);
|
||||
source_deque.head = source_deque.to_physical_idx(drain_len);
|
||||
source_deque.len = orig_len - drain_len;
|
||||
} else {
|
||||
source_deque.wrap_copy(
|
||||
source_deque.to_physical_idx(head_len + drain_len),
|
||||
source_deque.to_physical_idx(head_len),
|
||||
tail_len,
|
||||
);
|
||||
source_deque.len = orig_len - drain_len;
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let guard = DropGuard(self);
|
||||
if guard.0.remaining != 0 {
|
||||
|
||||
if mem::needs_drop::<T>() && guard.0.remaining != 0 {
|
||||
unsafe {
|
||||
// SAFETY: We just checked that `self.remaining != 0`.
|
||||
let (front, back) = guard.0.as_slices();
|
||||
@ -172,6 +111,125 @@ fn drop(&mut self) {
|
||||
}
|
||||
|
||||
// Dropping `guard` handles moving the remaining elements into place.
|
||||
impl<'r, 'a, T, A: Allocator> Drop for DropGuard<'r, 'a, T, A> {
|
||||
#[inline]
|
||||
fn drop(&mut self) {
|
||||
if mem::needs_drop::<T>() && self.0.remaining != 0 {
|
||||
unsafe {
|
||||
// SAFETY: We just checked that `self.remaining != 0`.
|
||||
let (front, back) = self.0.as_slices();
|
||||
ptr::drop_in_place(front);
|
||||
ptr::drop_in_place(back);
|
||||
}
|
||||
}
|
||||
|
||||
let source_deque = unsafe { self.0.deque.as_mut() };
|
||||
|
||||
let drain_len = self.0.drain_len;
|
||||
let new_len = self.0.new_len;
|
||||
|
||||
if T::IS_ZST {
|
||||
// no need to copy around any memory if T is a ZST
|
||||
source_deque.len = new_len;
|
||||
return;
|
||||
}
|
||||
|
||||
let head_len = source_deque.len; // #elements in front of the drain
|
||||
let tail_len = new_len - head_len; // #elements behind the drain
|
||||
|
||||
// Next, we will fill the hole left by the drain with as few writes as possible.
|
||||
// The code below handles the following control flow and reduces the amount of
|
||||
// branches under the assumption that `head_len == 0 || tail_len == 0`, i.e.
|
||||
// draining at the front or at the back of the deque is especially common.
|
||||
//
|
||||
// H = "head index" = `deque.head`
|
||||
// h = elements in front of the drain
|
||||
// d = elements in the drain
|
||||
// t = elements behind the drain
|
||||
//
|
||||
// Note that the buffer may wrap at any point and the wrapping is handled by
|
||||
// `wrap_copy` and `to_physical_idx`.
|
||||
//
|
||||
// Case 1: if `head_len == 0 && tail_len == 0`
|
||||
// Everything was drained, reset the head index back to 0.
|
||||
// H
|
||||
// [ . . . . . d d d d . . . . . ]
|
||||
// H
|
||||
// [ . . . . . . . . . . . . . . ]
|
||||
//
|
||||
// Case 2: else if `tail_len == 0`
|
||||
// Don't move data or the head index.
|
||||
// H
|
||||
// [ . . . h h h h d d d d . . . ]
|
||||
// H
|
||||
// [ . . . h h h h . . . . . . . ]
|
||||
//
|
||||
// Case 3: else if `head_len == 0`
|
||||
// Don't move data, but move the head index.
|
||||
// H
|
||||
// [ . . . d d d d t t t t . . . ]
|
||||
// H
|
||||
// [ . . . . . . . t t t t . . . ]
|
||||
//
|
||||
// Case 4: else if `tail_len <= head_len`
|
||||
// Move data, but not the head index.
|
||||
// H
|
||||
// [ . . h h h h d d d d t t . . ]
|
||||
// H
|
||||
// [ . . h h h h t t . . . . . . ]
|
||||
//
|
||||
// Case 5: else
|
||||
// Move data and the head index.
|
||||
// H
|
||||
// [ . . h h d d d d t t t t . . ]
|
||||
// H
|
||||
// [ . . . . . . h h t t t t . . ]
|
||||
|
||||
// When draining at the front (`.drain(..n)`) or at the back (`.drain(n..)`),
|
||||
// we don't need to copy any data. The number of elements copied would be 0.
|
||||
if head_len != 0 && tail_len != 0 {
|
||||
join_head_and_tail_wrapping(source_deque, drain_len, head_len, tail_len);
|
||||
// Marking this function as cold helps LLVM to eliminate it entirely if
|
||||
// this branch is never taken.
|
||||
// We use `#[cold]` instead of `#[inline(never)]`, because inlining this
|
||||
// function into the general case (`.drain(n..m)`) is fine.
|
||||
// See `tests/codegen/vecdeque-drain.rs` for a test.
|
||||
#[cold]
|
||||
fn join_head_and_tail_wrapping<T, A: Allocator>(
|
||||
source_deque: &mut VecDeque<T, A>,
|
||||
drain_len: usize,
|
||||
head_len: usize,
|
||||
tail_len: usize,
|
||||
) {
|
||||
// Pick whether to move the head or the tail here.
|
||||
let (src, dst, len);
|
||||
if head_len < tail_len {
|
||||
src = source_deque.head;
|
||||
dst = source_deque.to_physical_idx(drain_len);
|
||||
len = head_len;
|
||||
} else {
|
||||
src = source_deque.to_physical_idx(head_len + drain_len);
|
||||
dst = source_deque.to_physical_idx(head_len);
|
||||
len = tail_len;
|
||||
};
|
||||
|
||||
unsafe {
|
||||
source_deque.wrap_copy(src, dst, len);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if new_len == 0 {
|
||||
// Special case: If the entire deque was drained, reset the head back to 0,
|
||||
// like `.clear()` does.
|
||||
source_deque.head = 0;
|
||||
} else if head_len < tail_len {
|
||||
// If we moved the head above, then we need to adjust the head index here.
|
||||
source_deque.head = source_deque.to_physical_idx(drain_len);
|
||||
}
|
||||
source_deque.len = new_len;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
69
tests/codegen/vecdeque-drain.rs
Normal file
69
tests/codegen/vecdeque-drain.rs
Normal file
@ -0,0 +1,69 @@
|
||||
// Check that draining at the front or back doesn't copy memory.
|
||||
|
||||
// compile-flags: -O
|
||||
// ignore-debug: the debug assertions get in the way
|
||||
|
||||
#![crate_type = "lib"]
|
||||
|
||||
use std::collections::VecDeque;
|
||||
|
||||
// CHECK-LABEL: @clear
|
||||
// CHECK-NOT: call
|
||||
// CHECK-NOT: br
|
||||
// CHECK: getelementptr inbounds
|
||||
// CHECK-NEXT: {{call void @llvm.memset|store}}
|
||||
// CHECK-NEXT: ret void
|
||||
#[no_mangle]
|
||||
pub fn clear(v: &mut VecDeque<i32>) {
|
||||
v.drain(..);
|
||||
}
|
||||
|
||||
// CHECK-LABEL: @truncate
|
||||
// CHECK-NOT: call
|
||||
// CHECK: br
|
||||
// CHECK-NOT: call
|
||||
// CHECK: br
|
||||
// CHECK-NOT: call
|
||||
// CHECK: br
|
||||
// CHECK-NOT: call
|
||||
// CHECK: br
|
||||
// CHECK-NOT: call
|
||||
// CHECK-NOT: br
|
||||
// CHECK: ret void
|
||||
#[no_mangle]
|
||||
pub fn truncate(v: &mut VecDeque<i32>, n: usize) {
|
||||
if n < v.len() {
|
||||
v.drain(n..);
|
||||
}
|
||||
}
|
||||
|
||||
// CHECK-LABEL: @advance
|
||||
// CHECK-NOT: call
|
||||
// CHECK: br
|
||||
// CHECK-NOT: call
|
||||
// CHECK: br
|
||||
// CHECK-NOT: call
|
||||
// CHECK: br
|
||||
// CHECK-NOT: call
|
||||
// CHECK: br
|
||||
// CHECK-NOT: call
|
||||
// CHECK: br
|
||||
// CHECK-NOT: call
|
||||
// CHECK-NOT: br
|
||||
// CHECK: ret void
|
||||
#[no_mangle]
|
||||
pub fn advance(v: &mut VecDeque<i32>, n: usize) {
|
||||
if n < v.len() {
|
||||
v.drain(..n);
|
||||
} else {
|
||||
v.clear();
|
||||
}
|
||||
}
|
||||
|
||||
// CHECK-LABEL: @remove
|
||||
// CHECK: call
|
||||
// CHECK: ret void
|
||||
#[no_mangle]
|
||||
pub fn remove(v: &mut VecDeque<i32>, a: usize, b: usize) {
|
||||
v.drain(a..b);
|
||||
}
|
Loading…
Reference in New Issue
Block a user