2 minutes
How slow is GetOverlappedResult?
I was curious how expensive a call to GetOverlappedResult is for an incomplete operation, and whether the call leaves user space.
// Prototype of the Win32 API whose implementation is disassembled below.
BOOL GetOverlappedResult(
HANDLE hFile, // arrives in rcx; waited on when bWait is set (see the listing for the exact choice of handle)
LPOVERLAPPED lpOverlapped, // arrives in rdx; its Internal and InternalHigh fields are read
LPDWORD lpNumberOfBytesTransferred, // arrives in r8; receives OVERLAPPED::InternalHigh once the operation is complete
BOOL bWait // arrives in r9d; zero = poll and return immediately, non-zero = wait
);
So let's find out and look inside kernelbase.dll (Win10 x64):
; Disassembly of kernelbase!GetOverlappedResult (Win10 x64), Intel syntax.
; On entry (Microsoft x64 convention, matching the prototype above):
;   rcx = hFile, rdx = lpOverlapped, r8 = lpNumberOfBytesTransferred, r9d = bWait
; Register roles after the prologue:
;   rsi = hFile, rdi = lpOverlapped, r14 = lpNumberOfBytesTransferred,
;   ebp = wait timeout (0 when bWait == 0, 0FFFFFFFFh otherwise),
;   ebx = value that will be returned (starts at 0 = FALSE)
; The "+NNh" markers in the comments are byte offsets from the function start,
; worked out from instruction lengths; they line up with every branch target
; used in the listing.
GetOverlappedResult:
; Prologue: rbx/rbp go into the caller-provided home slots, rsi/rdi/r14 are pushed,
; then 20h bytes are reserved for callees.
mov qword ptr [rsp+8],rbx
mov qword ptr [rsp+10h],rbp
push rsi
push rdi
push r14
sub rsp,20h
mov eax,r9d
mov r14,r8
; neg sets CF exactly when bWait != 0 ...
neg eax
mov rdi,rdx
mov rsi,rcx
; ... so sbb yields ebp = -1 (bWait != 0) or 0 (bWait == 0).
sbb ebp,ebp
; ebx = 0: doubles as the default return value and as a zero constant below.
xor ebx,ebx
test r9d,r9d
; bWait != 0 -> waiting path at +62h.
jne GetOverlappedResult+62h
; Not waiting: compare the first dword of the OVERLAPPED (OVERLAPPED::Internal)
; with 103h (STATUS_PENDING).
mov eax,103h
cmp dword ptr [rdx],eax
; Still pending -> +0E1h (set last error 3E4h and return FALSE).
je GetOverlappedResult+0E1h
; Interlocked OR of zero into the top of the stack: no data is changed.
; NOTE(review): presumably serves as a memory barrier before the result fields are read.
lock or dword ptr [rsp],ebx
; +3Ah: completed-operation path (also reached from the waiting path).
; Copy the dword at lpOverlapped+8 (OVERLAPPED::InternalHigh) to *lpNumberOfBytesTransferred.
mov eax,dword ptr [rdi+8]
mov dword ptr [r14],eax
; Signed compare of Internal with 0: a negative status -> failure path at +0F1h.
cmp dword ptr [rdi],ebx
jl GetOverlappedResult+0F1h
; Success: return value = 1 (TRUE).
mov ebx,1
; +4Dh: common epilogue; restores rbp/rbx from the home slots and returns ebx in eax.
mov rbp,qword ptr [rsp+48h]
mov eax,ebx
mov rbx,qword ptr [rsp+40h]
add rsp,20h
pop r14
pop rdi
pop rsi
ret
; +62h: bWait != 0.
; A flag at [rsp+58h] (the caller-provided home slot for the 4th argument) is set to 1,
; then SbSelectProcedure is called with rcx = 0ABABABABh, edx = 1,
; r8 = &g_SbModuleTable_KernelBase, r9 = 0.
xor r9d,r9d
mov dword ptr [rsp+58h],1
lea r8,[g_SbModuleTable_KernelBase]
mov ecx,0ABABABABh
lea edx,[r9+1]
call qword ptr [__imp_SbSelectProcedure]
; A null result means there is nothing to call -> continue at +9Fh.
test rax,rax
je GetOverlappedResult+9Fh
; Otherwise the returned procedure is invoked with rcx = address of the flag:
; directly (at +0FDh) when it is GetOverlappedResult_Win7, else through the
; guarded indirect-call dispatcher.
lea rcx,[GetOverlappedResult_Win7]
cmp rax,rcx
lea rcx,[rsp+58h]
je GetOverlappedResult+0FDh
; NOTE(review): the stray ')' in the operand below looks like a transcription typo in the listing.
call qword ptr [__guard_dispatch_icall_fptr)]
; +9Fh: clear bit 0 of the saved hFile value.
and rsi,0FFFFFFFFFFFFFFFEh
mov eax,103h
; Flag still non-zero -> +0B3h (always wait).
cmp dword ptr [rsp+58h],ebx
jne GetOverlappedResult+0B3h
; Flag was cleared: if Internal is no longer 103h, skip the wait and collect the result.
cmp qword ptr [rdi],rax
jne GetOverlappedResult+3Ah
; +0B3h: choose the timeout. If Internal != 103h the timeout becomes 64h (100);
; otherwise ebp keeps the value computed from bWait.
cmp qword ptr [rdi],rax
mov ecx,64h
cmovne ebp,ecx
; Choose the handle: the qword at lpOverlapped+18h when it is non-zero,
; otherwise the masked hFile. NOTE(review): offset 18h is presumably
; OVERLAPPED::hEvent on x64 - confirm against the structure definition.
cmp qword ptr [rdi+18h],rbx
mov edx,ebp
cmovne rsi,qword ptr [rdi+18h]
; WaitForSingleObjectEx(rcx = handle, edx = timeout, r8d = 0).
xor r8d,r8d
mov rcx,rsi
call WaitForSingleObjectEx
; A zero return -> collect the result at +3Ah.
test eax,eax
je GetOverlappedResult+3Ah
; Non-zero wait result: control leaves this listing. NOTE(review): the target is
; shown relative to an unrelated symbol, presumably an out-of-line block of this function.
jmp _guard_dispatch_icall_nop+10260h
; +0E1h: operation still pending and bWait == 0.
; Set last error to 3E4h (ERROR_IO_INCOMPLETE) and return with ebx still 0.
mov ecx,3E4h
call qword ptr [__imp_RtlSetLastWin32Error]
jmp GetOverlappedResult+4Dh
; +0F1h: completed with a negative status; hand Internal to BaseSetLastNTError
; and return with ebx still 0.
mov ecx,dword ptr [rdi]
call BaseSetLastNTError
jmp GetOverlappedResult+4Dh
; +0FDh: direct call of GetOverlappedResult_Win7 (rcx = address of the flag), then rejoin at +9Fh.
call GetOverlappedResult_Win7
jmp GetOverlappedResult+9Fh
The code executed for an incomplete operation is the following:
; Instructions executed when bWait == 0 and the operation is still pending.
; On entry: rcx = hFile, rdx = lpOverlapped, r8 = lpNumberOfBytesTransferred, r9d = bWait.
GetOverlappedResult:
; Prologue: save rbx/rbp in the home slots, push rsi/rdi/r14, reserve 20h bytes.
mov qword ptr [rsp+8],rbx
mov qword ptr [rsp+10h],rbp
push rsi
push rdi
push r14
sub rsp,20h
mov eax,r9d
mov r14,r8
; neg/sbb turn bWait into ebp = 0 or -1; the value is not used on this path.
neg eax
mov rdi,rdx
mov rsi,rcx
sbb ebp,ebp
; ebx = 0 is the value that ends up being returned (FALSE).
xor ebx,ebx
test r9d,r9d
; Not taken here because bWait == 0.
jne GetOverlappedResult+62h
; Compare the first dword of the OVERLAPPED (OVERLAPPED::Internal) with 103h (STATUS_PENDING).
mov eax,103h
cmp dword ptr [rdx],eax
; Taken here: the operation is still pending.
je GetOverlappedResult+0E1h
GetOverlappedResult+0E1h:
; Record last error 3E4h (ERROR_IO_INCOMPLETE).
mov ecx,3E4h
call qword ptr [__imp_RtlSetLastWin32Error]
jmp GetOverlappedResult+4Dh
GetOverlappedResult+4Dh:
; Epilogue: restore rbp/rbx from the home slots, return ebx (0) in eax.
mov rbp,qword ptr [rsp+48h]
mov eax,ebx
mov rbx,qword ptr [rsp+40h]
add rsp,20h
pop r14
pop rdi
pop rsi
ret
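So all GetOverlappedResult does for an incomplete operation is check the value of OVERLAPPED::Internal against STATUS_PENDING and set the last error to ERROR_IO_INCOMPLETE if the operation is still pending. The only call on this path is RtlSetLastWin32Error, so the function does not leave user space here.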
So you could save a few cycles by checking OVERLAPPED::Internal yourself and only calling GetOverlappedResult once it is no longer STATUS_PENDING, but the call is already pretty lightweight.
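For a complete operation it performs a single interlocked OR and copies the value of OVERLAPPED::InternalHigh to *lpNumberOfBytesTransferred. It then returns TRUE, unless OVERLAPPED::Internal holds a negative (failure) status, in which case it passes that status to BaseSetLastNTError and returns FALSE.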
I don’t really care how long it takes to return when bWait is TRUE, since the return time is then essentially unbounded.