I was curious how expensive calling GetOverlappedResult is for an incomplete operation, and whether the call leaves user space.

BOOL GetOverlappedResult(
  HANDLE       hFile,                       // 1st argument: passed in rcx (Microsoft x64 ABI)
  LPOVERLAPPED lpOverlapped,                // 2nd argument: passed in rdx
  LPDWORD      lpNumberOfBytesTransferred,  // 3rd argument: passed in r8
  BOOL         bWait                        // 4th argument: passed in r9d
);

So let's find out by looking inside kernelbase.dll (Win10 x64).


;-----------------------------------------------------------------------
; BOOL GetOverlappedResult(HANDLE hFile, LPOVERLAPPED lpOverlapped,
;                          LPDWORD lpNumberOfBytesTransferred, BOOL bWait)
; Disassembly listing (Intel syntax), Microsoft x64 calling convention.
; In:   rcx = hFile, rdx = lpOverlapped,
;       r8  = lpNumberOfBytesTransferred, r9d = bWait
; Out:  eax = 1 (TRUE) or 0 (FALSE) on every path that returns here
; Register roles after the prologue:
;   rsi = hFile, rdi = lpOverlapped, r14 = lpNumberOfBytesTransferred
;   ebp = bWait ? 0FFFFFFFFh : 0   (later used as the wait timeout)
;   ebx = 0                        (becomes the return value)
; OVERLAPPED offsets used: +0 Internal, +8 InternalHigh, +18h hEvent.
; Constants: 103h = STATUS_PENDING, 3E4h = ERROR_IO_INCOMPLETE.
; Jump operands are written as GetOverlappedResult+offset; each target
; instruction is marked below with a "; +NNh:" comment.
;-----------------------------------------------------------------------
GetOverlappedResult:
 ; Prologue: rbx/rbp are saved in the caller-provided home space,
 ; rsi/rdi/r14 are pushed, then 20h bytes are reserved for callees.
 mov         qword ptr [rsp+8],rbx  
 mov         qword ptr [rsp+10h],rbp  
 push        rsi  
 push        rdi  
 push        r14  
 sub         rsp,20h  
 mov         eax,r9d                            ; eax = bWait
 mov         r14,r8                             ; r14 = lpNumberOfBytesTransferred
 neg         eax                                ; CF = (bWait != 0)
 mov         rdi,rdx                            ; rdi = lpOverlapped
 mov         rsi,rcx                            ; rsi = hFile
 sbb         ebp,ebp                            ; ebp = bWait ? 0FFFFFFFFh : 0
 xor         ebx,ebx                            ; ebx = 0 (FALSE until proven otherwise)
 test        r9d,r9d  
 jne         GetOverlappedResult+62h            ; bWait != 0 -> waiting path

 ; ---- bWait == FALSE: just look at the current status ----
 mov         eax,103h                           ; STATUS_PENDING
 cmp         dword ptr [rdx],eax                ; lpOverlapped->Internal still pending?
 je          GetOverlappedResult+0E1h           ; yes -> ERROR_IO_INCOMPLETE, return FALSE
 lock or     dword ptr [rsp],ebx                ; OR with 0: value unchanged, the locked op
                                                ; serves as a memory barrier (presumably so
                                                ; InternalHigh is read after Internal)

 ; +3Ah: operation has completed - report the result
 mov         eax,dword ptr [rdi+8]              ; eax = (DWORD)lpOverlapped->InternalHigh
 mov         dword ptr [r14],eax                ; *lpNumberOfBytesTransferred = eax
 cmp         dword ptr [rdi],ebx                ; Internal < 0 (signed) means failure status
 jl          GetOverlappedResult+0F1h           ; -> translate the status into a last error
 mov         ebx,1                              ; success: return TRUE

 ; +4Dh: common epilogue, return value is in ebx
 mov         rbp,qword ptr [rsp+48h]            ; restore rbp (saved at entry rsp+10h)
 mov         eax,ebx  
 mov         rbx,qword ptr [rsp+40h]            ; restore rbx (saved at entry rsp+8)
 add         rsp,20h  
 pop         r14  
 pop         rdi  
 pop         rsi  
 ret  

 ; +62h: ---- bWait != FALSE ----
 ; Look up an optional procedure via SbSelectProcedure (presumably the
 ; SwitchBack compatibility mechanism - not verifiable from this listing).
 ; A flag at [rsp+58h] is preset to 1 and its address is passed in rcx
 ; to the selected procedure, which may change it.
 xor         r9d,r9d  
 mov         dword ptr [rsp+58h],1              ; flag = 1 (lives in the caller's home space)
 lea         r8,[g_SbModuleTable_KernelBase]  
 mov         ecx,0ABABABABh  
 lea         edx,[r9+1]                         ; edx = 1
 call        qword ptr [__imp_SbSelectProcedure]  
 test        rax,rax  
 je          GetOverlappedResult+9Fh            ; nothing selected -> keep flag == 1
 lea         rcx,[GetOverlappedResult_Win7]  
 cmp         rax,rcx  
 lea         rcx,[rsp+58h]                      ; rcx = &flag (lea leaves the flags intact)
 je          GetOverlappedResult+0FDh           ; known target -> direct call
 call        qword ptr [__guard_dispatch_icall_fptr]  ; otherwise indirect call to rax

 ; +9Fh: decide whether a wait is needed
 and         rsi,0FFFFFFFFFFFFFFFEh             ; clear bit 0 of hFile
 mov         eax,103h                           ; rax = STATUS_PENDING (zero-extended)
 cmp         dword ptr [rsp+58h],ebx            ; flag != 0 ?
 jne         GetOverlappedResult+0B3h           ; yes -> always wait
 cmp         qword ptr [rdi],rax                ; flag == 0: already completed?
 jne         GetOverlappedResult+3Ah            ; yes -> report the result without waiting

 ; +0B3h: wait on hEvent (or on hFile when hEvent is NULL)
 cmp         qword ptr [rdi],rax                ; Internal != STATUS_PENDING ?
 mov         ecx,64h                            ; 100
 cmovne      ebp,ecx                            ; timeout = 100 ms if no longer pending,
                                                ; otherwise ebp stays 0FFFFFFFFh
 cmp         qword ptr [rdi+18h],rbx            ; lpOverlapped->hEvent != NULL ?
 mov         edx,ebp                            ; arg2: timeout in milliseconds
 cmovne      rsi,qword ptr [rdi+18h]            ; wait handle = hEvent when it is set
 xor         r8d,r8d                            ; arg3: bAlertable = FALSE
 mov         rcx,rsi                            ; arg1: handle to wait on
 call        WaitForSingleObjectEx
 test        eax,eax  
 je          GetOverlappedResult+3Ah            ; returned 0 -> report the result
 jmp         _guard_dispatch_icall_nop+10260h   ; non-zero wait result: handled out of line;
                                                ; the target is outside this listing

 ; +0E1h: bWait == FALSE and still pending
 mov         ecx,3E4h                           ; ERROR_IO_INCOMPLETE
 call        qword ptr [__imp_RtlSetLastWin32Error]  
 jmp         GetOverlappedResult+4Dh            ; return FALSE (ebx is still 0)

 ; +0F1h: completed with a failure status
 mov         ecx,dword ptr [rdi]                ; ecx = (NTSTATUS)lpOverlapped->Internal
 call        BaseSetLastNTError 
 jmp         GetOverlappedResult+4Dh            ; return FALSE (ebx is still 0)

 ; +0FDh: direct call used when SbSelectProcedure returned GetOverlappedResult_Win7
 call        GetOverlappedResult_Win7
 jmp         GetOverlappedResult+9Fh  

The code executed for an incomplete operation (with bWait == FALSE) is the following:


; Instructions executed when bWait == FALSE and the operation is still
; pending. This is an excerpt of the full listing; the lines of the form
; "GetOverlappedResult+NNh:" mark the jump targets that are reached.
GetOverlappedResult:
 ; Prologue: save the non-volatile registers this function uses.
 mov         qword ptr [rsp+8],rbx  
 mov         qword ptr [rsp+10h],rbp  
 push        rsi  
 push        rdi  
 push        r14  
 sub         rsp,20h  
 mov         eax,r9d                            ; eax = bWait
 mov         r14,r8                             ; r14 = lpNumberOfBytesTransferred
 neg         eax                                ; CF = (bWait != 0)
 mov         rdi,rdx                            ; rdi = lpOverlapped
 mov         rsi,rcx                            ; rsi = hFile
 sbb         ebp,ebp                            ; ebp = bWait ? 0FFFFFFFFh : 0
 xor         ebx,ebx                            ; ebx = 0, the eventual return value
 test        r9d,r9d  
 jne         GetOverlappedResult+62h            ; not taken here: bWait == 0
 mov         eax,103h                           ; STATUS_PENDING
 cmp         dword ptr [rdx],eax                ; lpOverlapped->Internal == STATUS_PENDING ?
 je          GetOverlappedResult+0E1h           ; taken here: the operation is still pending
 
 GetOverlappedResult+0E1h:
 mov         ecx,3E4h                           ; ERROR_IO_INCOMPLETE
 call        qword ptr [__imp_RtlSetLastWin32Error]  
 jmp         GetOverlappedResult+4Dh

 GetOverlappedResult+4Dh:
 ; Epilogue: restore the saved registers and return ebx.
 mov         rbp,qword ptr [rsp+48h]  
 mov         eax,ebx                            ; return FALSE (ebx was zeroed above)
 mov         rbx,qword ptr [rsp+40h]  
 add         rsp,20h  
 pop         r14  
 pop         rdi  
 pop         rsi  
 ret  

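So all GetOverlappedResult does for an incomplete operation is check the value of OVERLAPPED::Internal against STATUS_PENDING and, if it is still pending, set the last error to ERROR_IO_INCOMPLETE and return FALSE. The only call on this path is RtlSetLastWin32Error, which just stores the error code in the thread's TEB, so we never leave user space.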

You could save a few cycles by checking OVERLAPPED::Internal yourself (this is what the HasOverlappedIoCompleted macro does) and calling GetOverlappedResult only once the value is no longer STATUS_PENDING, but the call is already pretty lightweight.

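For a completed operation it performs a single interlocked OR (of zero, i.e. effectively a memory barrier), copies the value of OVERLAPPED::InternalHigh to *lpNumberOfBytesTransferred, and returns TRUE, unless OVERLAPPED::Internal holds a failure status, in which case it sets the last error from that status and returns FALSE.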

I don't really care how long it takes to return when bWait is TRUE, since the wait time is essentially unbounded.