Академический Документы
Профессиональный Документы
Культура Документы
executables
(April 17, 2017)
How small can a valid and useful Win32 executable be? There
already are a few tutorials about this topic, but these are
either not working on modern Windows versions any longer or
only cover the most basic "do nothing but return zero"
program. The goal here should be to do something genuinely
useful: a console application that outputs the contents of
the Windows clipboard on standard output.
#include <windows.h>
#include <stdio.h>
#include <string.h>
int main(void) {
    /*
     * Print the clipboard's CF_TEXT contents to stdout.
     * Returns 0 on success; terminates the process with exit code 1 if the
     * clipboard cannot be opened, holds no text, or the text cannot be locked.
     */
    HANDLE clip;
    const char *text;

    if (OpenClipboard(NULL) == 0)
        ExitProcess(1);

    /* Both remaining failure modes share one cleanup path: a missing
       handle and a failed lock each leave `text` as NULL. */
    clip = GetClipboardData(CF_TEXT);
    text = clip ? (const char *)GlobalLock(clip) : NULL;
    if (text == NULL) {
        CloseClipboard();
        ExitProcess(1);
    }

    fwrite(text, sizeof(char), strlen(text), stdout);

    GlobalUnlock(clip);
    CloseClipboard();
    return 0;
}
#include <windows.h>
#include <string.h>
int mainCRTStartup(void) {
    /*
     * Entry point for the build without the C runtime: writes the clipboard's
     * CF_TEXT contents to the standard output handle via WriteFile.
     * Never returns normally; always leaves through ExitProcess
     * (0 on success, 1 on any clipboard failure).
     */
    HANDLE clip = NULL;
    const char *text = NULL;
    DWORD written;

    if (OpenClipboard(NULL)) {
        clip = GetClipboardData(CF_TEXT);
        if (clip)
            text = (const char *)GlobalLock(clip);
        if (!text)
            CloseClipboard();   /* opened, but no usable text */
    }
    if (!text)
        ExitProcess(1);

    WriteFile(GetStdHandle(STD_OUTPUT_HANDLE),
              text, strlen(text), &written, NULL);

    GlobalUnlock(clip);
    CloseClipboard();
    ExitProcess(0);
}
global _mainCRTStartup
extern _ExitProcess@4
extern _OpenClipboard@4
extern _CloseClipboard@0
extern _GetClipboardData@4
extern _GlobalLock@4
extern _GlobalUnlock@4
extern _GetStdHandle@4
extern _WriteFile@20
section .text
_mainCRTStartup:
; Process entry point (32-bit, NASM syntax, Win32 stdcall imports).
; Assembly counterpart of the C mainCRTStartup: writes the clipboard's
; CF_TEXT contents to the standard output handle.
; Never returns; always leaves through ExitProcess (0 = ok, 1 = failure).
; Stack: one DWORD local at [ebp-4] receives WriteFile's byte count.

        ; set up a frame so that [ebp-4] is a valid local slot
        push    ebp
        mov     ebp, esp
        sub     esp, 4

        ; if (!OpenClipboard(NULL)) ExitProcess(1);
        push    0
        call    _OpenClipboard@4
        or      eax, eax
        jz      error2

        ; hData = GetClipboardData(CF_TEXT); if (!hData) goto error;
        push    1                       ; CF_TEXT
        call    _GetClipboardData@4
        or      eax, eax
        jz      error

        ; str = GlobalLock(hData); if (!str) goto error;
        push    eax                     ; keep hData on the stack for GlobalUnlock
        push    eax                     ; argument for GlobalLock
        call    _GlobalLock@4
        or      eax, eax
        jz      error

        ; strlen(str): eax = str, ecx walks to the terminating zero
        mov     ecx, eax
strlen_loop:
        mov     dl, [ecx]
        or      dl, dl
        jz      strlen_end
        inc     ecx
        jmp     strlen_loop
strlen_end:
        sub     ecx, eax                ; ecx = length in bytes

        ; WriteFile(GetStdHandle(STD_OUTPUT_HANDLE), ...)
        ; The last four arguments are pushed before calling GetStdHandle, so
        ; it does not matter that the call may clobber ecx/edx.
        push    0                       ; lpOverlapped = NULL
        lea     edx, [ebp-4]            ; put nBytesWritten on the stack
        push    edx
        push    ecx                     ; nNumberOfBytesToWrite = strlen(str)
        push    eax                     ; lpBuffer = str
        push    -11                     ; hFile = ...
        call    _GetStdHandle@4         ; ... GetStdHandle(STD_OUTPUT_HANDLE)
        push    eax
        call    _WriteFile@20

        ; GlobalUnlock(hData); CloseClipboard(); ExitProcess(0);
        call    _GlobalUnlock@4         ; hData is already on the stack
        call    _CloseClipboard@0
        push    0
        call    _ExitProcess@4

error:                                  ; clipboard is open: close it, then fail
        call    _CloseClipboard@0
error2:                                 ; clipboard never opened: just fail
        push    1
        call    _ExitProcess@4
bits 32
; Hand-built PE image, part 1: DOS header, PE/COFF file header, optional
; header and the two section headers (.text and .idata).
; NOTE(review): RVA(), ALIGNED(), S_TEXT_SIZE, S_IDATA_SIZE and the
; section..text / section..idata start/vstart symbols are used here but not
; defined in this excerpt -- presumably they come from macro and section
; declarations that are not shown.
BASE equ 0x00400000
ALIGNMENT equ 512
SECTALIGN equ 4096
mz_hdr:
dw "MZ" ; DOS magic
times 0x3a db 0 ; [UNUSED] DOS header
dd RVA(pe_hdr) ; address of PE header
pe_hdr:
dw "PE",0 ; PE magic + 2 padding bytes
dw 0x014c ; i386 architecture
dw 2 ; two sections
dd 0 ; [UNUSED] timestamp
dd 0 ; [UNUSED] symbol table pointer
dd 0 ; [UNUSED] symbol count
dw OPT_HDR_SIZE ; optional header size
dw 0x0102 ; characteristics: 32-bit, executable
opt_hdr:
dw 0x010b ; optional header magic
db 13,37 ; [UNUSED] linker version
dd ALIGNED(S_TEXT_SIZE) ; [UNUSED] code size
dd ALIGNED(S_IDATA_SIZE) ; [UNUSED] size of initialized data
dd 0 ; [UNUSED] size of uninitialized data
dd RVA(section..text.vstart) ; entry point address
dd RVA(section..text.vstart) ; [UNUSED] base of code
dd RVA(section..idata.vstart) ; [UNUSED] base of data
dd BASE ; image base
dd SECTALIGN ; section alignment
dd ALIGNMENT ; file alignment
dw 4,0 ; [UNUSED] OS version
dw 0,0 ; [UNUSED] image version
dw 4,0 ; subsystem version
dd 0 ; [UNUSED] Win32 version
dd RVA(the_end) ; size of image
dd ALIGNED(ALL_HDR_SIZE) ; size of headers
dd 0 ; [UNUSED] checksum
dw 3 ; subsystem = console
dw 0 ; [UNUSED] DLL characteristics
dd 0x00100000 ; [UNUSED] maximum stack size
dd 0x00001000 ; initial stack size
dd 0x00100000 ; maximum heap size
dd 0x00001000 ; [UNUSED] initial heap size
dd 0 ; [UNUSED] loader flags
dd 16 ; number of data directory entries
dd 0,0 ; no export table
dd RVA(import_table) ; import table address
dd IMPORT_TABLE_SIZE ; import table size
times 14 dd 0,0 ; no other entries in the data directories
OPT_HDR_SIZE equ $ - opt_hdr
; Section table: one 40-byte entry per section, directly after the
; optional header.
sect_hdr_text:
db ".text",0,0,0 ; section name
dd ALIGNED(S_TEXT_SIZE) ; virtual size
dd RVA(section..text.vstart) ; virtual address
dd ALIGNED(S_TEXT_SIZE) ; file size
dd section..text.start ; file position
dd 0,0 ; no relocations or debug info
dw 0,0 ; no relocations or debug info
dd 0x60000020 ; flags: code, readable, executable
sect_hdr_idata:
db ".idata",0,0 ; section name
dd ALIGNED(S_IDATA_SIZE) ; virtual size
dd RVA(section..idata.vstart) ; virtual address
dd ALIGNED(S_IDATA_SIZE) ; file size
dd section..idata.start ; file position
dd 0,0 ; no relocations or debug info
dw 0,0 ; no relocations or debug info
dd 0xC0000040 ; flags: data, readable, writeable
ALL_HDR_SIZE equ $ - $$
; Program code of the hand-built PE (the header's entry point field refers to
; the start of the .text section). All API calls go indirectly through the
; Import Address Table slots, hence "call [Name]".
; Never returns; always leaves through ExitProcess (0 = ok, 1 = failure).
; NOTE(review): the section declaration that should precede this code is not
; part of this excerpt.

        ; set up a frame so that [ebp-4] is a valid local slot
        push    ebp
        mov     ebp, esp
        sub     esp, 4

        ; if (!OpenClipboard(NULL)) ExitProcess(1);
        push    0
        call    [OpenClipboard]
        or      eax, eax
        jz      error2

        ; hData = GetClipboardData(CF_TEXT); if (!hData) goto error;
        push    1                       ; CF_TEXT
        call    [GetClipboardData]
        or      eax, eax
        jz      error

        ; str = GlobalLock(hData); if (!str) goto error;
        push    eax                     ; keep hData on the stack for GlobalUnlock
        push    eax                     ; argument for GlobalLock
        call    [GlobalLock]
        or      eax, eax
        jz      error

        ; strlen(str): eax = str, ecx walks to the terminating zero
        mov     ecx, eax
strlen_loop:
        mov     dl, [ecx]
        or      dl, dl
        jz      strlen_end
        inc     ecx
        jmp     strlen_loop
strlen_end:
        sub     ecx, eax                ; ecx = length in bytes

        ; WriteFile(GetStdHandle(STD_OUTPUT_HANDLE), ...)
        push    0                       ; lpOverlapped = NULL
        lea     edx, [ebp-4]            ; put nBytesWritten on the stack
        push    edx
        push    ecx                     ; nNumberOfBytesToWrite = strlen(str)
        push    eax                     ; lpBuffer = str
        push    -11                     ; hFile = ...
        call    [GetStdHandle]          ; ... GetStdHandle(STD_OUTPUT_HANDLE)
        push    eax
        call    [WriteFile]

        ; GlobalUnlock(hData); CloseClipboard(); ExitProcess(0);
        call    [GlobalUnlock]          ; hData is already on the stack
        call    [CloseClipboard]
        push    0
        call    [ExitProcess]

error:                                  ; clipboard is open: close it, then fail
        call    [CloseClipboard]
error2:                                 ; clipboard never opened: just fail
        push    1
        call    [ExitProcess]
; Import data of the hand-built PE. The layout below is read by the Windows
; loader, so the order and size of every item matters.
; Each import descriptor is five dwords; a descriptor of five zero dwords
; terminates the list.
import_table:
; import of kernel32.dll
dd 0 ; [UNUSED] read-only IAT
dd 0 ; [UNUSED] timestamp
dd 0 ; [UNUSED] forwarder chain
dd RVA(N_kernel32) ; library name
dd RVA(IAT_kernel32) ; IAT pointer
; import of user32.dll
dd 0 ; [UNUSED] read-only IAT
dd 0 ; [UNUSED] timestamp
dd 0 ; [UNUSED] forwarder chain
dd RVA(N_user32) ; library name
dd RVA(IAT_user32) ; IAT pointer
; terminator (empty item)
times 5 dd 0
IMPORT_TABLE_SIZE: equ $ - import_table
; Import Address Tables: one dword slot per imported function, initially
; holding the RVA of its H_* name entry, each list closed by a zero dword.
; The slot labels are what the program code uses as "call [Name]" targets.
IAT_kernel32:
ExitProcess: dd RVA(H_ExitProcess)
GlobalLock: dd RVA(H_GlobalLock)
GlobalUnlock: dd RVA(H_GlobalUnlock)
GetStdHandle: dd RVA(H_GetStdHandle)
WriteFile: dd RVA(H_WriteFile)
dd 0
IAT_user32:
OpenClipboard: dd RVA(H_OpenClipboard)
CloseClipboard: dd RVA(H_CloseClipboard)
GetClipboardData: dd RVA(H_GetClipboardData)
dd 0
; DLL names (zero-terminated, 4-byte aligned)
align 4, db 0
N_kernel32: db "kernel32.dll",0
align 4, db 0
N_user32: db "user32.dll",0
; Function name entries (2-byte aligned): two zero bytes followed by the
; zero-terminated function name. In the PE import format the leading word
; is the ordinal hint; it is left at 0 here.
align 2, db 0
H_OpenClipboard: db 0,0,"OpenClipboard",0
align 2, db 0
H_GetClipboardData: db 0,0,"GetClipboardData",0
align 2, db 0
H_GlobalLock: db 0,0,"GlobalLock",0
align 2, db 0
H_GetStdHandle: db 0,0,"GetStdHandle",0
align 2, db 0
H_WriteFile: db 0,0,"WriteFile",0
align 2, db 0
H_GlobalUnlock: db 0,0,"GlobalUnlock",0
align 2, db 0
H_CloseClipboard: db 0,0,"CloseClipboard",0
align 2, db 0
H_ExitProcess: db 0,0,"ExitProcess",0
; pad the file up to the file alignment; the_end marks the end of the image
align ALIGNMENT, db 0
the_end:
Merging sections
Even with Windows being so uncooperative, we still got one
trick up our sleeves: We can just put both the code and the
import tables into a combined section. That's not common to
do (code/data separation exists for a reason), but on our
quest to make the file smaller, we take what we can.
@@ -34,3 +42,3 @@
dw 0x014c ; i386 architecture
- dw 2 ; two sections
+ dw 1 ; one section
dd 0 ; [UNUSED] timestamp
@@ -44,8 +52,8 @@
db 13,37 ; [UNUSED] linker version
- dd ALIGNED(S_TEXT_SIZE) ; [UNUSED] code size
- dd ALIGNED(S_IDATA_SIZE) ; [UNUSED] size of initialized data
+ dd ALIGNED(S_SECT_SIZE) ; [UNUSED] code size
+ dd ALIGNED(S_SECT_SIZE) ; [UNUSED] size of initialized data
dd 0 ; [UNUSED] size of uninitialized
data
- dd RVA(section..text.vstart) ; entry point address
- dd RVA(section..text.vstart) ; [UNUSED] base of code
- dd RVA(section..idata.vstart) ; [UNUSED] base of data
+ dd RVA(section.getclip.vstart); entry point address
+ dd RVA(section.getclip.vstart); [UNUSED] base of code
+ dd RVA(section.getclip.vstart); [UNUSED] base of data
dd BASE ; image base
@@ -74,20 +82,11 @@
-sect_hdr_text:
- db ".text",0,0,0 ; section name
- dd ALIGNED(S_TEXT_SIZE) ; virtual size
- dd RVA(section..text.vstart) ; virtual address
- dd ALIGNED(S_TEXT_SIZE) ; file size
- dd section..text.start ; file position
+sect_hdr:
+ db "getclip",0 ; section name
+ dd ALIGNED(S_SECT_SIZE) ; virtual size
+ dd RVA(section.getclip.vstart); virtual address
+ dd ALIGNED(S_SECT_SIZE) ; file size
+ dd section.getclip.start ; file position
dd 0,0 ; no relocations or debug info
dw 0,0 ; no relocations or debug info
- dd 0x60000020 ; flags: code, readable, executable
+ dd 0xE0000060 ; flags: code + data, readable,
writeable, executable
-sect_hdr_idata:
- db ".idata",0,0 ; section name
- dd ALIGNED(S_IDATA_SIZE) ; virtual size
- dd RVA(section..idata.vstart) ; virtual address
- dd ALIGNED(S_IDATA_SIZE) ; file size
- dd section..idata.start ; file position
- dd 0,0 ; no relocations or debug info
- dw 0,0 ; no relocations or debug info
- dd 0xC0000040 ; flags: data, readable, writeable
@@ -97,4 +96,4 @@
-section .text progbits follows=header align=ALIGNMENT
vstart=BASE+SECTALIGN*1
-s_text:
+section getclip progbits follows=header align=ALIGNMENT
vstart=BASE+SECTALIGN*1
+the_section:
@@ -157,9 +156,5 @@
-S_TEXT_SIZE equ $ - s_text
-
;;;;;;;;;;;;;;;;;;;; .idata ;;;;;;;;;;;;;;;;;
Going sectionless
As this whole section business works against us, can we
possibly live without it? Windows will load at least the
header part of the executable into memory anyway, and if we
sneak the actual code and import table data into there, we
should be fine. In fact, this used to work in the past, but
at least Windows 10 version 1703 (and very likely already
versions before that) simply ignores import tables that are
not contained in a section. As a result, the pointers to
the function names in the Import Address Table are not
replaced by the functions' entry point addresses: the
program will load just fine, but it will crash shortly
thereafter when it tries to call the first API function.
bits 32
; Sectionless variant: the image consists of headers only (no section table,
; all 16 data directory entries empty); code and data follow directly after
; the headers. File and section alignment are both reduced to 4.
; NOTE(review): RVA() and ALIGNED() are used but not defined in this excerpt.
BASE equ 0x00400000
ALIGNMENT equ 4
SECTALIGN equ 4
org BASE
mz_hdr:
dw "MZ" ; DOS magic
times 0x3a db 0 ; [UNUSED] DOS header
dd RVA(pe_hdr) ; address of PE header
pe_hdr:
dw "PE",0 ; PE magic + 2 padding bytes
dw 0x014c ; i386 architecture
dw 0 ; no sections
dd 0 ; [UNUSED] timestamp
dd 0 ; [UNUSED] symbol table pointer
dd 0 ; [UNUSED] symbol count
dw 8 ; optional header size
dw 0x0102 ; characteristics: 32-bit, executable
opt_hdr:
dw 0x010b ; optional header magic
db 13,37 ; [UNUSED] linker version
dd RVA(the_end) ; [UNUSED] code size
dd RVA(the_end) ; [UNUSED] size of initialized data
dd 0 ; [UNUSED] size of uninitialized data
dd RVA(main) ; entry point address
dd RVA(main) ; [UNUSED] base of code
dd RVA(main) ; [UNUSED] base of data
dd BASE ; image base
dd SECTALIGN ; section alignment
dd ALIGNMENT ; file alignment
dw 4,0 ; [UNUSED] OS version
dw 0,0 ; [UNUSED] image version
dw 4,0 ; subsystem version
dd 0 ; [UNUSED] Win32 version
dd RVA(the_end) ; size of image
dd ALIGNED(ALL_HDR_SIZE) ; size of headers
dd 0 ; [UNUSED] checksum
dw 3 ; subsystem = console
dw 0 ; [UNUSED] DLL characteristics
dd 0x00100000 ; [UNUSED] maximum stack size
dd 0x00001000 ; initial stack size
dd 0x00100000 ; maximum heap size
dd 0x00001000 ; [UNUSED] initial heap size
dd 0 ; [UNUSED] loader flags
dd 16 ; number of data directory entries
times 16 dd 0,0 ; no entries in the data directories
OPT_HDR_SIZE equ $ - opt_hdr
ALL_HDR_SIZE equ $ - $$
; Entry point of the sectionless variant. Instead of using an import table,
; it walks the loader's module list (reached through fs:0x30) to find the
; DLL that exports LoadLibraryA, and resolves every API function by name at
; run time through the helpers find_import / call_import.
; Register use visible here: ebx = DLL base address, esi = pointer to the
; function name, eax = result of the helper.
; NOTE(review): find_import, call_import and kernel32_found are referenced
; but not defined in this excerpt; their exact contracts must be confirmed
; against the full listing.
main:
; set up stack frame for local variables
push ebp
; the three locals are addressed relative to ebp
%define DummyVar ebp-4
%define kernel32base ebp-8
%define user32base ebp-12
sub esp, 12
; NOTE(review): ebp is pushed but never loaded from esp in the visible code,
; so the locals above are relative to whatever ebp held at process entry --
; confirm this is intended.
; locate the loader data tables where the loaded DLLs are managed
mov eax, [fs:0x30] ; get PEB pointer from TEB
mov eax, [eax+0x0C] ; get PEB_LDR_DATA pointer from PEB
mov eax, [eax+0x14] ; go to first LDR_DATA_TABLE_ENTRY
mov eax, [eax] ; move two entries further, because the
mov eax, [eax] ; third is typically kernel32.dll
; Probe each loaded module for LoadLibraryA until one exports it.
try_next_lib:
push eax ; save LDR_DATA_TABLE_ENTRY pointer
mov ebx, [eax+0x10] ; load base address of the library
mov esi, N_LoadLibrary
call find_import ; load LoadLibrary from there (if present)
or eax, eax ; found?
jnz kernel32_found
pop eax ; restore LDR_DATA_TABLE_ENTRY pointer
mov eax, [eax] ; go to next LDR_DATA_TABLE_ENTRY
jmp try_next_lib
; NOTE(review): the kernel32_found handler (which presumably stores the base
; addresses and loads user32.dll) is missing from this excerpt.
; if (!OpenClipboard(NULL)) ExitProcess(1);
push 0
mov ebx, eax ; user32 base address was still in eax
mov esi, N_OpenClipboard
call call_import
or eax, eax
jz error2
; NOTE(review): the GetClipboardData / GlobalLock steps that should produce
; the string pointer in eax appear to be missing from this excerpt; as shown,
; eax still holds the OpenClipboard result here.
; strlen(str)
mov ecx, eax
strlen_loop:
mov dl, [ecx]
or dl, dl
jz strlen_end
inc ecx
jmp strlen_loop
strlen_end:
sub ecx, eax
; WriteFile(GetStdHandle(STD_OUTPUT_HANDLE), ...)
push 0 ; lpOverlapped = NULL
lea edx, [DummyVar] ; lpBytesWritten
push edx
push ecx ; nNumberOfBytesToWrite = strlen(str)
push eax ; lpBuffer = str
push -11 ; hFile = ...
; mov ebx, [kernel32base]
mov esi, N_GetStdHandle
call call_import ; ... GetStdHandle(STD_OUTPUT_HANDLE)
push eax
; mov ebx, [kernel32base]
mov esi, N_WriteFile
call call_import
; success: exit code 0
push 0
jmp exit
; failure with the clipboard open: close it first
; NOTE(review): nothing in the visible code jumps to this label.
error:
mov ebx, [user32base]
mov esi, N_CloseClipboard
call call_import
; failure: exit code 1
error2:
push 1
; common exit: the exit code is already on the stack
exit:
mov ebx, [kernel32base]
mov esi, N_ExitProcess
; NOTE(review): call_import is entered by jmp here, so no return address is
; pushed above the exit code -- confirm against call_import's stack layout.
jmp call_import
critical_error:
ret
; Zero-terminated names of the DLL and of the API functions that are
; resolved at run time.
N_user32: db "user32.dll",0
N_LoadLibrary: db "LoadLibraryA", 0
N_OpenClipboard: db "OpenClipboard",0
N_GetClipboardData: db "GetClipboardData",0
N_GlobalLock: db "GlobalLock",0
N_GetStdHandle: db "GetStdHandle",0
N_WriteFile: db "WriteFile",0
N_GlobalUnlock: db "GlobalUnlock",0
N_CloseClipboard: db "CloseClipboard",0
N_ExitProcess: db "ExitProcess",0
; pad to the file alignment; the_end marks the end of the image
align ALIGNMENT, db 0
the_end:
Import by hash
Of the 768 bytes in the sectionless version, 118 bytes
(15%!) are spent on function names. That seems a little
excessive, doesn't it? After all, we're not really
interested in the names themselves, we just use them to
find the functions' addresses. As a first try, we could
limit the length of the stored strings by only comparing
the first, say, 7 characters. We won't be able to discern
LoadLibraryA from its Unicode cousin LoadLibraryW this way, but
since the names are guaranteed to be alphabetically sorted
in export tables, we would hit LoadLibraryA first anyway.
However, we can't use fewer than 7 significant bytes,
because otherwise e.g. GlobalLock would be too unspecific
and we would get GlobalAddAtomA instead.
But 7 bytes per import is still quite some data, and the
whole approach is a forward compatibility timebomb, because
future versions of Windows could add new functions to our
two DLLs with catastrophic effect. So, truncating names is
not the best path to follow. However, there's a much more
powerful alternative: Hashing! As said, we're not
interested in the names, not even parts of them. A machine-
readable mapping that can uniquely identify the proper
function name without actually knowing it is sufficient;
bonus points if it's easy to compute. (For our purposes, we
don't need a cryptographically strong hash or anything
fancy, we just want to tell a few function names apart!)
@@ -95,5 +89,5 @@
mov ebx, [eax+0x10] ; load base address of the library
- mov esi, N_LoadLibrary
+ mov esi, 0x01364564 ; hash of "LoadLibraryA"
call find_import ; load LoadLibrary from there (if present)
@@ -123,15 +117,16 @@
cmp_loop:
- lodsb ; load a byte of the two strings into AL,
AH
- mov ah, [edi] ; and increase the pointers
- inc edi
- cmp al, ah ; identical bytes?
- jne next_name ; if not, this is not the correct name
- or al, al ; zero byte reached?
- jnz cmp_loop ; if not, we need to compare more
+ movzx eax, byte [edi] ; load a byte of the name ...
+ inc edi ; ... and advance the pointer
+ xor esi, eax ; apply xor-and-rotate
+ rol esi, 7
+ or eax, eax ; last byte?
+ jnz cmp_loop ; if not, process another byte
+ or esi, esi ; result hash match?
+ jnz next_name ; if not, this is not the correct name
; if we arrive here, we have a match!
@@ -180,5 +175,5 @@
push 0
mov ebx, eax ; user32 base address was still in eax
- mov esi, N_OpenClipboard
+ mov esi, 0xFC7956AD ; hash of "OpenClipboard"
call call_import
@@ -188,5 +183,5 @@
push 1 ; CF_TEXT
; mov ebx, [user32base]
- mov esi, N_GetClipboardData
+ mov esi, 0x0C473D74 ; hash of "GetClipboardData"
call call_import
or eax, eax
@@ -197,5 +192,5 @@
mov ebx, [kernel32base]
- mov esi, N_GlobalLock
+ mov esi, 0x4A88F58C ; hash of "GlobalLock"
call call_import
@@ -221,18 +216,18 @@
; mov ebx, [kernel32base]
- mov esi, N_GetStdHandle
+ mov esi, 0xEACA71C2 ; hash of "GetStdHandle"
call call_import ; ... GetStdHandle(STD_OUTPUT_HANDLE)
push eax
; mov ebx, [kernel32base]
- mov esi, N_WriteFile
+ mov esi, 0x3FD1C30F ; hash of "WriteFile"
call call_import
The result is 656 bytes, 112 bytes less than the version
without import-by-hash. It's not quite the optimal amount
of savings (which would be 118 bytes, the size of the name
strings) because the comparison grew a little bit, but
still quite an impressive result.
Header trickery
Before our short excursion into the land of hashes, we
worked hard on bypassing the alignment limits, but still
there's a lot of space spent in the PE headers. One trivial
thing is to remove the data directories, as we don't even
have table-based imports by now. But that's not all:
Fortunately, there are many fields in the headers that
aren't evaluated by the Windows loader where we can put
other stuff in. The largest part of this is the 64-byte DOS
header at the beginning, of which only the first two bytes
(the "MZ" signature) and the last four bytes (the address
of the PE header) are important. We can actually move
("collapse") the PE header inside the DOS header, all the
way until address 4 (which is the minimum alignment
requirement). In this case, the PE header location field of
the DOS header coincides with the section alignment field
of the PE header, so we get a section (and file) alignment
of 4 - perfect!
The following dump is what the headers now look like. The
main part is the same, except that the blocks that have
been moved into the headers (N_user32, next_name and parts of
main) are now obviously gone:
; Collapsed headers: the PE header starts at file offset 4, inside the DOS
; header, and header fields marked [UNUSED-n] are overwritten with n bytes of
; program data or code. Code placed inside the headers is chained together
; with short jumps (main_part_1 -> main_part_2 -> main_part_3).
; NOTE(review): RVA(), BASE, SECTALIGN, ALIGNMENT, name_loop and the_end are
; used but not defined in this excerpt.
mz_hdr:
dw "MZ" ; DOS magic
dw "kj" ; filler to align the PE header
pe_hdr:
dw "PE",0 ; PE magic + 2 padding bytes
dw 0x014c ; i386 architecture
dw 0 ; no sections
N_user32: db "user32.dll",0,0 ; 12 bytes of data collapsed into the header
;dd 0 ; [UNUSED-12] timestamp
;dd 0 ; [UNUSED] symbol table pointer
;dd 0 ; [UNUSED] symbol count
dw 8 ; optional header size
dw 0x0102 ; characteristics: 32-bit, executable
opt_hdr:
dw 0x010b ; optional header magic
main_part_1: ; 12 bytes of main entry point + 2 bytes of jump
mov eax, [fs:0x30] ; get PEB pointer from TEB
mov eax, [eax+0x0C] ; get PEB_LDR_DATA pointer from PEB
mov eax, [eax+0x14] ; go to first LDR_DATA_TABLE_ENTRY
jmp main_part_2
align 4, db 0
;db 13,37 ; [UNUSED-14] linker version
;dd RVA(the_end) ; [UNUSED] code size
;dd RVA(the_end) ; [UNUSED] size of initialized data
;dd 0 ; [UNUSED] size of uninitialized data
dd RVA(main_part_1) ; entry point address
main_part_2: ; another 6 bytes of code + 2 bytes of jump
; set up stack frame for local variables
push ebp
%define DummyVar ebp-4
%define kernel32base ebp-8
%define user32base ebp-12
sub esp, 12
mov eax, [eax] ; go to where ntdll.dll typically is
jmp main_part_3
align 4, db 0
;dd RVA(main) ; [UNUSED-8] base of code
;dd RVA(main) ; [UNUSED] base of data
dd BASE ; image base
dd SECTALIGN ; section alignment (collapsed with the
; PE header offset in the DOS header)
dd ALIGNMENT ; file alignment
next_name: ; we interrupt again for a few bytes of code from the loader
pop esi ; restore the name pointer
add edx, 4 ; advance to next list item
jmp name_loop
align 4, db 0
;dw 4,0 ; [UNUSED-8] OS version
;dw 0,0 ; [UNUSED] image version
dw 4,0 ; subsystem version
dd 0 ; [UNUSED-4] Win32 version
dd RVA(the_end) ; size of image
dd RVA(opt_hdr) ; size of headers (must be small enough
; so that entry point inside header is accepted)
dd 0 ; [UNUSED-4] checksum
dw 3 ; subsystem = console
dw 0 ; [UNUSED-6] DLL characteristics
dd 0x00100000 ; maximum stack size
dd 0x00001000 ; initial stack size
dd 0x00100000 ; maximum heap size
dd 0x00001000 ; initial heap size
dd 0 ; [UNUSED-4] loader flags
dd 0 ; number of data directory entries (= none!)
OPT_HDR_SIZE equ $ - opt_hdr
ALL_HDR_SIZE equ $ - $$
main_part_3:
mov eax, [eax] ; go to where kernel32.dll typically is
try_next_lib:
; (from here on, not much has changed)
With this, we're at 436 bytes, a whopping 33% less than
before! The downside is that the header declarations in the
source code become quite unreadable by now, and that we're
no longer forward compatible: A future version of Windows
might decide that the OS version listed in the EXE file is
now totally relevant and may thus not want to execute files
made for version 33630.1068.
Unsafe optimizations
All along the way, we were cautious not to remove any
checks and clean exits in case of failure. But we're
already relying on a few details of the PE loader that are
unlikely to change soon, but are not carved into stone
either. So why not go full YOLO and strip off all the
safety nets? We could assume that
bits 32
; "Unsafe" variant of the collapsed headers: the PE header starts at file
; offset 4, inside the DOS header, and header fields marked [UNUSED-n] are
; overwritten with n bytes of program data or code. Code placed inside the
; headers is chained together with short jumps
; (main_part_1 -> main_part_2 -> main_part_3 -> main_part_4).
; NOTE(review): RVA() is used but not defined in this excerpt.
BASE equ 0x00400000
ALIGNMENT equ 4
SECTALIGN equ 4
org BASE
mz_hdr:
dw "MZ" ; DOS magic
dw "kj" ; filler to align the PE header
pe_hdr:
dw "PE",0 ; PE magic + 2 padding bytes
dw 0x014c ; i386 architecture
dw 0 ; no sections
N_user32: db "user32.dll",0,0 ; 12 bytes of data collapsed into the header
;dd 0 ; [UNUSED-12] timestamp
;dd 0 ; [UNUSED] symbol table pointer
;dd 0 ; [UNUSED] symbol count
dw 8 ; optional header size
dw 0x0102 ; characteristics: 32-bit, executable
opt_hdr:
dw 0x010b ; optional header magic
main_part_1: ; 12 bytes of main entry point + 2 bytes of jump
mov eax, [fs:0x30] ; get PEB pointer from TEB
mov eax, [eax+0x0C] ; get PEB_LDR_DATA pointer from PEB
mov eax, [eax+0x14] ; go to first LDR_DATA_TABLE_ENTRY
jmp main_part_2
align 4, db 0
;db 13,37 ; [UNUSED-14] linker version
;dd RVA(the_end) ; [UNUSED] code size
;dd RVA(the_end) ; [UNUSED] size of initialized data
;dd 0 ; [UNUSED] size of uninitialized data
dd RVA(main_part_1) ; entry point address
main_part_2: ; another 6 bytes of code + 2 bytes of jump
; set up stack frame for local variables
push ebp
%define DummyVar ebp-4
%define kernel32base ebp-8
%define user32base ebp-12
sub esp, 12
mov eax, [eax] ; go to where ntdll.dll typically is
jmp main_part_3
align 4, db 0
;dd RVA(main) ; [UNUSED-8] base of code
;dd RVA(main) ; [UNUSED] base of data
dd BASE ; image base
dd SECTALIGN ; section alignment (collapsed with the
; PE header offset in the DOS header)
dd ALIGNMENT ; file alignment
main_part_3: ; another 5 bytes of code + 2 bytes of jump
mov eax, [eax] ; go to where kernel32.dll typically is
mov ebx, [eax+0x10] ; load base address of the library
jmp main_part_4
align 4, db 0
;dw 4,0 ; [UNUSED-8] OS version
;dw 0,0 ; [UNUSED] image version
dw 4,0 ; subsystem version
dd 0 ; [UNUSED-4] Win32 version
dd RVA(the_end) ; size of image
dd RVA(opt_hdr) ; size of headers (must be small enough
; so that entry point inside header is accepted)
dd 0 ; [UNUSED-4] checksum
dw 3 ; subsystem = console
dw 0 ; [UNUSED-2] DLL characteristics
dd 0x00100000 ; maximum stack size
dd 0x00001000 ; initial stack size
dd 0x00100000 ; maximum heap size
dd 0x00001000 ; initial heap size
dd 0 ; [UNUSED-4] loader flags
dd 0 ; number of data directory entries (= none!)
OPT_HDR_SIZE equ $ - opt_hdr
ALL_HDR_SIZE equ $ - $$
; Main program of the "unsafe" variant, reached from the code fragments
; embedded in the headers with ebx = base address of the third module in the
; loader list (assumed to be kernel32.dll, without verification).
; API functions are resolved by name hash: esi = hash, ebx = DLL base,
; arguments on the stack, then call_import.
; NOTE(review): call_import is referenced but not defined in this excerpt.
main_part_4:
mov [kernel32base], ebx ; store kernel32's base address
mov esi, 0x01364564 ; hash of "LoadLibraryA"
push N_user32 ; we want to load user32.dll
call call_import ; call LoadLibraryA
mov [user32base], eax ; store user32's base address
; if (!OpenClipboard(NULL)) ExitProcess(1);
push 0
mov ebx, eax ; user32 base address was still in eax
mov esi, 0xFC7956AD ; hash of "OpenClipboard"
call call_import
or eax, eax
jz error
; NOTE(review): the GetClipboardData / GlobalLock steps that should produce
; the string pointer in eax appear to be missing from this excerpt; as shown,
; eax still holds the OpenClipboard result here.
; strlen(str)
mov ecx, eax
strlen_loop:
mov dl, [ecx]
or dl, dl
jz strlen_end
inc ecx
jmp strlen_loop
strlen_end:
sub ecx, eax
; WriteFile(GetStdHandle(STD_OUTPUT_HANDLE), ...)
push 0 ; lpOverlapped = NULL
lea edx, [DummyVar] ; lpBytesWritten
push edx
push ecx ; nNumberOfBytesToWrite = strlen(str)
push eax ; lpBuffer = str
push -11 ; hFile = ...
mov ebx, [kernel32base]
mov esi, 0xEACA71C2 ; hash of "GetStdHandle"
call call_import ; ... GetStdHandle(STD_OUTPUT_HANDLE)
push eax
; mov ebx, [kernel32base]
mov esi, 0x3FD1C30F ; hash of "WriteFile"
call call_import
; ExitProcess(0);
push 0
jmp exit
; failure: exit code 1 (no clipboard cleanup in this variant)
error:
push 1
; common exit: the exit code is already on the stack
exit:
mov ebx, [kernel32base]
mov esi, 0x665640AC ; hash of "ExitProcess"
; fall-through into call_import
; NOTE(review): in this excerpt the_end follows immediately, so the
; call_import body that execution is meant to fall into is not shown.
align ALIGNMENT, db 0
the_end:
The final result with this is 316 bytes, another 27% less
than before!
Conclusion
This concludes our journey into size optimization. At this
point, we're 240 times smaller than the naïve first C
implementation, and even if we consider our first serious
optimization step (the C implementation without C library)
as the starting point, we're still almost 10 times smaller.
But admittedly, the amount of effort necessary for this is
extremely high and hardly justified ;)