Basic datatype reversing

Table of Contents

Basic Types

This is a basic overview of datatypes after a program has been compiled. All examples have been compiled with gcc; it might be worthwhile to compare the output with Visual Studio. Everything is compiled as x64, therefore padding is usually done on 8 byte sizes and pointers are also 8 bytes; for 32 bit applications this should be different (4 bytes).

Local and Global variables

This example uses global and local variables. The global variables end up in the data section of the program, while the local variables will be part of the stack.

basic_types.c:

#include <stdio.h>

// Initialized global variables; setValues() overwrites each of them.
int globalInt = 5;
char globalChar = 'C';            // 0x43
short globalShort = 123;          // 0x7b
long globalLong = 0x42424242;

// Stores a constant in one local variable per basic type and then copies
// every local into the global variable of the matching type.
void setValues(){
    int localInt = 555;               // 0x22b
    char localChar = 'E';             // 0x45
    short localShort = 4545;          // 0x11c1
    long localLong = 0x4343434343;    // constant wider than 32 bits

    // Copy each local into its global counterpart.
    globalInt = localInt;
    globalChar = localChar;
    globalShort = localShort;
    globalLong = localLong;
}

// Calls setValues() and then prints the four global variables, one per line.
int main(){
    setValues();

    printf("%d\r\n", globalInt);
    printf("%c\r\n", globalChar);
    printf("%d\r\n", globalShort);
    printf("0x%lx\r\n", globalLong);   // the long is printed in hexadecimal
    return 0;
}

All code on this page has been compiled with:

  • gcc basic_types.c -o basic_types

Output:

./basic_types
555
E
4545
0x4343434343

file basic_types
basic_types: ELF 64-bit LSB shared object, x86-64, version 1 (SYSV), dynamically linked, interpreter /lib64/ld-linux-x86-64.so.2, BuildID[sha1]=d5c99e6ba65bfb8957534b9c2e7df3d3b8213f2e, for GNU/Linux 3.2.0, not stripped

Viewing the data section in the disassembler (using binja) shows:

00004010  uint32_t globalInt = 0x5
00004014  uint8_t globalChar = 0x43

00004015  00                                                     . padding

00004016  uint16_t globalShort = 0x7b
00004018  uint64_t globalLong = 0x42424242

Partial disassembly of the setValues() function (there does not seem to be an obvious alignment like the global variables had):

.text:0000000000001151 008                 mov     [rbp+var_C], 22Bh
.text:0000000000001158 008                 mov     [rbp+var_F], 45h ; 'E'
.text:000000000000115C 008                 mov     rax, 4343434343h
.text:0000000000001166 008                 mov     [rbp+var_8], rax
.text:000000000000116A 008                 mov     [rbp+var_E], 11C1h
.text:0000000000001170 008                 mov     eax, [rbp+var_C]
.text:0000000000001173 008                 mov     cs:globalInt, eax
.text:0000000000001179 008                 movzx   eax, [rbp+var_F] ; Move with Zero-Extend
.text:000000000000117D 008                 mov     cs:globalChar, al
.text:0000000000001183 008                 movzx   eax, [rbp+var_E] ; Move with Zero-Extend
.text:0000000000001187 008                 mov     cs:globalShort, ax
.text:000000000000118E 008                 mov     rax, [rbp+var_8]
.text:0000000000001192 008                 mov     cs:globalLong, rax
.text:0000000000001199 008                 nop                     ; No Operation

Changing the order of the global variable declarations changes the padding; two examples of this are shown here.

Change 1

Changed code:

#include <stdio.h>

int globalInt = 5;
char globalChar = 'C';
long globalLong = 0x42424242;   // long is now declared before short
short globalShort = 123;

[...]

Changed disassembler output:

00004010  uint32_t globalInt = 0x5
00004014  uint8_t globalChar = 0x43

00004015  00 00 00                                               ... padding

00004018  uint64_t globalLong = 0x42424242
00004020  uint16_t globalShort = 0x7b

Change 2

Changed code:

char globalChar = 'C';          // char is now declared first
long globalLong = 0x42424242;
short globalShort = 123;
int globalInt = 5;              // int is now declared last

Changed disassembler output:

00004010  uint8_t globalChar = 0x43

00004011  00 00 00 00 00 00 00         ....... padding

00004018  uint64_t globalLong = 0x42424242
00004020  uint16_t globalShort = 0x7b

00004022  00 00                          .. padding

00004024  uint32_t globalInt = 0x5

Structs

Structs in C are data structures that contain a number of elements which are stored consecutively in memory. Those elements can be of different types and, depending on the size of the elements, padding might occur. To determine the type of an element it is necessary to observe how the elements are accessed; in the following disassembler outputs, it can be observed that the elements are accessed differently. The char is accessed via a byte pointer, while int and long use dword and qword respectively (short would likely be two bytes and therefore a word).

Example of setting char c to “E”:

mov byte ptr [rax+0x4], 0x45

Example source code:

#include <stdio.h>

// Example struct mixing member sizes. The offsets noted per member are the
// ones used by the stores in the disassembly of pointer().
typedef struct {
    int a;      // offset 0x0
    char c;     // offset 0x4
    int b;      // offset 0x8
    long l;     // offset 0x10
} Test ; 

// Global instance of Test, declared without an initializer.
Test tg;

// Prints the four members of the struct that t points to and afterwards
// overwrites each member with a new constant.
void pointer(Test *t){

    // Print the current member values first.
    printf("%d\r\n", t->a);
    printf("%c\r\n", t->c);
    printf("%d\r\n", t->b);
    printf("0x%lx\r\n", t->l);

    // Then assign new values through the pointer.
    t->a = 555;     // 0x22b
    t->c = 'E';     // 0x45
    t->b = 5555;    // 0x15b3
    t->l = 55555;   // 0xd903
}

// Entry point of the struct example: fills a function-local struct, lets
// pointer() print and overwrite a local Test, then prints that Test and the
// global tg.
int main(){
    // Function-local struct type with the same members as the global Test.
    typedef struct {
        int a;
        char c;
        int b; 
        long l;
    } TestLocal ;
    
    // Zero-initialized: pointer() prints every member before assigning it,
    // so without an initializer it would print indeterminate values.
    Test t = {0};
    TestLocal tl;

    // tl is only written here; it is not read anywhere in this function.
    tl.a = 4444;
    tl.c = 'F';
    tl.b = 44444;
    tl.l = 444444;

    // Prints the (zeroed) members of t and then overwrites them.
    pointer(&t);

    // Members of t as set by pointer().
    printf("%d\r\n", t.a);
    printf("%c\r\n", t.c);
    printf("%d\r\n", t.b);
    printf("0x%lx\r\n", t.l);

    // Members of the global tg, which is never assigned in this program.
    printf("%d\r\n", tg.a);
    printf("%c\r\n", tg.c);
    printf("%d\r\n", tg.b);
    printf("0x%lx\r\n", tg.l);
    return 0;
}

Excerpt of the end of the pointer function (the assigning of new values):

000011e6  488b45f8           mov     rax, qword ptr [rbp-0x8 {arg1}]
000011ea  c7002b020000       mov     dword ptr [rax], 0x22b
000011f0  488b45f8           mov     rax, qword ptr [rbp-0x8 {arg1}]
000011f4  c6400445           mov     byte ptr [rax+0x4], 0x45
000011f8  488b45f8           mov     rax, qword ptr [rbp-0x8 {arg1}]
000011fc  c74008b3150000     mov     dword ptr [rax+0x8], 0x15b3
00001203  488b45f8           mov     rax, qword ptr [rbp-0x8 {arg1}]
00001207  48c7401003d90000   mov     qword ptr [rax+0x10], 0xd903

The following shows the data section for the global variable, as before we can observe padding for alignment.

00004020  int32_t tg = 0x0
00004024  char data_4024 = 0x0

00004025  00 00 00      ... padding

00004028  int32_t data_4028 = 0x0

0000402c  00 00 00 00   .... padding

00004030  int64_t data_4030 = 0x0

Unions

Unions in C are data structures that can hold one of multiple datatypes. A union can contain multiple datatypes; however, at any given point in time it represents only one of them. All of those types use the same memory address.

#include <stdio.h>

// Union of four integer types of different sizes; all members share the
// same storage.
typedef union{
    char c;
    int i;
    long l;
    short s;
} Uni;

// Global instance of the union, written and read by valueCyclce().
Uni u;

// Writes each member of the global union u in turn (char, int, long, short)
// and prints the member immediately after writing it.
void valueCyclce(){
    u.c = 'c';      // 0x63
    printf("%c\r\n", u.c);

    u.i = 123;      // 0x7b
    printf("%d\r\n", u.i);
    
    u.l = 345;      // 0x159
    // NOTE(review): "%ld" prints the value in decimal, so the "0x" prefix in
    // this format string is misleading.
    printf("0x%ld\r\n", u.l);
    
    u.s = 42;       // 0x2a
    printf("%d\r\n", u.s);
}

// Entry point of the union example; only runs valueCyclce().
int main(){
    valueCyclce();

    return 0;
}

In this case, it can be seen that in the data section room for only one 64 bit value is reserved, which is the size of the largest member that this specific union can contain.

00004018  uint64_t u = 0x0

The following disassembly is a shortened version of the valueCyclce() function; basically, the parameter setup for the printf calls has been removed. The printf calls are in the order char, int, long, short. As can be seen, this results in the use of different mov instructions:

  • mov byte ptr [u], 0x63 // the byte for the char
  • mov dword ptr [u], 0x7b // dword (4 byte) for the int
  • mov qword ptr [u], 0x159 // qword (8 byte) for the long
  • mov word ptr [u], 0x2a // word (2 byte) for the short
00001151  c605c02e000063     mov     byte ptr [u], 0x63
00001158  0fb605b92e0000     movzx   eax, byte ptr [u]
...
00001170  e8dbfeffff         call    printf
00001175  c705992e00007b00  mov     dword ptr [u], 0x7b
0000117f  8b05932e0000       mov     eax, dword ptr [u]
...
00001193  e8b8feffff         call    printf
00001198  48c705752e000059  mov     qword ptr [u], 0x159
000011a3  488b056e2e0000     mov     rax, qword ptr [u]
...
000011b9  e892feffff         call    printf
000011be  66c705512e00002a  mov     word ptr [u], 0x2a
000011c7  0fb7054a2e0000     movzx   eax, word ptr [u]
...
000011dd  e86efeffff         call    printf

This is the unabridged disassembly code as shown by binja 3.0.3426-dev:

valueCyclce:
00001149  f30f1efa           endbr64 
0000114d  55                 push    rbp {__saved_rbp}
0000114e  4889e5             mov     rbp, rsp {__saved_rbp}
00001151  c605c02e000063     mov     byte ptr [u], 0x63
00001158  0fb605b92e0000     movzx   eax, byte ptr [u]
0000115f  0fbec0             movsx   eax, al
00001162  89c6               mov     esi, eax
00001164  488d3d990e0000     lea     rdi, qword ptr [data_2004]  {"%c\r\n"}
0000116b  b800000000         mov     eax, 0x0
00001170  e8dbfeffff         call    printf
00001175  c705992e00007b00  mov     dword ptr [u], 0x7b
0000117f  8b05932e0000       mov     eax, dword ptr [u]
00001185  89c6               mov     esi, eax
00001187  488d3d7b0e0000     lea     rdi, qword ptr [data_2009]  {"%d\r\n"}
0000118e  b800000000         mov     eax, 0x0
00001193  e8b8feffff         call    printf
00001198  48c705752e000059  mov     qword ptr [u], 0x159
000011a3  488b056e2e0000     mov     rax, qword ptr [u]
000011aa  4889c6             mov     rsi, rax
000011ad  488d3d5a0e0000     lea     rdi, qword ptr [data_200e]  {"0x%ld\r\n"}
000011b4  b800000000         mov     eax, 0x0
000011b9  e892feffff         call    printf
000011be  66c705512e00002a  mov     word ptr [u], 0x2a
000011c7  0fb7054a2e0000     movzx   eax, word ptr [u]
000011ce  98                 cwde    
000011cf  89c6               mov     esi, eax
000011d1  488d3d310e0000     lea     rdi, qword ptr [data_2009]  {"%d\r\n"}
000011d8  b800000000         mov     eax, 0x0
000011dd  e86efeffff         call    printf
000011e2  90                 nop     
000011e3  5d                 pop     rbp {__saved_rbp}
000011e4  c3                 ret      {__return_addr}

Enums

I have not looked too much into enums; it seems they are resolved at compile time. With the same gcc compilation as in the other cases, the following code turns into disassembled code that contains the value of the weekday, but the rest of the enum does not seem to be part of the binary.

#include<stdio.h>
  
// Days of the week; enumerators without explicit values count up from 0,
// so Monday is 0 and Thursday is 3.
enum week{Monday, Tuesday, Wednesday, Thursday, Friday, Saturday, Sunday};
  
// Assigns the enumerator Thursday to a local enum variable and prints its
// integer value.
int main()
{
    enum week day;
    day = Thursday; // 3
    printf("%d", day);
    return 0;
} 
main:
00001149  f30f1efa           endbr64 
0000114d  55                 push    rbp {__saved_rbp}
0000114e  4889e5             mov     rbp, rsp {__saved_rbp}
00001151  4883ec10           sub     rsp, 0x10
00001155  // value of enum Thursday
00001155  c745fc03000000     mov     dword ptr [rbp-0x4 {var_c}], 0x3
0000115c  8b45fc             mov     eax, dword ptr [rbp-0x4]  {0x3}
0000115f  89c6               mov     esi, eax  {0x3}
00001161  488d3d9c0e0000     lea     rdi, qword ptr [format]  {"%d"}
00001168  b800000000         mov     eax, 0x0
0000116d  e8defeffff         call    printf
00001172  b800000000         mov     eax, 0x0
00001177  c9                 leave    {__saved_rbp}
00001178  c3                 ret      {__return_addr}