C#


Contents#




Binary Prefixes#

  • [ w ] Binary Prefix

  • [ w ] names of large numbers

  • [ w ] Unit Prefix

print(f'2^10 B = 1024^1 B = {2**10:>35,.0f} B = 1 KiB (about one thousand B)')
print(f'2^20 B = 1024^2 B = {2**20:>35,.0f} B = 1 MiB (about one million B)')
print(f'2^30 B = 1024^3 B = {2**30:>35,.0f} B = 1 GiB (about one billion B)')
print(f'2^40 B = 1024^4 B = {2**40:>35,.0f} B = 1 TiB (about one trillion B)')
print(f'2^50 B = 1024^5 B = {2**50:>35,.0f} B = 1 PiB (about one quadrillion B)')
print(f'2^60 B = 1024^6 B = {2**60:>35,.0f} B = 1 EiB (about one quintillion B)')
print(f'2^70 B = 1024^7 B = {2**70:>35,.0f} B = 1 ZiB (about one sextillion B)')
print(f'2^80 B = 1024^8 B = {2**80:>35,.0f} B = 1 YiB (about one septillion B)')
2^10 B = 1024^1 B =                               1,024 B = 1 KiB (about one thousand B)
2^20 B = 1024^2 B =                           1,048,576 B = 1 MiB (about one million B)
2^30 B = 1024^3 B =                       1,073,741,824 B = 1 GiB (about one billion B)
2^40 B = 1024^4 B =                   1,099,511,627,776 B = 1 TiB (about one trillion B)
2^50 B = 1024^5 B =               1,125,899,906,842,624 B = 1 PiB (about one quadrillion B)
2^60 B = 1024^6 B =           1,152,921,504,606,846,976 B = 1 EiB (about one quintillion B)
2^70 B = 1024^7 B =       1,180,591,620,717,411,303,424 B = 1 ZiB (about one sextillion B)
2^80 B = 1024^8 B =   1,208,925,819,614,629,174,706,176 B = 1 YiB (about one septillion B)



Types#

Type

Bytes (32-bit)

Bytes (64-bit)

32-bit Range

printf

char

1

1

[0, 255]

  %c

short int

2

2

[-32,768, 32,767]

%hd

short unsigned int

2

2

[0, 65,535]

%hu

int

4

4

[-214,748,648, 214,748,647]

  %d

unsigned int

4

4

  %u

long int

4

8

%ld

long long int

8

8

%lld

long unsigned int

%lu

long long unsigned int

%llu

float

4

4

  %f

double

8

8

%lf

long double

12

16

%Lf

float scientific

  %e

pointer (memory address)

4

8

  %p

string

  %s

octal

  %o

hexadecimal

  %x

C99 extended integer types

  • int8_t exactly 8 bits signed

  • int16_t exactly 16 bits signed

  • int32_t exactly 32 bits signed

  • int64_t exactly 64 bits signed

  • uint8_t exactly 8 bits unsigned

int8_t
int16_t
int32_t
int64_t
int128_t

uint8_t
uint16_t
uint32_t
uint64_t
uint128_t

defined in stdint.h

8-bit unsigned int that stores a value in the range 0-255

useful for dealing with data that can be represented by 8 bits, such as data buffers

Determining their size#

UINT_MAX  (%u)   :   4294967295
UINT_MAX  (%d)   :   -1
UINT_MAX  (%x)   :   ffffffff
ULONG_MAX (%lu)  :   18446744073709551615
ULONG_MAX (%ld)  :   -1
ULONG_MAX (%lx)  :   ffffffffffffffff
void print_type_sizes () {
  printf("               char : %02zu bits\n", sizeof(               char) * 8);
  printf("unsigned       char : %02zu bits\n", sizeof(unsigned       char) * 8);
  printf("              short : %02zu bits\n", sizeof(              short) * 8);
  printf("unsigned      short : %02zu bits\n", sizeof(unsigned      short) * 8);
  printf("                int : %02zu bits\n", sizeof(                int) * 8);
  printf("unsigned        int : %02zu bits\n", sizeof(unsigned        int) * 8);
  printf("               long : %02zu bits\n", sizeof(               long) * 8);
  printf("unsigned       long : %02zu bits\n", sizeof(unsigned       long) * 8);
  printf("          long long : %02zu bits\n", sizeof(          long long) * 8);
  printf("unsigned  long long : %02zu bits\n", sizeof(unsigned  long long) * 8);
  printf("              float : %02zu bits\n", sizeof(              float) * 8);
  printf("             double : %02zu bits\n", sizeof(             double) * 8);
  printf("        long double : %02zu bits\n", sizeof(        long double) * 8);
  printf("              void* : %02zu bits\n", sizeof(              void*) * 8);
}
               char : 08 bits
unsigned       char : 08 bits
              short : 16 bits
unsigned      short : 16 bits
                int : 32 bits
unsigned        int : 32 bits
               long : 64 bits
unsigned       long : 64 bits
          long long : 64 bits
unsigned  long long : 64 bits
              float : 32 bits
             double : 64 bits
        long double : 64 bits
              void* : 64 bits
/* sizeof.c */

#include <limits.h> /* CHAR_BIT */
#include <stdio.h>  /* printf   */
#include <stdint.h> /* int8_t, uint8_t, */

int main (int argc, char** argv) {
  printf("CHAR_BIT           : %2zu b\n", (size_t)CHAR_BIT);
  printf("\n");
  printf("              char : %2zu B    %25d    (CHAR_MIN) - %25d     (CHAR_MAX)\n", sizeof(              char),  CHAR_MIN,   CHAR_MAX);
  printf("  signed      char : %2zu B    %25d   (SCHAR_MIN) - %25d    (SCHAR_MAX)\n", sizeof(  signed      char), SCHAR_MIN,  SCHAR_MAX);
  printf("unsigned      char : %2zu B    %25u               - %25u    (UCHAR_MAX)\n", sizeof(unsigned      char),         0,  UCHAR_MAX);
  printf("             short : %2zu B    %25d    (SHRT_MIN) - %25d     (SHRT_MAX)\n", sizeof(             short),  SHRT_MIN,   SHRT_MAX);
  printf("unsigned     short : %2zu B    %25u               - %25u    (USHRT_MAX)\n", sizeof(unsigned     short),         0,  USHRT_MAX);
  printf("               int : %2zu B    %25d     (INT_MIN) - %25u      (INT_MAX)\n", sizeof(               int),   INT_MIN,    INT_MAX);
  printf("unsigned           : %2zu B    %25d               - %25u     (UINT_MAX)\n", sizeof(unsigned          ),         0,   UINT_MAX);
  printf("              long : %2zu B    %25ld    (LONG_MIN) - %25ld     (LONG_MAX)\n", sizeof(              long),  LONG_MIN,   LONG_MAX);
  printf("unsigned      long : %2zu B    %25u               - %25lu    (ULONG_MAX)\n", sizeof(unsigned      long),         0,  ULONG_MAX);
  printf("         long long : %2zu B    %25lld   (LLONG_MIN) - %25lld    (LLONG_MAX)\n", sizeof(         long long), LLONG_MIN,  LLONG_MAX);
  printf("unsigned long long : %2zu B    %25u               - %25llu   (ULLONG_MAX)\n", sizeof(unsigned long long),         0, ULLONG_MAX);
  printf("             float : %2zu B\n", sizeof(float));
  printf("            double : %2zu B\n", sizeof(double));
  printf("       long double : %2zu B\n", sizeof(long double));
  printf("            size_t : %2zd B\n", sizeof(size_t));
  printf("            int8_t : %2zd B    %25d     (INT8_MIN) - %25d   (INT8_MAX)\n", sizeof(  int8_t),  INT8_MIN,   INT8_MAX);
  printf("           uint8_t : %2zu B    %25u                - %25u  (UINT8_MAX)\n", sizeof( uint8_t),         0,  UINT8_MAX);
  printf("           int16_t : %2zd B    %25d    (INT16_MIN) - %25d  (INT16_MAX)\n", sizeof( int16_t), INT16_MIN,  INT16_MAX);
  printf("          uint16_t : %2zu B    %25u                - %25u (UINT16_MAX)\n", sizeof(uint16_t),         0, UINT16_MAX);
  printf("           int32_t : %2zd B    %25d    (INT32_MIN) - %25d  (INT32_MAX)\n", sizeof( int32_t), INT32_MIN,  INT32_MAX);
  printf("          uint32_t : %2zu B    %25u                - %25u (UINT32_MAX)\n", sizeof(uint32_t),         0, UINT32_MAX);
  printf("           int64_t : %2zd B    %25ld    (INT64_MIN) - %25ld  (INT64_MAX)\n", sizeof( int64_t), INT64_MIN, INT64_MAX);
  printf("          uint64_t : %2zu B    %25u                - %25lu (UINT64_MAX)\n", sizeof(uint64_t),         0, UINT64_MAX);

  return 0;
}
CHAR_BIT           :  8 b

              char :  1 B                            0    (CHAR_MIN) -                       255     (CHAR_MAX)
  signed      char :  1 B                         -128   (SCHAR_MIN) -                       127    (SCHAR_MAX)
unsigned      char :  1 B                            0               -                       255    (UCHAR_MAX)
             short :  2 B                       -32768    (SHRT_MIN) -                     32767     (SHRT_MAX)
unsigned     short :  2 B                            0               -                     65535    (USHRT_MAX)
               int :  4 B                  -2147483648     (INT_MIN) -                2147483647      (INT_MAX)
unsigned           :  4 B                            0               -                4294967295     (UINT_MAX)
              long :  8 B         -9223372036854775808    (LONG_MIN) -       9223372036854775807     (LONG_MAX)
unsigned      long :  8 B                            0               -      18446744073709551615    (ULONG_MAX)
         long long :  8 B         -9223372036854775808   (LLONG_MIN) -       9223372036854775807    (LLONG_MAX)
unsigned long long :  8 B                            0               -      18446744073709551615   (ULLONG_MAX)
             float :  4 B
            double :  8 B
       long double : 16 B
            size_t :  8 B
            int8_t :  1 B                         -128     (INT8_MIN) -                       127   (INT8_MAX)
           uint8_t :  1 B                            0                -                       255  (UINT8_MAX)
           int16_t :  2 B                       -32768    (INT16_MIN) -                     32767  (INT16_MAX)
          uint16_t :  2 B                            0                -                     65535 (UINT16_MAX)
           int32_t :  4 B                  -2147483648    (INT32_MIN) -                2147483647  (INT32_MAX)
          uint32_t :  4 B                            0                -                4294967295 (UINT32_MAX)
           int64_t :  8 B         -9223372036854775808    (INT64_MIN) -       9223372036854775807  (INT64_MAX)
          uint64_t :  8 B                            0                -      18446744073709551615 (UINT64_MAX)

Loss of precision#

#include <limits.h>
#include <stdio.h>

int main ()
{
  unsigned large_int = UINT_MAX;
  float    float_int = (float)large_int;
  double   doubl_int = (double)large_int;

  printf("unsigned int: %u\n", large_int);
  printf("float       : %f\n", float_int);
  printf("double      : %f\n", doubl_int);

  return 0;
}

Data Type

  • an abstraction over memory ranges and the data they contain, that allows them to be classified into different types

  • and so a variable name is just an alias for a memory range

short int   si = 9;
long  int   li = 1234567890L;
float        f = 3.14;
double       d = 1234567890.1234567;
char         c = 'a';
char      *ptr = &c;
short int *ptr = &si;

Type System

  • the compiler uses type information in order to determine how to apply logical operations defined by the code, and to determine ultimately which machine instructions are generated and executed

double one = 3.14, two = 4.5, res1;
int    three = 3, four = 4059; res2;

res1 = one + two      // double + double
res2 = three + four   // int + int

Programming language typing

  • distinctions between strong typing and weak typing refer to the quality of the language in dealing with ambiguity in types and usage

  • C IS WEAKLY TYPED, which leads to greater flexibility at the cost of being more error-prone

Careful

  • expressions like one + 1

  • passing parameters like f(one)

  • comparisons like one == 1

Type Casting (type) variable

double one = 3.24;
if (one / 3 == 1) printf("equal");         // 3.24 / 3.0 == 1.0
else              printf("not equal")

double one = 3.24;
if ((int)one / 3 == 1) printf("equal");    // 3 / 3 == 1
else                   printf("not equal")
// typecast.c

#include <stdio.h>

int main (void) {
  short int   si = 9;
  long  int   li = 1234567890L;
  float        f = 3.14;
  double       d = 1234567890.1234567;
  char         c = 'a';
  char      *ptr = &si;

  printf("short int %d %f %p\n",    (int)si,  (float)si, (char *)si);
  printf("long  int %d %f %p\n",    (int)li,  (float)li, (char *)li);
  printf("float     %d %f (ERR)\n", (int)f,   (float)f);
  printf("double    %d %f (ERR)\n", (int)d,   (float)d);
  printf("char      %d %f %p\n",    (int)c,   (float)c,  (char *)c);
  printf("ptr       %d (ERR) %p\n", (int)ptr, (char *)ptr);

  return 0;
}
./typecast
short int 9 9.000000 0x9
long  int 1234567890 1234567936.000000 0x499602d2
float     3 3.140000 (ERR)
double    1234567890 1234567936.000000 (ERR)
char      97 97.000000 0x61
ptr       1875863146 (ERR) 0x16fcf666a

typedef#

typedef old_type new_type

The compiler treats the new type just like the old type: the new type is like an alias of the old type.

EXAMPLE

typedef unsigned char bitfield

bifield my_func (bitfield x, int y) {
  float    a;
  bitfield z;

  ...

  return (bitfield)1;
}

structs#

C has no objects. The closest thing is a struct, or set of fields.

struct Struct_Name {
  list
  of
  variable
  definitions
  called
  fields
} variable_name(s);

struct Struct_Name another_variable_name;

EXAMPLE

struct {
  char name[128];
  int  mileage;
} gremlin, cayman, cessna180, montauk;

strcpy(cayman.name, "My Favorite Car");
cayman.mileage = 1240;

enums#

enum values are always ints. They are optional and assigned by the compiler if not explicitly assigned.

enum [ Enum_Name ] {
  id1 [ = val ],
  id2 [ = val ],
  ...
} variable;

EXAMPLE

enum {
  SUNDAY    = 0,
  MONDAY    = 1,
  TUESDAY   = 2,
  WEDNESDAY = 3,
  THURSDAY  = 4,
  FRIDAY    = 5,
  SATURDAY  = 6
} days_of_week;

enum {
  SUNDAY,
  MONDAY,
  TUESDAY,
  WEDNESDAY,
  THURSDAY,
  FRIDAY,
  SATURDAY,
} days_of_week;

unions#


struct {
  enum {
    AUTOMOTIVE   = 0,
    AERONAUTICAL = 1,
    MARINE       = 2
  } type;             //  12 B

  char name[128];     // 128 B
  int  mileage;       //   4 B

  struct {
    int cylinders;
    int horsepower;
    int hours_smoh;
  } engine;           //  12 B

  union {
    char vin[17];
    char tail_number[8];
    char hull_id[12];
  } vehicle_id;       //  17 B
} gremlin, cayman, cessna180, montauk;

cayman.type = AUTOMOTIVE;
if (cayman.type == AUTOMOTIVE) {
  printf("this is a car");
}

cayman.engine.cylinders = 6;
cayman.mileage          = 1240;

strcpy(vehicle_id.vin, "123456");
printf("%s\n", vehicle_id.vin);             // 123456
vehicle_id.tail_number = "F-BOZQ";
printf("%s\n", vehicle_id.vin);             // F-BOZQ
printf("Size: %lu\n", sizeof(vehicle_id));  // 17
typedef enum {
  AUTOMOTIVE   = 0;
  AERONAUTICAL = 1;
  MARINE       = 2;
} VEHICLE_TYPE;

typedef struct {
  int cylinders;
  int horsepower;
  int hours_smoh;
} ENGINE_INFO;

typedef union {
  char vin[17];
  char tail_number[8];
  char hull_id[12];
} VEHICLE_IDENT;

typedef struct {
  char          name[128];
  int           mileage;
  VEHICLE_TYPE  type;
  ENGINE_INFO   engine;
  VEHICLE_IDENT vehicle_id;
} VEHICLE;

VEHICLE gremlin, cayman, cessna180, montauk;

VEHICLE *vehicle = &cayman;
strcpy(vehicle->name, "2013 Porsche Cayman S");
vehicle->engine.cylinders = 6;

printf(
  "*** Vehicle Information ***\n"
  "Name         : %s\n"
  "Mileage      : %u\n"
  "Vehicle Type : %s\n"
  "Cylinders    : %u\n"
  "Horsepower   : %u hp\n"
  "SMOH         : %u hours\n"
  "VIN          : %s\n",
  vehicle->name,
  vehicle->mileage,
  (vehicle->type == AUTOMOTIVE) ? "car" :
      (vehicle->type == AERONAUTICAL) ? "airplane" : boat,
  vehicle->engine.cylinders,
  vehicle->engine.horsepower,
  vehicle->engine.hours_smoh,
  (vehicle->type == AUTOMOTIVE) ? vehicle->vehicle_id.vin :
      (vehicle->type == AERONAUTICAL) ? vehicle->vehicle_id.tail_number : vehicle->vehicle_id.hull_id;
);

bit fields#

struct {
  unsigned int width;  // 4 B
  unsigned int height; // 4 B
} status;



Arrays#

int arr[100]; // declaration - the compiler allocates 100 ints worth of storage in the stack initialized with garbage data

// an array does not know its own size, so the following is better
int SIZE = 100;
int arr[SIZE];

// arrays with fewer values than size are initialized with 0
int arr2[10] = { [0 ... 2] = 1 }; // 1 1 1 0 0 0 0 0 0 0
arr2[3]  = 1;                     // 1 1 1 1 0 0 0 0 0 0
arr2[10] = 1;                     // overflow!

Array names can be used as pointers to the first element of the array.

/*    array_and_pointer.c
 */

#include <stdio.h>

int main (void) {
  const int SIZE = 6;
  int a[SIZE] = { [0 ... 5] = 42 };

  for (int i = 0; i < SIZE; i++) {
    printf("a[%d] = %d, addr %p", i, *(a + i), a + i); printf("\n");
  }

  int *p = a;

  for (int i = 0; i < SIZE; i++) {
    printf("p[%d] = %d, addr %p", i, *(p + i), p + i); printf("\n");
  }

  p++;
  for (int i = 0; i < SIZE; i++) { // overflow!
    printf("p[%d] = %d, addr %p", i, *(p + i), p + i); printf("\n");
  }

  return 0;
}

An array is a contiguous block of memory.

Arrays are zero-indexed.

Arrays are declared with notation type name[size] or declared + initialized with notation type name[size] = {value0, value1, ..., value(n - 1)}. If fewer values are provided than what is indicated in the size of the array, the remaining values are initialized with the value 0.

An array of six integers requires \(6 \times 4 = 24\) bytes of memory.

Arrays have no methods and do not know their own length (and there is no bounds checking).

  • sizeof(array) is not reliable and only works in some situations

int scores[100];

// allocates 100 "ints worth" of memory where each array element contains garbage data initially
int n = 100;
int scores[n]; // C99 allows the array size to be an expression
int x[6] = {34, 11, -129, 49, 708, -11};
x[7] = 45; // legal, but will cause a memory fault

Arrays are passed by reference by default#

void min_max (int array[], int len, int *min, int *max) {
  // find the index of the largest value and assign it to max_i
  // find the index of the smallest value and assign it to min_i
  *max = array[max_i];
  *min = array[min_i];
}

int main (void) {
  int x[100];
  int largest, smallest;
  // some code that populates the array x
  min_max(x, 100, &smallest, &largest);
  printf("smallest = %d, largest = %d\n", smallest, largest);
}

Pointers#

#include <stdio.h>

int main (void) {
  int x  = 42; // variable definition
  int *p;      // pointer  declaration
  int *q = 43; // pointer  definition
  int *r;

  p = &x;     // assign address of x to pointer variable p

  printf(" x is %d\n",  x); // 42
  printf("&x is %p\n", &x); // address of x
  printf(" p is %p\n",  p); // address of x

  *p = 43;                  // dereference: *p is just x
  printf(" x is %d\n",  x); // 43

  r = p;                    // pointer assignment: r points to x just like p
  printf(" r is %p\n",  r); // address of x

  return 0;
}

Pass by reference in C via pointers

void min_max (int arr[], int len, int *min, int *max) {
  //                              ^^^^^^^^  ^^^^^^^^ pass by reference

  // min_i = the index of the smallest value of arr
  // max_i = the index of the largest  value of arr
  *min = arr[min_i]
  *max = arr[max_i]
}

int main (void) {
  int x[100];
  int min, max;

  // populate arr

  min_max(x, 100, &min, &max);
  //              ^^^^  ^^^^ pass by reference

  printf("min = %d, max = %d", min, max); printf("\n");
}

Pointers (and dynamically-allocated memory) can be returned from a function. Local addresses cannot be returned.

int *max (int *a, int *b) {
  if (*a > *b) return a;
  else         return b;
  //  ^^   ^^            comparing values of variables pointed to by a and b
  //           ^^^^^^^^  returning the address of the variable that contains max value
}

int main (void) {
  int x = 5;
  int y = 6;
  printf("max of %d and %d is %d", x, y, *max(&x, &y)); printf("\n");
}
int *foo (void) {
  int x = 5;
  return &x; // WRONG!!!
}
/* boxarrow.c */

#include <stdio.h> /* printf */

int main (int argc, char** argv) {
  int x      = 1;
  int arr[3] = {2, 3, 4};
  int* p     = &arr[1];

  printf("&x: %p; x: %d\n", &x, x);
  printf("&arr[0]: %p; arr[0]: %d\n", &arr[0], arr[0]);
  printf("&arr[2]: %p; arr[2]: %d\n", &arr[2], arr[2]);
  printf("&p: %p; p: %p; *p: %d\n", &p, p, *p);

  return 0;
}
the stack grows downward

address             | name   | value (this col represents main's stack frame)
--------------------|--------|------
&x      (0xbfff2dc) | x      | 1
&arr[2] (0xbfff2d8) | arr[2] | 4
&arr[1] (0xbfff2d4) | arr[1] | 3
&arr[0] (0xbfff2d0) | arr[0] | 2
&p      (0xbfff2cc) | p      | &arr[1] (0xbfff2d4)
$ gcc -g boxarrow.c -o boxarrow

$ ./boxarrow
&x: 0x7fce62361c; x: 1
&arr[0]: 0x7fce623610; arr[0]: 2
&arr[2]: 0x7fce623618; arr[2]: 4
&p: 0x7fce623608; p: 0x7fce623614; *p: 3
/* boxarrow2.c */

int main (int argc, char** argv) {
  int x      = 1;
  int arr[3] = {2, 3, 4};
  int* p     = &arr[1];
  int** dp   = &p;

  **dp += 1 /* arr[1] = 4  */
  p    += 1 /* p = &arr[2] */
  **dp += 1 /* arr[2] = 5  */

  return 0;
}
address             | name   | value (this col represents main's stack frame)
--------------------|--------|------
&x      (0xbfff2dc) | x      | 1
&arr[2] (0xbfff2d8) | arr[2] | 4 --[ **dp += 1 ]--> 5
&arr[1] (0xbfff2d4) | arr[1] | 3 --[ **dp += 1 ]--> 4
&arr[0] (0xbfff2d0) | arr[0] | 2
&p      (0xbfff2cc) | p      | &arr[1] (0xbfff2d4) --[ p += 1 ]--> &arr[2] (0xbfff2d8)
&dp     (0xbfff2c8) | dp     | &p      (0xbfff2cc)


/* doublepointer.c */

#include <stdio.h> /* printf */

int main (int argc, char** argv) {
  char hi[6] = {'h', 'e', 'l', 'l', 'o', '\0'};

  char* p;
  char** dp;

  p  = &hi[0];
  dp = &p;

  printf("%c %c\n", *p, **dp);      /* 'h' */
  printf("%p %p %p\n", p, *dp, hi); /* &hi[0] */
  p += 1;                           /* p = &hi[1] */
  printf("%c %c\n", *p, **dp);      /* 'e' */
  printf("%p %p %p\n", p, *dp, hi); /* &hi[1], &hi[1], &hi[0] */
  *dp += 2;                         /* p = &hi[3] */
  printf("%c %c\n", *p, **dp);      /* 'l' */
  printf("%p %p %p\n", p, *dp, hi); /* &hi[3], &hi[3], &hi[0] */

  return 0;
}
Why is the stack not growing down on the Raspberry Pi?

address                         | name  | value (this col represents main's stack frame)
--------------------------------|-------|------
&dp    (0x7ffffff208)           | dp    | &p
&hi[5] (0x7ffffff205)           | hi[5] | '\0'
&hi[4] (0x7ffffff204)           | hi[4] | 'o'
&hi[3] (0x7ffffff203)           | hi[3] | 'l'    <-- *(p + 3)
&hi[2] (0x7ffffff202)           | hi[2] | 'l'
&hi[1] (0x7ffffff201) <-- p + 1 | hi[1] | 'e'    <-- *(p + 1)
&hi[0] (0x7ffffff200)           | hi[0] | 'h'    <-- *p, **dp
&p     (0x7ffffff1f8)           | p     | &hi[0] <-- *dp
$ gcc -g doublepointer.c -o doublepointer

$ ./doublepointer 
h h
0x7fd2eab8e0 0x7fd2eab8e0 0x7fd2eab8e0
e e
0x7fd2eab8e1 0x7fd2eab8e1 0x7fd2eab8e0
l l
0x7fd2eab8e3 0x7fd2eab8e3 0x7fd2eab8e0

/* pointer_arithmetic.c */

#include <stdio.h> /* printf */

int main (int argc, char** argv) {
  int arr[3]     = {1, 2, 3};
  int* int_ptr   = &arr[0];
  char* char_ptr = (char *)int_ptr; /* &arr[0] */

  printf(" int_ptr: %p;  *int_ptr: %d\n", int_ptr, *int_ptr);   /* &arr[0], 1                       */
  int_ptr += 1;                                                 /* int_ptr = &arr[1]                */
  printf(" int_ptr: %p;  *int_ptr: %d\n", int_ptr, *int_ptr);   /* &arr[1], 2                       */
  int_ptr += 2;                                                 /* int_ptr = &arr[3] does not exist */
  printf(" int_ptr: %p;  *int_ptr: %d\n", int_ptr, *int_ptr);   /* ???                              */

  printf("char_ptr: %p; *char_ptr: %d\n", char_ptr, *char_ptr); /* &arr[0], 1                       */
  char_ptr += 1;
  printf("char_ptr: %p; *char_ptr: %d\n", char_ptr, *char_ptr); /* &arr[0], 0                       */
  char_ptr += 2;
  printf("char_ptr: %p; *char_ptr: %d\n", char_ptr, *char_ptr); /* &arr[0], 0                       */

  return 0;
}
 int_ptr: 0x7ff0810150;  *int_ptr: 1
 int_ptr: 0x7ff0810154;  *int_ptr: 2
 int_ptr: 0x7ff081015c;  *int_ptr: 127
char_ptr: 0x7ff0810150; *char_ptr: 1
char_ptr: 0x7ff0810151; *char_ptr: 0
char_ptr: 0x7ff0810153; *char_ptr: 0

Multidimensional arrays#

C arrays are stored in row-major order.

int x[][4]  =
int x[2][4] = {{1, 2, 3, 4},
               {5, 6, 7, 8}}; // row-major order -> 1 2 3 4 5 6 7 8
//    ^     2 row (width)
//       ^  4 col (height)

int x[][2][4]  =
int x[3][2][4] = {{{ 1,  2,  3,  4}, { 5,  6,  7,  8}},
                  {{ 9, 10, 11, 12}, {13, 14, 15, 16}},
                  {{17, 18, 19, 20}, {21, 22, 23, 24}}};
//    ^        3 row (width)
//       ^     2 col (height)
//          ^  4     (depth, first elem in front as col rep)
//
// row-major order -> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
//
// if you were just looking at the front of this 3d array, it would look like this
//     x[3][2] = {{{ 1}, { 5}},
//                {{ 9}, {13}},
//                {{17}, {21}}}
//
// there are 3 "2d" arrays stacked on top of one another, each of which has 2 row and 4 col

Arrays of strings#

char planets[][8] = {
  "Mercury",  // M e r c  u  r  y \0
  "Venus",    // V e n u  s \0 \0 \0
  "Earth",    // E a r t  h \0 \0 \0
  "Mars",     // M a r s \0 \0 \0 \0
  "Jupiter",  // J u p i  t  e  r \0
  "Saturn",   // S a t u  r  n \0 \0
  "Uranus",   // U r a n  u  s \0 \0
  "Neptune",  // N e p t  u  n  e \0
};

// array of pointers where each pointer points to a string
char *planets[] = {
  "Mercury",  // -> M e r c  u  r  y \0
  "Venus",    // -> V e n u  s \0
  "Earth",    // -> E a r t  h \0
  "Mars",     // -> M a r s \0
  "Jupiter",  // -> J u p i  t  e  r \0
  "Saturn",   // -> S a t u  r  n \0
  "Uranus",   // -> U r a n  u  s \0
  "Neptune",  // -> N e p t  u  n  e \0
};

for (int i = 0; i < 8; ++i) {
  if (planets[i][0] == 'M')
    printf("%s\n", planets[i]);
}

for (char **p = planets; p < planets + 8; ++p) {
  if (**p == 'M')
    printf("%s\n", *p)
}
#include <stdio.h>

int main (void) {
  int   i =  5;
  int *ip = &i;

  printf("%d\n", i);     // 5
  printf("%p\n", ip);    // memory address
  *ip = 42;              //     re-assign the value of variable i via dereference
  printf("%d\n", i);     // 42
  printf("%d\n", *ip);   // 42
}
gcc -g -Wall pointer.c -o myprog

Pointers and pass by value and pass by reference#

Example#

C passes by value: values passed into a function are copied and local modifications to those copies are not reflected in the original values.

However, pointers let you pass by reference.

void add_pbv (int c) {
  c += 10;
  printf("pbv c: %d\n", c);   // 11
}

void add_pbr (int *c) {       //     this parameter is a pointer
  *c += 10;                   //     which is dereferenced
  printf("pbr *c: %d\n", *c); // 11
}

int main (void) {
  int x = 1;
  printf("x: %d\n", x);   // 1
  add_pbv(x);             //     pass a copy of the value of the variable, the value of the variable remains unchanged
  printf("x: %d\n", x);   // 1, not 11
  add_pbr(&x);            //     pass the memory location of the variable
  printf("x: %d\n", x);   // 11
  return 0;
}

Example#

PASS BY VALUE

#include <stdio.h>

void swap (int a, int b) {
  int tmp =   a;
        a =   b;
        b = tmp;
}

int main (void) {
  int a = 42, b = -7;
  swap(a, b);
  printf("a: %d, b: %d\n", a, b);
  return 0;
}
./pointer_pass_by_value
a: 42, b: -7

PASS BY REFERENCE VIA POINTERS

#include <stdio.h>

void swap (int *a, int *b) {
  int tmp =  *a;
       *a =  *b;
       *b = tmp;
}

int main (void) {
  int a = 42, b = -7;
  swap(&a, &b);
  printf("a: %d, b: %d\n", a, b);
  return 0;
}
./pointer_pass_by_reference
a: -7, b: 42

Back to pointers#

We use notation type *name; to declare (or type *name = &address; to declare and initialize) a pointer variable.

We subsequently use dereference notation *name to refer to the value of the variable pointed to by the pointer variable.

We can use notation name to refer to the memory address stored in the pointer variable.

EXAMPLE

int  x = 42; // regular variable declaration + initialization   memory address 0x1
int *p;      // pointer variable declaration                    memory address 0x2
     p = &x; // pointer variable               initialization

printf("%d",  x);   //  42
printf("%p", &x);   // 0x1
printf("%d", *p);   //  42
printf("%p",  p);   // 0x1

Pointers as return values#

We can return a pointer that was passed (or dynamically allocated memory???).

int *max (int *a, int *b) {
  if (*a > *b) return a;
               return b;
}

int main (void) {
  int x = 5, y = 6;
  printf("max of %d and %d is %d\n", x, y, *max(&x, &y));
}

We cannot return the address of a local variable.

int *foo (void) {
  int x = 5;
  return &x;
}

int main (void) {
  printf("%d\n", *foo());
}

Adding integer i to pointer p yields a pointer to the element i places after the element that p points to.

int a[6] = { [0 ... 5] = 42 };   // `42 42 42 42 42 42`
int *p = &a[2];                  //        |
int *q = p + 3;                  //                 |
++p;                             //           |

The name of an array can be used as a pointer.

a is the same as &a[0]

a[0] is the same as *(&a[0] + 0) or *(a + 0) or *a

a[1] is the same as *(&a[0] + 1) or *(a + 1)

a[2] == 2[a]

  • a[2] is the same as &a[0] + 2

  • 2[a] is the same as 2 + &a[0]

int a[5] = {0, 1, 2, 3, 4};
printf("%d %d\n", a[2], 2[a]);

Pointer Arithmetic#

The pointer is incremented based on the size of a single array element.

  • Pointer arithmetic is only applicable to pointers that point to arrays. Otherwise, it is undefined.

  • Pointers can only be subtracted if they point to the same array. Otherwise, it is undefined.

Example#

# include <stdio.h>

int main (void) {
  int foo[10];
  int *p = foo;   // point to the 0-th element
  printf("%p\n", p);
  ++p;            // point to the next element
  printf("%p\n", p);

  char bar[10];
  char *q = bar;  // point to the 0-th element
  printf("%p\n", q);
  ++q;            // point to the next element
  printf("%p\n", q);
}
gcc -Wall foo.c -o foo && ./foo
0x16fdc2620
0x16fdc2624
0x16fdc2616
0x16fdc2617

Example#

#include <stdio.h>

int main (void) {
  int foo[10] = { [0 ... 9] = 42 };
  int sum1 = 0, sum2 = 0, sum3 = 0, sum4 = 0;

  for (int i = 0; i < 10; ++i)
    sum1 += foo[i];

  for (int *p = &foo[0]; p < &foo[10]; ++p)
    sum2 += *p;

  for (int *p = foo; p < foo + 10; ++p)
    sum3 += *p;

  int *p = foo;
  for (int i = 0; i < 10; ++i)
    sum4 += p[i]   // &foo[0] + i

  printf("%d %d %d %d\n", sum1, sum2, sum3, sum4);
}
gcc -Wall sum.c -o sum && ./sum
420 420 420 420



Strings#

There are no strings proper, only character arrays which are terminated by the null character \0.

C “strings” are not objects and have no methods, but string.h has helpful utilities.

char *x = "hello\n";   // x --> h e l l o \n \0
#include <string.h>

The C string API is error-prone so study secure C coding guidelines.

https://wiki.sei.cmu.edu/confluence/display/c/SEI+CERT+C+Coding+Standard

ASCII

int a = 65;
printf("%d is ASCII \'%c\'\n", a, (char)a);

String literals are stored in read-only memory. A string literal-initialized array’s contents are copied from the read-only memory and so it writable. A string-literal initialized pointer points directly to the read-only memory and so is read-only.

// define a string   h e l l o \n \0
char *str1   = "hello\n"; printf("%s", str1); // read-only
char str2[]  = "hello\n"; printf("%s", str2); // initialized by the compiler
char str[7]  = "hello\n"; printf("%s", str);  // initialized by the compiler?

str1[4] = 'j' // segmentation fault!
str2[4] = 'j' // this is okay

Determining the size of a string

sizeof returns the size of the declaration

strlen returns the number of characters less the null terminator

char *str1    = "text for example";
char str2[17] = "text for example";

printf("str1 has size %lu\n", sizoef(str1)); //  8, the size in bytes of a pointer on a 64-bit machine
printf("str2 has size %lu\n", sizoef(str2)); // 17
printf("str1 has leng %lu\n", strlen(str1)); // 16
printf("str2 has leng %lu\n", strlen(str2)); // 16
char *str1   = "abc";                 // read-only
char str2[]  = "abc";                 // compiler determines size 4
char str3[4] = "abc";
char str4[4] = "abcd";                // incorrect! should be size 5, including the null terminator
char str5[]  = {'a', 'b', 'c', '\0'}; // compiler determines size 4
char str6[3] = {'a', 'b', 'c'};       // incorrect! should be size 4, including the null terminator
char str7[9] = {'a', 'b', 'c'};       // a b c \0 \0 \0 \0 \0 \0 \0

printf("str1 = %s\n", str1);
printf("str2 = %s\n", str2);
printf("str3 = %s\n", str3);
printf("str4 = %s\n", str4); // continues to read memory past 3 characters until a null terminator is encountered
printf("str5 = %s\n", str5);
printf("str6 = %s\n", str6); // continues to read memory past 3 characters until a null terminator is encountered
printf("str7 = %s\n", str7);

String copy#

strcpy(dst, src); copies src to dst up to and including the null terminator

char *str1 = "abcde"; // 6 characters, including the null terminator
char str2[6];
char str3[3];
int i = 0xff; //            0000 0000 0000 0000 0000 0000 1111 1111
              // d -> 68 -> 0100 0100
              // e -> 69 ->           0100 0101
              //                                0000 0000
              //     101                                  0110 0101   ???

printf("str1 = %s\n", str1); // abcde
strcpy(str2, str1);
printf("str2 = %s\n", str2); // abcde
printf(   "i = %d\n",    i); // 255
strcpy(str3, str1);          // overflow! d e \0 overwrote other data
printf("str3 = %s\n", str3); // abcde
printf(   "i = %d\n",    i); // 101

strncpy(dst, src, n)

char *str1 = "abcde";
char str2[6];
char str3[3];
int i = 0xff;

printf("str1 = %s\n", str1); // abcde
strcpy(str2, str1);
printf("str2 = %s\n", str2); // abcde
printf(   "i = %d\n",    i); // 255
strncpy(str3, str1, 2);
str3[2] = 0x0;
printf("str3 = %s\n", str3); // ab
printf(   "i = %d\n",    i); // 255

String concatenation#

strcat(dst, src) appends src to dst, strncat(dst, src, n) appends at most n bytes of src to dst

char str1[20] = "abcde";
char *str2    = "efghi";
char str3[20] = "abcde";

strcat(str1, str2);
printf("str1 is [%s]\n", str1); // abcdeefghi
strncat(str3, str2, 20);
printf("str3 is [%s]\n", str3); // abcdeefghi
char* strncat (char* dst, const char* src, size_t n)
/*             ^^^^^^^^^                             a pointer to the destination buffer, which should be large enough to contain
 *                                                       len(dst_string) + len(src_string) + '\0'
 *                        ^^^^^^^^^^^^^^^            a pointer to the string to concatenate
 *                                         ^^^^^^^^  the maximum number of characters to concatenate
 *
 * strncat
 * -------
 * appends up to `n` characters of the string pointed to by `src` to the string pointed to by `dst`
 *
 * returns
 * -------
 * a pointer to the destination buffer
 */

String comparison#

Strings are compared to see whether they match, or whether one is lexicographically larger or smaller than the other

strcmp(s1, s2) compares the strings, strncmp(s1, s2, n) compares the first n bytes of the strings

return neg_int if s1 < s2
return       0 if s1 = s2
return pos_int if s1 > s2

String parse#

sscanf(str, "format", &args) (arguments must be passed by reference using &)

char *str = "1 3.14 a bob";
char c;
char s[20];
float f;
int ret, i;

ret = sscanf(str, "", &i, &f, &c, s);
printf("Scanned %d fields int [%d], float [%f], char [%c], string [%s]\n", ret, i, f, c, s);
#include <stdio.h>
#include <string.h>
#include <stdint.h>

// print an array of chars
void print_char_array (const char *arr, size_t size) {
  //                   ^^^^^^^^^^^^^^^   pointer to an array of constant chars
  for (int i = 0; i < size; ++i)
    printf("%c ", arr[i]);
  printf("\n");
}

// copy elems from array `src` to array `dst` via function `memcpy`
void copy_whole_array (char *src, char *dst, int copy_length) {
  memcpy(dst, src, copy_length);
}

// copy the string from string `src_str` to string `dst_str` via function `memcpy` and include the null terminator
void copy_whole_string (char *src_str, char *dst_str, int copy_length) {
  memcpy(dst_str, src_str, copy_length);
}

// copy length elems from idx 0 in array `src_partial` to array `dst_partial` via function `memcpy`
void copy_partial_array (char *src_partial, char *dst_partial, int copy_length) {
  memcpy(dst_partial, src_partial, copy_length);
}

// copy length elems from idx 3 in array `src_partial` to array `dst_partial` via function `memcpy`
void copy_partial_array_1 (char *src_partial, char *dst_partial, int copy_length) {
  memcpy(dst_partial, src_partial + 3, copy_length);
}

// copy elems from array `src1` to array `dst` following the pattern of copying 2 elems and skipping 1 elem and repeating until the required size is achieved via function `memcpy`
void copy_multiple_3_indices (char *src1, char *dst, int src_length, int required_size) {
  int copied_elements = 0;
  int start_idx = 0;
  int dst_idx = 0;

  while (copied_elements < required_size) {
    for (int i = 0; i < src_length; ) {
      int elements_to_copy = 2;
      memcpy(dst + dst_idx, src1 + start_idx, 2);
      dst_idx += elements_to_copy;
      start_idx += 3;
      copied_elements += elements_to_copy;
      i += 3;
    }
  }
}

int main () {
  // 1 - array of chars
  char src[] = {'1', '2', '3', '4', '5'};
  char dst[5];
  copy_whole_array(src, dst, 5);
  printf("EXERCISE 1 - array of chars\n");
  printf("src: ");
  print_char_array(src, 5);
  printf("dst: ");
  print_char_array(dst, 5);
  printf("\n");

  // 2 - string
  char src_str[] = "Hello, world!";
  char dst_str[20];
  copy_whole_string(src_str, dst_str, 14);
  printf("EXERCISE 2 - string\n");
  printf("src: %s\n", src_str);
  printf("dst: %s\n", dst_str);
  printf("\n");

  // 3 - portion of an array
  char src_partial[] = {'a', 'b', 'c', 'd', 'e', 'i', 'o', 'u'};
  char dst_partial[4];
  copy_partial_array(src_partial, dst_partial, 3);
  printf("EXERCISE 3 - portion of an array\n");
  printf("src: ");
  print_char_array(src_partial, 8);
  printf("dst: ");
  print_char_array(dst_partial, 3);
  printf("\n");

  // 4 - portion of an array
  char dst_partial2[4];
  copy_partial_array_1(src_partial, dst_partial2, 4);
  printf("EXERCISE 4 - portion of an array\n");
  printf("src: ");
  print_char_array(src_partial, 8);
  printf("dst: ");
  print_char_array(dst_partial2, 4);
  printf("\n");

  // 5 - portion of an array
  char src1[15] = {'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o'};
  char dst[6];
  int required_size = 6;
  copy_multiple_3_indices(src1, dst, 15, required_size);
  printf("EXERCISE 5 - portion of an array\n");
  printf("src: ");
  print_char_array(src1, 15);
  printf("dst: ");
  print_char_array(dst, 6);
  printf("\n");

  return 0;
}



Bit Operations#

A logic left shift or logic right shift shifts bits and fills vacant positions with zeros.

0xFFFF1234
0xFFF12340    4-bit logic left  shift
0x0FFFF123    4-bit logic right shift

An artihmetic shift preserves the sign bit. C99 requires that a signed integer be represented in two’s complement or one’s complement and so the MSB (most significant bit) of the integer is the sign bit.

0xFFFF1234
0xFFF12340    4-bit arithmetic left  shift (fill in with 0s)
0xFFFFF123    4-bit arithmetic right shift (fill in with 1s to preserve the sign bit)

0x12345678
0x23456780    4-bit arithmetic left  shift (fill in with 0s)
0x01234567    4-bit arithmetic right shift (fill in with 0s to preserve the sign bit)

Bitwise left shifts are zero-padded whether they are arithmetic or logical, signed or unsigned.

/* convert a byte to a binary string
 *
 * for each bit, if the bit is set then assign '1' to the appropriate position in the string; otherwise, '0'
 */
void byte_to_binary (uint8_t byte, char *buffer) {
  for (int i = 7; i >= 0; i--)
    buffer[7 - i] = (byte & (1 << i)) ? '1' : '0';
  buffer[8] = '\0';
}
/* extract n adjacent bits
 */

uint32_t mask (uint32_t b, uint32_t n) {
/*             ^^^^^^^^^^                start     bit
 *                         ^^^^^^^^^^    number of bits
 *
 * construct a bit mask with number of bits equal to the return type
 * mask(4, 4) creates the following 32-bit bit mask: 0000 0000 0000 0000 0000 0000 1111 0000
 */

  return ((1 << n) - 1) << b;
}
print(int('11111111111111111', 2))
print(int('01111100000000000', 2))
131071
63488

Bit \(\in \{ 0, 1\}\)

Nibble = 4 bits

Byte = 8 bits

Word = 4 bytes = 32 bits


Number Systems#

Base b (base b to decimal)

\(\texttt{decimal\_value} = \sum_{i = 0}^{k} b^i \times p_i \quad \text{st} \quad p_i \in \{ 0, \dotsc, b - 1 \}\)

\( \underset{p_2}{}\underset{p_1}{}\underset{p_0}{} = \underbrace{b^2 \times p_2}_{} + \underbrace{b^1 \times p_1}_{} + \underbrace{b^0 \times p_0}_{} = \)

Base 10

\(\texttt{decimal\_value} = \sum_{i = 0}^{k} 10^i \times p_i \quad \text{st} \quad p_i \in \{ 0, \dotsc, 9 \}\)

\( \underset{p_2}{1}\underset{p_1}{2}\underset{p_0}{3} = \underbrace{10^2 \times 1}_{100} + \underbrace{10^1 \times 2}_{20} + \underbrace{10^0 \times 3}_{3} = 123 \)

Base 2 (binary to decimal)

\(\texttt{decimal\_value} = \sum_{i = 0}^{k} 2^i \times p_i \quad \text{st} \quad p_i \in \{ 0, 1 \}\)

\( \underset{p_3}{1}\underset{p_2}{0}\underset{p_1}{1}\underset{p_0}{1} = \underbrace{2^3 \times 1}_{8} + \underbrace{2^2 \times 0}_{0} + \underbrace{2^1 \times 1}_{2} + \underbrace{2^0 \times 1}_{1} = 11 \)

Base 16 (hexadecimal to decimal)

\(\texttt{decimal\_value} = \sum_{i = 0}^{k} 16^i \times p_i \quad \text{st} \quad p_i \in \{ 0, \dotsc, 9, \underset{10}{\texttt{a}}, \underset{11}{\texttt{b}}, \underset{12}{\texttt{c}}, \underset{13}{\texttt{d}}, \underset{14}{\texttt{e}}, \underset{15}{\texttt{f}} \}\)

\( \underset{p_2}{\texttt{a}}\underset{p_1}{\texttt{f}}\underset{p_0}{\texttt{c}} = \underbrace{16^2 \times \texttt{a}}_{2560} + \underbrace{16^1 \times \texttt{f}}_{240} + \underbrace{16^0 \times \texttt{c}}_{12} = 2812 \)

Decimal to base b

p = [] // contains hexadecimal digit values from least-sig to most-sig
i = 0
while dec > 0
    p[i] = dec % b // get the next hexadecimal digit value, from least-sig to most-sig
    dec /= b
    i++

// print the number by iterating through the digit array backwards since most-sig digits come first

Decimal to binary

p = [] // contains hexadecimal digit values from least-sig to most-sig
i = 0
while dec > 0
    p[i] = dec % 2 // get the next hexadecimal digit value, from least-sig to most-sig
    dec /= 2
    i++

// print the number by iterating through the digit array backwards since most-sig digits come first

Decimal to hexadecimal

p = [] // contains hexadecimal digit values from least-sig to most-sig
i = 0
while dec > 0
    p[i] = dec % 16 // get the next hexadecimal digit value, from least-sig to most-sig
    dec /= 16
    i++

// print the number by iterating through the digit array backwards since most-sig digits come first

Binary and hexadecimal

\( \texttt{deadbeef} = \underbrace{1101}_{\texttt{d}} \, \underbrace{1110}_{\texttt{e}} \, \underbrace{1010}_{\texttt{a}} \, \underbrace{1101}_{\texttt{d}} \, \underbrace{1011}_{\texttt{b}} \, \underbrace{1110}_{\texttt{e}} \, \underbrace{1110}_{\texttt{e}} \, \underbrace{1111}_{\texttt{f}} \)


// dec_to_bin.c

#include <stdio.h>

void dec_to_bin (int dec) {
  int bin[32];
  
  int idx = 0;
  while (dec != 0) {      // `dec > 0` could also work here
    bin[idx++] = dec % 2; // from rightmost (least-sig) binary digit to leftmost (most-sig) binary digit
    dec /= 2;             // when `dec` equals 1, 1/2 evaluates to 0 and the loop exits
  }

  if (idx == 0) printf("0");           // when the while loop never iterated because the condition was false on its first pass (meaning `dec == 0`)
  for (int i = idx - 1; i >= 0; i--) { // most-sig binary digits were computed last, and are printed first here
    printf("%d", bin[i]);
  }
  printf("\n");
}

int main () {
  int dec;

  printf("Enter a decimal integer: ");
  scanf("%d", &dec);

  dec_to_bin(dec);

  return 0;
}

Endianness#

How are bytes within a multi-byte word ordered in memory?

Big endianness means the least-significant byte has the highest memory address.

Little endianness means that the least significant byte has the lowest memory address.

EXAMPLE

int x = 305419896;

printf("%p\n", &x); // 0x0

\( 305,419,896 = 12345678_{16}\)

Big endian representation in memory: memory addresses are read from the lower address \(\texttt{0x0}\) to higher ones

\( \overset{\texttt{0x0}}{12} \quad \overset{\texttt{0x1}}{34} \quad \overset{\texttt{0x2}}{56} \quad \overset{\texttt{0x3}}{78} \)

Little endian representation in memory: memory addresses are read from the lower address \(\texttt{0x0}\) to higher ones

\( \overset{\texttt{0x0}}{78} \quad \overset{\texttt{0x1}}{56} \quad \overset{\texttt{0x2}}{34} \quad \overset{\texttt{0x3}}{12} \)

#include <stdio.h>
#include <stdint.h> // for type `uint8_t`

void show_bytes (uint8_t *start, int len) {
  for (int i = 0; i < len; ++i)
    printf("%p\t0x%.2x\n", start + i, start[i]);
  printf("\n");
}

int main (void) {
  int a = 305419896;
  printf("a lives at address %p\n\n", &a);
  show_bytes((uint8_t *)&a, sizeof(int));
}

Bitwise Operations#

&   // and
|   // or
~   // not
^   // exclusive or
~0x00

\( \begin{aligned} 00_{16} &= 0000 \quad 0000 \\ \sim 00_{16} &= 1111 \quad 1111 = \text{FF} \\ \end {aligned} \)

~0x41

\( \begin{aligned} 41_{16} &= 0100 \quad 0001 \\ \sim 41_{16} &= 1011 \quad 1110 = \text{BE} \\ \end {aligned} \)

0x69 & 0x55

\( \begin{aligned} 69_{16} &= 0110 \quad 1001 \\ 55_{16} &= 0101 \quad 0101 \\ 69_{16} \,\&\, 55_{16} &= 0100 \quad 0001 = 41_{16} \\ \end {aligned} \)

0x69 | 0x55

\( \begin{aligned} 69_{16} &= 0110 \quad 1001 \\ 55_{16} &= 0101 \quad 0101 \\ 69_{16} \,|\, 55_{16} &= 0111 \quad 1101 = \text{7D} \\ \end {aligned} \)

Bit shift#

uint8_t x = 172    // 10101100
x = x << 3         //    01100 000
printf("%d\n", x); // 96

Representing mathematical sets#

A width-\(w\) bitstring represents subsets of the set \(A = \{ 0, \dotsc, w - 1 \}\); namely, \(a_j = 1 \implies j \in A\).

For example, the bitstring

\( \underset{7}{0} \underset{6}{1} \underset{5}{0} \underset{4}{1} \underset{3}{0} \underset{2}{1} \underset{1}{0} \underset{0}{1} \)

represents the set

\(\{ 6, 4, 2, 0 \}\)

and the bitstring

\( \underset{7}{0} \underset{6}{1} \underset{5}{1} \underset{4}{0} \underset{3}{1} \underset{2}{0} \underset{1}{0} \underset{0}{1} \)

represents the set

\(\{ 6, 5, 3, 0 \}\)

Then the bitwise operators can be interpreted as the set operations.

&   // intersection
|   // union
~   // complement
^   // symmetric difference

\( \begin{aligned} \underset{7}{0} \underset{6}{1} \underset{5}{0} \underset{4}{1} \underset{3}{0} \underset{2}{1} \underset{1}{0} \underset{0}{1} \,\&\, \underset{7}{0} \underset{6}{1} \underset{5}{1} \underset{4}{0} \underset{3}{1} \underset{2}{0} \underset{1}{0} \underset{0}{1} &= \underset{7}{0} \underset{6}{1} \underset{5}{0} \underset{4}{0} \underset{3}{0} \underset{2}{0} \underset{1}{0} \underset{0}{1} \end {aligned} \)

\(\{ 6, 4, 2, 0 \} \cap \{ 6, 5, 3, 0 \} = \{ 6, 0 \}\)

\( \begin{aligned} \underset{7}{0} \underset{6}{1} \underset{5}{0} \underset{4}{1} \underset{3}{0} \underset{2}{1} \underset{1}{0} \underset{0}{1} \,|\, \underset{7}{0} \underset{6}{1} \underset{5}{1} \underset{4}{0} \underset{3}{1} \underset{2}{0} \underset{1}{0} \underset{0}{1} &= \underset{7}{0} \underset{6}{1} \underset{5}{1} \underset{4}{1} \underset{3}{1} \underset{2}{1} \underset{1}{0} \underset{0}{1} \end {aligned} \)

\(\{ 6, 4, 2, 0 \} \cup \{ 6, 5, 3, 0 \} = \{ 6, 5, 4, 3, 2, 0 \}\)

\( \begin{aligned} \sim \underset{7}{0} \underset{6}{1} \underset{5}{0} \underset{4}{1} \underset{3}{0} \underset{2}{1} \underset{1}{0} \underset{0}{1} &= \underset{7}{1} \underset{6}{0} \underset{5}{1} \underset{4}{0} \underset{3}{1} \underset{2}{0} \underset{1}{1} \underset{0}{0} \end {aligned} \)

\(\{ 6, 4, 2, 0 \}^\text{C} = \{ 7, 5, 3, 1 \}\)

\( \begin{aligned} \underset{7}{0} \underset{6}{1} \underset{5}{0} \underset{4}{1} \underset{3}{0} \underset{2}{1} \underset{1}{0} \underset{0}{1} \,\wedge\, \underset{7}{0} \underset{6}{1} \underset{5}{1} \underset{4}{0} \underset{3}{1} \underset{2}{0} \underset{1}{0} \underset{0}{1} &= \underset{7}{0} \underset{6}{0} \underset{5}{1} \underset{4}{1} \underset{3}{1} \underset{2}{1} \underset{1}{0} \underset{0}{0} \end {aligned} \)

\(\{ 6, 4, 2, 0 \} \Delta \{ 6, 5, 3, 0 \} = \{ 5, 4, 3, 2 \}\)


for (uint8_t n = 10; n >= 0; --n) {
  printf("do I ever terminate?\n");
}

Since a uint8_t has a value in the range 0-255 it will never become negative but rather when \(n\) reaches 0 it will be decremented to 255.

// an int contains a value ranging from -2,147,483,648 to 2,147,483,647
// the value of 0xffffffff is the equal to the magnitude of this range
// so the following wraps around to the value -1
int x = 0xffffffff;
printf("%d\n", x); // -1

\( \underbrace{16^7 \times \texttt{f}}_{4,026,531,840} + \underbrace{16^6 \times \texttt{f}}_{251,658,240} + \underbrace{16^5 \times \texttt{f}}_{15,728,640} + \underbrace{16^4 \times \texttt{f}}_{983,040} + \underbrace{16^3 \times \texttt{f}}_{61,440} + \underbrace{16^2 \times \texttt{f}}_{3,840} + \underbrace{16^1 \times \texttt{f}}_{240} + \underbrace{16^0 \times \texttt{f}}_{15} = 4,294,967,295 = 2 \times 2,147,483,647 + 1 \)


Say you want to place multiple bytes/values into a four-byte word (32-bit int).

  • value \(a\) into the least-significant byte (bits 0-7)

  • value \(b\) into the second byte (bits 8-15)

  • value \(c\) into the third byte (bits 16-23)

  • value \(d\) into the fourth byte (bits 24-31)

uint32_t pack_bytes (uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
  uint32_t ret_val = 0x0, tempa, tempb, tempc, tempd;

  tempa =  a & 0xff;        // make sure to get only the least-significant 8 bits
  tempb = (b & 0xff) <<  8; // shift the value to the second byte
  tempc = (c & 0xff) << 16; // shift the value to the third  byte
  tempd = (d & 0xff) << 24; // shift the value to the fourth, most-significant byte
  rev_val = tempa | tempb | tempc | tempd;

  printf("a: 0x%08x\n", tempa);
  printf("b: 0x%08x\n", tempb);
  printf("c: 0x%08x\n", tempc);
  printf("d: 0x%08x\n", tempd);
}

printf("Packed bytes: 0x%08x\n", pack_bytes(0x111, 0x222, 0x333, 0x444));



Storage Classes#

Global

  • accessible from anywhere within the same file

  • accessible in other .c/.h files via keyword extern

Static

  • static global are variables that are global to the local file only

  • static local are variables that are local to the scope of a function

    • unlike automatic variables, preserves changes across invocations

  • keyword static is used to identity variables as local only

Global and static variables are initialized before program execution begins with supplied value or are given a default value, often 0, and changes are preserved until the end of program execution

  • pointer type gets null pointer

  • arithmetic type gets positive/unsigned zero

  • aggregate type, every member is initialized recursively according to these rules

  • union type, the first named member is initialized recursively according to these rules

Auto

  • are automatically allocated/deallocated variables (local function variables declared on the stack)

  • given indeterminate default values, which means that the compiler does what it wants (most compilers use whatever value is already in memory)

EXAMPLE

// main.c

#include <stdio.h>

int x = 42;

int triple (void);

int main (void) {
  printf("%d\n", x);
  printf("%d\n", triple());
}
// triple.c

extern int x;

int triple (void) {
  return x * 3;
}
gcc -g -Wall main.c triple.c -o myprog && ./myprog
42
126

EXAMPLE

// main.c

#include <stdio.h>

static int x = 42;   // <--

int triple (void);

int main (void) {
  printf("%d\n", x);
  printf("%d\n", triple());
}
// triple.c

extern int x;

int triple (void) {
  return x * 3;
}
gcc -g -Wall main.c triple.c -o myprog && ./myprog

COMPILATION FAILS

EXAMPLE - static local vs automatic

// foo.c
#include <stdio.h>

void foo (void) {
         int x = 0;
  static int y = 0;
  x += 1;
  y += 1;
  printf("x=%d y=%d\n", x, y);
}

int main (void) {
  foo();
  foo();
}
gcc -g -Wall foo.c -o foo && ./foo
x=1 y=1
x=1 y=2



I/O#

Console#

  • scanf()

  • printf()

library libc has portable routines for reading/writing

Unix standard streams provided to all programs.

printf("this is printed to standard output\n");
fprintf(stdout, "this is printed to standard output as well\n");
fprintf(stderr, "this is printed to standard error\n");

File#

C library has portable routines for reading/writing

  • fopen()

  • fread()

  • fwrite()

  • fclose()

  • does buffering by default

  • is blocking by default

OS provides system calls (more control over buffering and blocking)

  • low-level binary read/write

    • read()

    • write()

    • open()

    • close()




Signals#

C has no try-catch mechanism to handle exceptions.

Errors are returned from functions as integer error codes.

“If you do something bad you’ll end up spraying bytes around memory. Hopefully, this causes a segmentation fault and crash.”

There are mechanisms for exception handling (or exception-al control flow) at different levels of a computer system.

  • low-level

    • exceptions: aborts, faults, interrupts, traps (implemented using a combination of hardware and OS software)

  • high-level

    • process context switch (implemented by OS software, and hardware timer)

    • signals (implemented by OS software)

    • non-local jumps: longjmp(), setjmp() (implemented by the C runtime library)

Signal

A signal is a special message sent through the OS to tell a process (or thread) about some command or event. The process execution (or processor’s control flow) stops, and signal handler code starts. After the signal handler control flow finishes, the processor control flow can resume where it left off.

SIGHUP       1   // handup                   (POSIX)
SIGINT       2   // interrupt                (ANSI)
SIGQUIT      3   // quit                     (POSIX)
SIGILL       4   // illegal instruction      (ANSI)     error
SIGTRAP      5   // trace trap               (POSIX)    error
SIGABRT      6   // abort                    (ANSI)
SIGIOT       6   // IOT trap                 (BSD 4.2)  error
SIGBUS       7   // bus error                (BSD 4.2)  error
SIGFPE       8   // floating-point exception (ANSI)     error
SIGKILL      9   // unblockable kill         (POSIX)    control process execution
SIGUSR1     10   // user-defined signal 1    (POSIX)    sent by another application
SIGSEGV     11   // segmentation violation   (ANSI)     error
SIGUSR2     12   // user-defined signal 2    (POSIX)    sent by another application
SIGTERM     15   // termination              (ANSI)
SIGSTKFLT   16   // stack fault
SIGCHLD     17   // child status has changed (POSIX)
SIGCONT     18   // continue                 (POSIX)    control process execution
SIGSTOP     19   // unblockable stop         (POSIX)    control process execution
SIGSYS      31   // bad system call
ps -<signal> <PID> # if no signal is provided, default SIGTERM
ps -U <username>

kill

# send a signal to all instances of a particular program (default SIGTERM)
killall -<signal> <program_name>

SIGTERM

  • interrupts the program and “asks” it to shut down, which it should

  • sometimes, this does not work (e.g., when the program is in a locked state)

  • it is often desirable to add a signal handler to handle the SIGTERM so that it can gracefully shutdown the process, cleanup memory, close files, etc.

SIGKILL

  • kills the process

  • can lead to inconsistent state because there is no opportunity to gracefully shutdown the process

Graceful shutdown refers to the proper and complete sync with secondary storage, disposal of resources, and normal termination.

int raise (int sig)
/*
 *
 * allow a process to send signals to itself
 *     suspend itself - SIGSTOP
 *     kill itself    - SIGKILL
 *     reset its configuration - SIGHUP
 *     user-defined            - SIGUSR1
 */

void suicide_signal (void) {
  raise(SIGKILL);
  return; // this will never be reached
}

Creating a signal handler

Create a signal handler by creating a function of the form

void signal_hander_function_name (int signal_number)

and passing a function pointer of the form

sighandler_t signal (int signum, sighandler_t signal_handler_function_name)

to the function. Thereafter, whenever a signal of the type signal_number is raised the program is called instead of the default handler.

void signal_handler (int sig_no) {
  printf("the signal handler got a [%d]\n", sig_no);
  return;
}

signal(SIGHUP, signal_handler);
signal(SIGINT, signal_handler);

Function pointer

A function pointer is a pointer to a function that can be assigned, passed as a parameter, and called.

   <return> (*<var>)(<params>);
/* ^^^^^^^^                     the return type of the function
 *            ^^^^^             the variable names
 *                   ^^^^^^^^   comma-separated list of parameter names
 */
int my_func (int i) {
  printf("Got into function `my_func` with %d\n", i);
  return 0;
}

int main (void) {
  int (*func)(int);
  func = my_func;
  func(7);
  return 0;
}

sigaction

int sigaction (int signum, const struct sigaction *act, struct sigaction *oldact)
/*             ^^^^^^^^^^                                                         the signal to be handled
 *                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^                            a structure that contains information about the new signal handler, NULL means ignore the signal
 *                                                      ^^^^^^^^^^^^^^^^^^^^^^^^  a pointer to the previously assigned handler as it is assigned in the function call
 */

The sigaction() system call changes the action taken by a process on receipt of a specific signal.

struct sigaction new_action, old_action;
new_action.sa_handler = signal_handler;
new_action.sa_flags   = SA_NODEFER | SA_ONSTACK;
/*                      ^^^^^^^^^^               don't suspend signals while in the handler
 *                                   ^^^^^^^^^^  provide an alternate stack for the signal handler
 *                                               SA_RESETHAND - restore the signal action to the default upon entry to the signal handler
 */
sigaction(SIGINT, &new_action, &old_action);

It is argued that sigaction is better than signal.

  • signal does not block other signals from arriving while the current handler is executing whereas sigaction can block other signals until the current action returns

  • signal resets the signal action back to the default SIG_DFL for almost all signals

  • provides better tuning of signals/controls of process through flags




Cache#

https://www.baeldung.com/cs/cache-write-policy

Cache Hierarchy#

The idea of a memory hierarchy is that the faster, smaller device at level \(k\) serves as a cache for the larger, slower device at level \(k + 1\), for each \(k\). Programs tend to access the data at level \(k\) more often then they access the data at level \(k + 1\) due to locality, and so the storage at level \(k + 1\) can be slower, larger, and cheaper per bit. The memory hierarchy creates a large pool of storage that costs as much as the cheap storage near the bottom but that serves data to programs at the rate of the fast storage near the top. Instruction caches are different from data caches.

The memory hierarchy

L0 - CPU registers

  • hold the words retrieved from L1

L1 cache (SRAM)

  • holds cache lines retrieved from L2

  • very fast (usually 100x faster than RAM), small, processor-adjacent

L2 cache (SRAM)

  • holds cache lines retrieved from L3

  • a bit slower (usually 25x faster than RAM) but often much larger

L3 main memory (DRAM)

  • holds disk blocks retrieved from local disks

  • slower still, larger still, may be off-chip (may be shared amongst processors in a multi-core system)

L4 local secondary storage (local disks)

  • hold files retrieved from disks on the remote network

  • slowest, least expensive

L5 remote secondary storage (tapes, distributed file systems, web servers)

EXAMPLE - AMD’s Ryzen 5 5600X

  • 384 KB L1

  • 3 MB L2

  • 32 MB L3

EXAMPLE - Intel Core i5 3570K CPU

  • L1 data cache = 32 KB x 4 (8-way set associative, 64 B line size)

  • L1 instruction cache = 32 KB x 4 (8-way set associative, 64 B line size)

  • L2 cache = 256 KB x 4 (8-way set associative, 64 B line size)

  • L3 cache = 6 MB (12-way set associative, 64 B line size)


A cache is a smaller, faster storage device that acts as a staging area for a subset of the data in a larger, slower device.

A cache block (or cache line) is the basic unit for cache storage which contains multiple bytes or words of data.

A cache set (or cache row) is a row in the cache.

Locality

Caches exploit two types of locality to improve their performance.

Spatial locality, data that needs to be accessed tends to be close to data that has already been accessed

Temporal locality, data that has already been accessed is likely to be accessed again soon

These ideas lead to two cache design strategies:

  • spatial - cache items in blocks larger than that accessed

  • temporal - keep stuff that was used recently around longer

Cache Placement Policy#

The placement policy determines where in the cache a copy of an entry in main memory will go. If the placement policy is free to choose any entry in the cache to hold the copy then the cache is called fully associative. On the other hand, if each entry in main memory can go in just one place in the cache then the cache is direct-mapped. Choosing the right value of associativity involves a tradeoff between the efficiency of searching the cache and the hit rate.

In a cache with \(n\) cache sets the incoming memory address \(y\) is mapped to cache set \(x = y \mod n\).

Let \(\ell\) be the minimum number of bits required to uniquely represent a memory address.

If the cache block size is \(B \,\,\text{bytes}\) then the \(\log_2 B = b\) least significant bits of the memory address are the cache block offset bits.

If there are \(S\) cache sets in the cache then the \(\log_2 S = s\) next least significant bits of the memory address are the cache set index bits.

The remaining \(\ell - b - s = t\) most-significant bits of the memory address are the tag bits.

\( \ell\text{-bit memory address} = \underbrace{\boxed{\vphantom{Tg}\quad\quad\text{tag}\quad\quad}}_{t}\underbrace{\boxed{\vphantom{Tg}\quad\quad\text{cache set index}\quad\quad}}_{s}\underbrace{\boxed{\vphantom{Tg}\quad\text{offset}\quad}}_{b} \)


In a direct-mapped (or \(1\)-way set associative) cache each memory address is stored in a single cache block.

Locating data in the cache

  • use the index bits of the address of the memory block to determine the cache set

  • compare the tag bits of the address of the memory block with the tag bits of the cache block: if they match then there is a cache hit and the cache block is returned; otherwise there is a cache miss and the memory block is fetched from main memory

Loading data into the cache

  • use the index bits of the address of the memory block to determine the cache set

  • place the memory block into the cache set and store use the tag bits of the address of the memory block to store the tag in the associated tag field

  • if the cache block is occupied, replace the current memory block with the new memory block

In a \(2\)-way set associative cache each memory address can be stored in one of two cache blocks.

Locating data in the cache

Given a memory address it can be determined whether the data at that memory address is in the cache via the following procedure:

  • Use the index of the cache block to determine which cache set the address should reside in

  • For each cache block in the cache set, compare the tag associated with that cache block to the tag of the memory address. If they match, proceed to the next step. Otherwise, the data is not in the cache.

  • For the cache block where the data was found, inspect the valid bit. If it is \(1\) then the data is in the cache. Otherwise, it is not.

If the data at that memory address is in the cache then we use the block offset from that address to find the data within the cache block

Loading data into the cache

If the requested address is not found in the cache then it will be brought in from memory and placed there along with other addresses that are near it to take advantage of spatial locality. To determine which addresses will be brought in, we find the starting address and ending address of the range that will be brought in.

In a fully associative cache each memory address can be stored in any cache block.

Locating data in the cache

Loading data in the cache

  • A cache block is sought based on the value of its valid bit. If a cache block’s valid bit is \(1\) then a different cache block is sought. If a cache block’s valid bit is \(0\) then the memory block is placed into that cache block.

  • If the cache is completely full then a cache block is evicted according to the cache replacement policy and the memory block is placed into that cache block.


EXAMPLE

Consider a main memory of \(\overset{2^4}{16} \,\,\overset{2^{10}}{\text{KB}}\) which is organized into \(\overset{2^2}{4}\text{-B}\) memory blocks for a total of \(\overset{2^{12}}{4,096}\) memory blocks.

\( \begin{aligned} \frac{16 \,\,\text{KB}}{\text{memory}} &\times \frac{1 \,\,\text{block}}{4 \,\,\text{B}} &&= 4,096 \,\,\text{memory block} \\ \end {aligned} \)

And consider a cache of \(\overset{2^8}{256} \,\,\text{B}\) which is organized into \(\overset{2^2}{4}\text{-B}\) cache blocks for a total of \(\overset{2^6}{64}\) cache blocks.

\( \begin{aligned} \frac{256 \,\,\text{B}}{\text{cache}} &\times \frac{1 \,\,\text{block}}{4 \,\,\text{B}} &&= 64 \,\,\text{cache block} \\ \end {aligned} \)

A minimum of \(\ell = 14\) bits is needed to uniquely represent a memory address in a \(16\text{-KB}\) memory. Regardless of the cache placement policy, the incoming \(14\text{-bit}\) memory address will contain \(2\) offset bits because there are \(2^2 \,\,\text{B per cache block}\).

\(b = \log_2 4\)

If the cache is direct-mapped (or \(\overset{m}{1}\)-way set associative) then each of the \(64\) cache blocks is mapped to a set. There are \(\overset{n}{64}\) cache sets and the cache is represented as a \(64 \times 1\) column matrix. The incoming \(14\text{-bit}\) memory address will contain \(6\) index bits in order to uniquely represent the \(2^6\) cache sets, and \(14 - 6 - 2 = 6\) tag bits.

\(\underbrace{00 \, 0000}_{\text{tag}} \, \underbrace{0000 \, 00}_{\text{index}}\underbrace{00}_{\text{offset}}\)

In a direct-mapped cache, each memory address can only go in one cache block. This means that there is no replacement policy: if two memory addresses are mapped to the same cache block then they may continually knock each other out. Although a direct-mapped cache is simpler than an associative cache, it needs to be much larger to give comparable performance.

If the cache is \(\overset{m}{2}\)-way set associative then there are \(2\) cache blocks per cache set. There are \(\overset{n}{32}\) cache sets and the cache is represented as a \(32 \times 2\) matrix. The incoming \(14\text{-bit}\) memory address will contain \(5\) index bits in order to uniquely represent the \(2^5\) cache sets, and \(14 - 5 - 2 = 7\) tag bits.

\(\underbrace{00 \, 0000 \, 0}_{\text{tag}}\underbrace{000 \, 00}_{\text{index}}\underbrace{00}_{\text{offset}}\)

If the cache is fully associative (or \(\overset{m}{64}\)-way set associative) then there are \(64\) cache blocks per cache set. In other words, there is just \(\overset{n}{1}\) cache set and the cache is represented as a \(1 \times 64\) row matrix. The incoming \(14\text{-bit}\) memory address will contain \(0\) index bits in order to uniquely represent the \(2^0\) (i.e., single) cache set, and \(14 - 2 = 12\) tag bits.

\(\underbrace{00 \, 0000 \, 0000 \, 00}_{\text{tag}}\underbrace{00}_{\text{offset}}\)

Some memory addresses and their corresponding cache placements

\( \begin{aligned} \text{address, hex} && \text{address, bin} && \text{mem blck} && 64 \times 1 \,\,\text{tag, idx} && 32 \times 2 \,\,\text{tag, idx} && 1 \times 64 \,\,\text{tag} \\ 0x0000 && 00 \, 0000 \, 0000 \, 00\underbrace{00}_{2^0 - 1} && 0 && 0, 0 && 0, 0 && 0 \\ 0x0004 && 00 \, 0000 \, 0000 \, 01\underbrace{00}_{2^0 - 1} && 1 && 0, 1 && 0, 1 && 1 \\ 0x00\text{FF} && 00 \, 0000 \, 1111 \, 11\underbrace{11}_{2^2 - 1} && 63 && 0, 63 && 1, 31 && 63 \\ 0x0100 && 00 \, 0001 \, 0000 \, 00\underbrace{00}_{2^0 - 1} && 64 && 1, 0 && 3, 0 && 64 \\ \end {aligned} \)


Cache Eviction/Replacement#

When the cache is full a previously stored block must be evicted/replaced. The performance of the cache is determined by the cache eviction/replacement policy.

Least Recently Used (LRU)

In the LRU replacement policy the block that has been in the cache the longest without being used is evicted.

Least Frequently Used (LFU)

In the LFU replacement policy the block that has been used the least is evicted.

First In First Out (FIFO)

Queue.


Cache Performance#

A cache hit occurs when the block that is requested is found in and served from the cache.

A cache miss occurs when the block that is requested is not found in and served from the cache.

Types of cache misses

1. A cold miss (or compulsory miss) occurs because the cache is empty.

2. A capacity miss occurs when the set of active cache blocks (called the working set) is larger than the cache.

3. Most caches limit the blocks at level \(k + 1\) to a small subset–sometimes a singleton–of the block positions at level \(k\). For example, block \(i\) at level \(k + 1\) must be placed in block \(i \mod 4\) at level \(k\). A conflict miss occurs when the level \(k\) cache is large enough but multiple data objects all map to the same level \(k\) block. For example, requesting blocks \(0, 8, 0, 8, 0, 8, 0, 8\) would miss every time.

The efficiency of a cache is almost entirely determined by the hit ratio.

\(\text{hit ratio} = \frac{\text{num cache hits}}{\text{num requests}}\) where \(\text{num requests} = \text{num cache hits} + \text{num cache misses}\)

Average memory access time (or average memory latency)

\(\text{AMAT} = \text{H} + \text{MR} \times \text{AMP}\)

hit time \(\text{H}\) is the time it takes to read data from the cache

miss ratio \(\text{MR}\) is the probability that a cache access results in a miss: \(\text{miss ratio} = 1 - \text{hit ratio}\)

average miss penalty \(\text{AMP}\) is the cost to serve the block out of level \(k + 1\)

\(\text{AMAT} = \text{H} + \text{MR} \times \text{AMAT}_{k + 1}\)

EXAMPLE

\( \begin{aligned} \text{hit time} &= 25 \text{us} \\ \text{miss penalty} &= 250 \text{us} \\ \text{hit ratio} &= 0.8 \\ \text{miss ratio} &= 1 - 0.8 = 0.2 \\ \text{average} &= 25 \text{us} + 0.2 \times 250 \text{us} = 75 \text{us} \\ \end {aligned} \)

EXAMPLE

\(4\)-block LRU cache


https://www.makeuseof.com/tag/what-is-cpu-cache/

https://cseweb.ucsd.edu/classes/su07/cse141/cache-handout.pdf


  • [ w ] Average Memory Access Time (AMAT)

  • [ w ] Cache

  • [ w ] Cache Coherence

  • [ w ] Cache Control Instruction

  • [ w ] Cache Hierarchy

  • [ w ] Cache Hit

  • [ w ] Cache Miss

  • [ w ] Cache Performance

  • [ w ] Cache Placement Policy

  • [ w ] Cache Pollution

  • [ w ] Cache Replacement Policy

  • [ w ] Content-Addressable Memory (CAM)

  • [ w ] Data Remanence

  • [ w ] Dirty Bit

  • [ w ] Disk

  • [ w ] Dynamic RAM (DRAM)

  • [ w ] Gather/Scatter

  • [ w ] Locality of Reference

  • [ w ] Magnetic-Core Memory

  • [ w ] Memory

  • [ w ] Memory Access Pattern

  • [ w ] Memory Address

  • [ w ] Memory Management Unit (MMU)

  • [ w ] Non Volatile RAM (NVRAM)

  • [ w ] Non Volatile SRAM (nvSRAM)

  • [ w ] Page Replacement Algorithm

  • [ w ] Pseudo LRU (PLRU)

  • [ w ] Random Access

  • [ w ] Random Access Memory (RAM)

  • [ w ] Read-Only Memory (ROM)

  • [ w ] Scratchpad Memory

  • [ w ] Sequential Access

  • [ w ] Sequential Access Memory (SAM)

  • [ w ] Shared Memory

  • [ w ] Static RAM (SRAM)

  • [ w ] Translation Lookaside Buffer (TLB)

  • [ w ] Volatile Memory

  • [ w ] Application-Specific Integrated Circuit (ASIC)

  • [ w ] Complementary Metal-Oxide Semiconductor (CMOS)

  • [ w ] Complex Programmable Logic Device (CPLD)

  • [ w ] Configurable Logic Block (CLB)

  • [ w ] Field-Programmable Gate Array (FPGA)

  • [ w ] Programmable Logic Device (PLD)

  • [ w ] Reconfigurable Computing

  • [ w ] Load-Store Architecture

  • [ w ] Non Uniform Memory Access (NUMA)

  • [ w ] Processor Register

  • [ w ] Register File




Memory Ops#

Pointers are typed, and pointer arithemtic obeys types: when you add 1 to a pointer you add sizeof(type).

Buffer

A buffer is just a memory region that has some use which is typically referenced/maintained by a pointer. A buffer is a temporary holding place.

char *buffer;

void *buffer;

A void buffer is a general-purpose pointer, a pointer to a raw address.

Memory Management#

C has no garbage collector.

Any memory that is allocated must subsequently be un-allocated (freed). MEMORY LEAKS

local variables are allocated off the stack

freed when you return from the function

global and static variables are allocated in a data segment

are freed when the program exits

Memory can be allocated in the heap segment using malloc(). This memory must be freed via free().

Failing to free allocated memory is a memory leak. Double-freeing allocated memory is an error (hopefully, a crash).


memcpy copies one memory region to another, or from a src buffer to a dst buffer. The size must be explicit because there is no terminator.

char buf1[]  = {0, 1, 2, 3};
char buf2[4] = {0, 0, 0, 0};

printf("Before\n");
for (int i = 0; i < 4; i++) {
  printf("buf1[i] = %ld, buf2[i] = %ld\n", (int)buf1[i], (int)buf2[i]);
}

memcpy(buf2, buf1, 4);

printf("After\n");
for (int i = 0; i < 4; i++) {
  printf("buf1[i] = %ld, buf2[i] = %ld\n", (int)buf1[i], (int)buf2[i]);
}

memset fills memory with a given constant byte

void *memset (void *buf, int c, size_t n)

char a[4] = {0, 1, 2, 3};
memset(a, 0, 4);

int b[4] = {0, 1, 2, 3};
memset(b, 0, 4);

memcmp(buf1, buf2, n) compares the first n bytes of the buffers

returns

  • neg int if buf1 < buf2

  • 0 if buf1 = buf2

  • pos int if buf1 > buf2

int x;

char cmps[4][2] = {
  {0x0, 0x1}, {0x1, 0x0}, {0x0, 0x1}, {0x9, 0x0}
};

for (int i = 0; i < 4; i++) {
  for (int j = 0; j < 4; j++) {
    x = memcmp(&cmps[i][0], &cmps[j][0], 2);
    printf("compare %ld%ld with %ld%ld = %d\n", cmps[i][0], cmps[i][1], cmps[j][0], cmps[j][1], x);
  }
}

malloc allocates a block of memory of the given size and returns a void pointer to the first byte of that memory, which does not need to be case because it is automatically promoted, or NULL if the memory could not be allocated. Initialized with garbage values.

void *malloc (size_t size)

#include <stdlib.h>

// allocate a 10-float array
float *arr = malloc(10 * sizeof(float));
if (arr = NULL)
  return errcode;
arr[0] = 5.1;

calloc returns an array of nmemb members, each of size bytes, and zeroes out allocated memory: all bytes have the value 0x0. calloc is slightly slower than malloc and is preferred for non performance-critical code.

void *calloc (size_t nmemb, size_t bytes);
#include <stdlib.h>

// allocate a 10-long int array
long *arr = calloc(10, sizeof(long));
if (arr = NULL)
  return errcode;
arr[0] = 5L;

void free (void *ptr) releases the memory pointed to by the pointer

  • the ptr must point to the first byte of the heap-allocated memory (i.e., something returned by either malloc or calloc)

  • set ptr to NULL after freeing a block of memory to avoid a dangling pointer

#include <stdlib.h>

long *arr = calloc(10 * sizeof(long));
if (arr = NULL)
  return errcode;

free(arr);
arr = NULL;

typedef struct {
  double real;
  double imag;
} Complex, *ComplexPtr;

ComplexPtr AllocComplex (double real, double imag) {
  Complex *retval = malloc(sizeof(Complex));
  if (retval != NULL) {
    retval->real = real;
    retval->imag = imag;
  }
  return retval;
}

void *realloc (void *ptr, size_t size) modifies a previous allocation by resizing it in-place if possible; if not, it creates a new allocation and copies as much data as it can. It returns NULL if the memory could not be allocated.

char *buf, *rbuf;

buf = malloc(2); // allocate a 2-byte array
memset(buf, 0xa, 2);
printf("buf = %x %x\n", buf[0], buf[1]);
rbuf = realloc(buf, 4); // resize to 4 bytes
printf("rbuf = %x %x %x %x\n", rbuf[0], rbuf[1], rbuf[2], rbuf[3]);

Heap#

The heap is a pool of unused memory that is used for dynamically-allocated data: malloc allocates chunks of heap and free deallocates them, and malloc maintains bookkeeping data in the heap to keep track of allocated chunks.




Memory#

Units

\( \begin{aligned} \text{bit b} && \\ \text{byte B} && 1 \text{B} &= 8 \text{b} \\ \text{halfword (short) hw} && 1 \text{hw} &= 2 \text{B} \\ \text{word w} && 1 \text{w} &= 4 \text{B (sometimes 2B, then double word dw = 4B)} \\ \text{giant g} && 1 \text{g} &= 2 \text{w} = 8 \text{B} \\ \end {aligned} \)

The number of possible memory addresses on 32-bit architecture and 64-bit architecture respectively:

\( \begin{aligned} 2^{32} &= 4.29496730 \times 10^9 &&= 4,294,967,296 \\ 2^{64} &= 1.84467441 \times 10^{19} &&= 18,446,744,073,709,551,616 \\ \end {aligned} \)

64-bit processors can run in 32-bit compatibility mode which allows them to run 32-bit code quickly.

print('Possible memory addresses (theoretical limit)')
print(f'2^  8 = {2**  8:>55,.0f} B')
print(f'2^ 16 = {2** 16:>55,.0f} B')
print(f'2^ 32 = {2** 32:>55,.0f} B (about   4.3 million)')
print(f'2^ 64 = {2** 64:>55,.0f} B (about  18.4 quintillion, or  16 EiB)')
print(f'2^128 = {2**128:>55,.0f} B (about 340   undecillion)')
print()
print('Some common physical memory limits')
print(f'2^ 48 = {2** 48:>55,.0f} B (about   0.3 quadrillion, or 256 TiB)')
print(f'2^ 52 = {2** 52:>55,.0f} B (about   4.5 quadrillion, or   4 PiB)')
print()
print('Some common user space and kernel space limits')
print(f'2^ 47 = {2** 47:>55,.0f} B (Linux user   space)')
print(f'2^ 17 = {2** 17:>55,.0f} B (Linux kernel space)')
Possible memory addresses (theoretical limit)
2^  8 =                                                     256 B
2^ 16 =                                                  65,536 B
2^ 32 =                                           4,294,967,296 B (about   4.3 million)
2^ 64 =                              18,446,744,073,709,551,616 B (about  18.4 quintillion, or  16 EiB)
2^128 =     340,282,366,920,938,463,463,374,607,431,768,211,456 B (about 340   undecillion)

Some common physical memory limits
2^ 48 =                                     281,474,976,710,656 B (about   0.3 quadrillion, or 256 TiB)
2^ 52 =                                   4,503,599,627,370,496 B (about   4.5 quadrillion, or   4 PiB)

Some common user space and kernel space limits
2^ 47 =                                     140,737,488,355,328 B (Linux user   space)
2^ 17 =                                                 131,072 B (Linux kernel space)
MEMORY

0xFFFFFFFF +--------------------------------------------------+
           | Kernel
           +--------------------------------------------------+
           | STACK (8 MB limit)
           |
    bp --> | stack frame
           |     function params/args
           |     automatic (local-to-function) vars
           |     saved frame pointer (SFP) - used to restore the frame pointer (bp) to its previous value
           |     return address pointer    - used to restore the instruction pointer (ip) to the next instruction after the function call
    sp --> +--------------------------------------------------+
           |                         |
           |                         v
           +--------------------------------------------------+
           | Shared Libraries (here, or above the stack???)
           +--------------------------------------------------+
           |                         ^
           |                         |
           +--------------------------------------------------+ <-- program break, moves up and down as memory is allocated to, deallocated from the heap
           | HEAP
           |     dynamic memory (de)allocation - C malloc/free, C++ new/delete
           +--------------------------------------------------+
           | BSS  (.bss)
           |     uninitialized static data, read from the binary executable and loaded into memory at this address
           |     writable, but has a fixed size
           +--------------------------------------------------+
           | DATA (.data)
           |     initialized static data such as globals, static locals, and strings, read from the binary executable and loaded into memory at this address
           |     writable, but has a fixed size
           +--------------------------------------------------+
           | CODE/TEXT (.rodata)
           |     static const
           +--------------------------------------------------+
           | CODE/TEXT (.text)
           |     assembled machine instructions, read from the binary executable and load it into memory at this address
           |     read-only, and has a fixed size
0x00000000 +--------------------------------------------------+

PROGRAM EXECUTION

The instruction pointer is set to point to the first instruction of the text segement (function `main`), and the processor proceeds as follows:
1. read the instruction that the instruction pointer is pointing to
2. add the byte length of the instruction to the instruction pointer
3. execute the instruction that was read in (1)
4. go back to (1)

Signed representation#

// in sign-and-magnitude, -1 is encoded as 100...001
if      (-1 & 3) == 1
  printf("The encoding is sign-and-magnitude");
// in one's complement,   -1 is encoded as 111...110
else if (-1 & 3) == 2
  printf("The encoding is one's complement");
// in two's complement,   -1 is encoded as 111...111
else if (-1 & 3) == 3
  printf("The encoding is two's complement");
else
  printf("Not possible in the C standard");

From lower memory addresses to higher addresses:

1. Text segment

  • stores the executable code of the program

  • usually read-only

2. Data Segment .data

  • initialized static/global variables static int a = 3;

  • read/write

  • the size of the data segment is determined by the size of the values in the program’s source code and does not change at runtime

Read-Only Data Segment .rodata

  • initialized static/global constants

  • read-only

3. BSS segment

  • uninitialized static/global constants/variables static int b;

  • this segment is filled with zeros by the operating system and so uninitialized variables are initialized with zeros

4. Heap

  • grows from lower addresses to higher addresses

  • used to provide space for dynamic memory allocation (malloc, calloc, realloc, free, etc.)

5. Stack

  • grows from higher addresses to lower addresses

  • used to store local variables defined inside functions as well as data related to function calls such as return addresses, arguments, etc.

C’s memory layout from higher addresses to lower addresses

  • command line arguments

  • dynamic memory

    • stack - a new stack frame for each function call, not for each function

    • heap

  • static memory

    • uninitialized data segment

    • initialized data segment

    • text/code segment - read-only permission prevents accidental modification; shareable, so that only a single copy is required amng multiple programs like text editors, shells, etc.

Post compilation, the instructions in the binary executable are loaded into the text segment.

print('32 b arch can address 2^32 B of RAM')
print(f'{(2**32)  :>15,.0f} B')
print(f'{(2**32)*8:>15,.0f} b')
32 b arch can address 2^32 B of RAM
  4,294,967,296 B
 34,359,738,368 b

EXAMPLE

int x = 100;                                 // data segment

int main () {
  int   a = 2;                               // stack
  float b = 2.5;                             // stack
  static int y;                              // bss segment

  int *ptr = (int *)malloc(2 * sizeof(int)); // local variable `ptr` is stored in the stack, points to a block of memory in the heap
  ptr[0] = 5;                                // heap
  ptr[1] = 6;                                // heap

  free(ptr);
  return 1;
}



x86 Assembly#


https://www.quora.com/What-hardware-do-you-need-to-start-learning-x86-assembly


Syntax Flavor

  • AT&T

    • puts destination registers after source registers

  • Intel

    • puts destination registers before source registers

Registers

general-purpose - act as temp variables for the CPU when it's executing machine instructions
===============
ax    accumulator
bx    base
cx    counter
dx    data

sp    stack pointer - stores an address
bp    base  pointer - stores an address - references local function variables in the current stack frame
                    - frame pointer (FP) or local base (LB) pointer
si    src   idx     - stores an address, when data needs to be read
di    dst   idx     - stores an address, when data needs to be written

ip    instruction pointer - always holds the memory address of the next instruction to execute in the program's code segment
                            the processor increments %rip automatically after each instruction
                            control flow instructions, like branches, set the value of %rip to change the next instruction

flags - consists of several bit flags that are used for comparisons and memory segmentations

Registers

eax       accumulator   (general-purpose register)
ebx       base          (general-purpose register)
ecx       counter       (general-purpose register)
edx       data          (general-purpose register)
esp       stack pointer (general-purpose register), pointer that stores a memory address
ebp       base  pointer (general-purpose register), pointer that stores a memory address
esi       src index     (general-purpose register), pointer used by load (read data) and store (write data) instructions
edi       dst index     (general-purpose register), pointer used by load (read data) and store (write data) instructions
eip       instruction pointer, points to the instruction that the processor is reading
eflags    several bit flags that are used for comparisons and memory segmentations
cs
ds
es
fs
gs
ss

Register Formats

 63                              31              15      7       0
 +-------------------------------+-------------------------------+
 |                               |               |       |       |
 +---------------------------------------------------------------+

 |---------------------%rax (64 bits/8 bytes)--------------------|
                                 |-----%eax (32 bits/4 bytes)----|
                                                 |-%ax (16b/2B)--|
                                                 |--%ah--|--%al--| <-- 8 bits/1 byte each

Instructions

/**************************************************
 *
 * computation
 *
 **************************************************/
     src  src/dst
add %rax,%rbx    %rbx := %rbx + %rax
sub %rax,%rbx    %rbx := %rbx - %rax
inc
and
 or?


/**************************************************
 *
 * data movement
 *
 **************************************************/
     src  dst
mov %rax,%rbx    %rbx := %rax


/**************************************************
 * control flow transfers - deviations from sequential instruction execution (function calls loops, conditionals)
 *
 * unconditional branch - the instruction pointer is set to a new value
 * conditional branch - the instruction pointer is set to a new value if a condition is true
 * function call
 * function return
 **************************************************/
cmp "compare"
j   "jump"


lea "load effective address"
INTEL    AT&T

and
call
cmp      cmpl    compare
inc      incl    increment
jle              jump if less than or equal to
jmp              jump
lea
leave
mov
         movl
nop
push
ret
sub



GDB#

~/.gdbinit

set dis intel
set disassemble-next-line on
disass (disassemble)
~/.gdbinit

b   main             break main
dis main             disassemble main
i r eip              info register eip
p                    print
q                    quit
list
nexti                next instruction
run
set dis intel        set disassembly intel

x/<f> <addr>      examine memory address `addr` with format `f`
x/i   $eip        instruction
x/o   $eip        octal
x/t   $eip        binary
x/u   $eip        decimal
x/x   $eip        hexadecimal
x/2x  $eip        2 words
x/12x $eip        12 words

x/2xb $eip        2 B
x/2xh $eip        2 halfword ( 4 B = 2 halfword x 2 B/halfword)
x/2xw $eip        2 word     ( 8 B = 2     word x 4 B/    word)
x/2xg $eip        2 giant    (16 B = 4     word x 4 B/    word)
x/xdw             double word
gcc -S        # compile C to 32-bit assembly on a 32-bit system
gcc -S -m32   # compile C to 32-bit assembly on a 64-bit system

gcc    -Wall -o prog prog.c                # compile C to binary executable
gcc -g -Wall -o prog prog.c && gdb -q prog
#   ^^                                       provide debugging information to gdb
#                                  ^^        quiet mode

objdump -M intel -D prog | grep -A20 main.:



objdump#

https://en.wikipedia.org/wiki/Objdump

readelf#

https://en.wikipedia.org/wiki/Readelf

file a.out # check whether it is an ELF file

readelf -h a.out # view the ELF header
readelf -S a.out # view sections in the ELF file
readelf -l a.out # view program headers (describes the segments used at runtime)
readelf -s a.out # view the symbol table
readelf -r a.out # view the relocation sections (shows how the binary modifies itself at runtime)
readelf -d a.out # view the dynamic section (contains information on dynamic linking)



Building#

Build

  • edit -> foo.h, foo.c (source file)

  • compile -> foo.o (object file)

  • link -> foo + libZ.a statically-linked library (executable)

  • load -> bar + libc.so shared library (process)

  • execute, debug, profile

# gcc is used to build (compile + link) a program
#  ld can be used to link the program

# compilation
# gcc <src files> [options]
#     -c           generate object file
#     -g           generate debug information
#     -o output    specify the output file
#     -Wall        show all warnings


gcc -Wall -g -c -o hello.o hello.c
#   ^^^^^^^^                       generate debug information and display errors
#            ^^^^^^^^^^^^^         compile, but do not link

# linking
# gcc <obj files> [options]
#     -g           generate debug information
#     -l<lib>      link with a library
#     -o output    specify the output file


 gcc -g -lmyexample -o hello hello.o goodbye.o
 #      ^^^^^^^^^^^                            link with a library
 #                  ^^^^^^^^                   output file

Static Library

Static linking is done at link time. A library is a collection of related code and functions that are linked against a C program. The library exports symbols. Program object code has unresolved symbols. The linker pulls chunks of the library containing those symbols and places them into the program. The program is done when all the pieces are resolved.

A statically-linked library produces object code that is inserted into the program at link time.

Build an archive of the library which the linker uses to search for and transfer code into the program.

The name of the library is linked against, not the name of the file in which the library exists

# ar rcs lib<libname>.a <object files>
#    ^      replace existing code with the objects passed in
#     ^     create the library, if needed
#      ^    create an index for relocatable code
# 

ar rcs libmyexample.a a.o b.o c.o d.o

Dynamic Library

A dynamic library is a collection of related code and functions that are resolved at run time. The program’s object code contains unresolved symbols, and the library exports symbols. The loader pulls chunks of the library containing those symbols and places them into the process. The symbols are resolved when the process is started or later during execution. The library is called dynamic because this process can be done at any time.

A dynamically-linked library produces object code that is inserted into the program at execution time.

A loadable version of the library is being built, which the loader uses to launch the application. All object files to be placed in the library must have been compiled to position-independent code (PIC). PIC is not dependent on being located at any predefined location in memory; it uses relative jumps, so it does not matter where it is loaded at exection time.

gcc -Wall -g -c -fpic -o a.o a.c
//           ^^^^^^^^            compile to PIC

gcc -shared -o libmyexample.so a.o b.o c.o d.o

Pros - static linking

  • resolve all implementation issues at link time

  • avoids problems of API versioning

  • linker can optimize library code at link time

Pros - dynamic linking

  • reduced executable

  • system updates

Preprocessor

The preprocessor processes input source code files for commands that setup the compile environment specific to that execution. # directives are used to indicate what the programmer wants done.

#include            // include data from some other data file
#include "foo.h"    // look local, used in application programming
#include <foo.h>    // look at the default directories and any directories provided in the command line

#define             // create a definition symbol that gets searched-replaced throughout
                    // commonly used to define a constant that might be changed in the future (e.g., the size of arrays)
                    // can also be used to create functions called macros; macros are not called, but are replaced during preprocessing (and so there are no function call overheads, stack operations)
#undef              // undoes the binding done by #define

// conditionally compile parts of a program
#if
#ifdef
#ifndef
gcc -I<path> # look in a specific directory for include files
#define NUM_ENTRIES 15
#define DEFINED

#if 0
/* not compiled */
#else
/*     compiled */
#endif

#ifdef UNKNOWNVALUE
/* not compiled */
#else
/*     compiled */
#endif

#ifndef DEFINED
/* not compiled */
#else
/*     compiled */
#endif

int main (void) {
  float myFloats[NUM_ENTRIES];
#if 0
  for (int i = 0; i < NUM_ENTRIES; i++)
    scanf("%f", &myFloats[i]);

  printCharline('*', 69);
  printf("Received and computed\n");
#endif
}

Make

The make utility is a utility for building complex programs/systems.

  • determine which parts of a program/system are out of date

  • determine the dependencies between objects

  • issue commands to create intermediate and final project files

Each project/system has one or more files defining the build called the makefile: what to build, how to build them, how they are related. The target is the thing to build. The prerequisites are the things that the target depends on. a.o depends on a.c. Variables are data that defines elements of interest to the build. Rules are statements of targets, prerequisites, and commands.

Productions or rules are statements consisting of targets, prerequisites, and commands which define how a particular item is built.

builtin variables

  • $@ the current rule target

  • $^ the prerequisite list

  • $< the first prerequisite

Pattern rule

  • % matches any nonempty string

Run the commands to build the target when any of the prerequisites are out of date:

target: prereq1, prereq2, prereq3
    command1
    command2
CC=gcc
LINK=gcc
CFLAGS=-c -Wall -I.
OBJ_FILES= sample.o support.o

sample    : $(OBJ_FILES)
    $(CC) $^ -o $@
sample.o  : sample.c support.h
    $(CC) $(CFLAGS) sample.c -o $@
support.o : support.c support.h
    $(CC) $(CFLAGS) support.c -o $@
make
CC=gcc
CFLAGS=-c -Wall -I. -fpic -g -fbounds-check
LIBS=-lcrypto

OBJS=tester.o util.o mdadm.o

%.o    : %.c %.h
         $(CC) $(CFLAGS) $< -o $@
tester : $(OBJS) jbod.o
         $(CC) -o $@ $^ $(LIBS)
clean  :
         rm -f $(OBJS) tester

# source code
vim foo.c foo.h bar.c

# assembling
gcc -a foo.c -o foo.s
gcc -a bar.c -o bar.s
as

# compilation
#
#     generates object files
gcc -c foo.c -o foo.o
gcc -c bar.c -o bar.o

# linking
#
#     produces executable `bar`
gcc foo.o bar.o -llibZ.a -o foo
ld

# loading
#
#     produces process `bar`
libc.so # shared library, linked with process `bar`

# execute, debug, profile, etc.


####################
#
#     Example of assemble + compile + link
#
####################
gcc hello.c -o hello -g -Wall -O0 -fsanitize=address -I.
#                    ^^                                  the compiler generates debug information
#                       ^^^^^                            checks
#                             ^^^                        disable the compiler optimizations, for debugging
#                                 ^^^^^^^^^^^^^^^^^^     AddressSanitizer memory debugger
#                                                    ^^^ include data from data files in the working directory

Static linking is the link-time process in which the linker places chunks of static libraries into the source code in order to resolve unresolved symbols. A statically-linked library produces object code which is inserted into the program at link-time.

# build an archive of the library, which the linker uses to search for and transfer code into the program
ar rcs libMyStaticLibrary.a a.o b.o c.o d.o
#  ^                                        replace existing code with the objects
#   ^                                       create the library if it doesn't exist
#    ^                                      create an index for relocatable code
#      ^^^^^^^^^^^^^^^^^^^^                 the name of the static library
#                           ^^^^^^^^^^^^^^^ object files

Dynamic linking is the run-time process in which the loader places chunks of dynamically-linked libraries into the process in order to resolve unresolved symbols. A dynamically-linked library produces object code which is inserted into the program at run-time.

# all object files to be placed in the dynamically-linked library must be compiled to position-independent code
gcc -c a.c -o a.o -fpic -g -Wall
gcc -c b.c -o b.o -fpic -g -Wall
gcc -c c.c -o c.o -fpic -g -Wall
gcc -c d.c -o d.o -fpic -g -Wall

# build a loadable version of the library, which the loader uses to launch the program
gcc -shared -o libMyDynamicLibrary.so a.o b.o c.o d.o

C Preprocessor#

The C preprocessor processes source code files for # directives that setup the compile environment.

#include <stdio.h> /* include data from data file `stdio.h` either in the default directories or on the command line */
#include "foo.h"   /* include data from data file `foo.h`   in the local directory                                   */

#define MY_CONSTANT 10 /* define a symbol to be replaced throughout the program, usually for a constant that might be changed */
#undef                 /* */

/* macros
 *
 *     the `#define` directive can also be used to create simple functions called macros
 *     macros are not called like functions are, but rather substituted during the preprocessing phase of compilation
 *     thus, there is no function call overhead on the stack (is this true???)
 */
#define swap (x, y) { int temp = x; x = y; y = temp; }

#if
#else
#endif
#ifdef
#ifndef

Makefile#

  • the target is the thing to build

  • the prerequisites are the things that the target depends on

  • the dependencies are relationships between files like a.c and a.o

  • productions/rules are statements that include targets, prerequisites, and commands

// run the commands to build the target when any of the prerequisites are out-of-date

target : prereq1, prereq2, prereq3, ...
<TAB>command1
<TAB>command2

builtins

  • $@ the current rule target

  • $^ the prerequisites

  • $< the first prerequisite

1

sample : sample.o support.o
    gcc sample.o support.o -o sample

sample.o : sample.c support.h
    gcc -c sample.c -o sample.o -Wall -I.

support.o : support.c support.h
    gcc -c support.c -o support.o -Wall -I.

2

OBJECT_FILES = sample.o support.o

sample : $(OBJECT_FILES)
    gcc $(OBJECT_FILES) -o sample

sample.o : sample.c support.h
    gcc -c sample.c -o sample.o -Wall -I.

support.o : support.c support.h
    gcc -c support.c -o support.o -Wall -I.

3

CC = gcc
LINK = gcc
CFLAGS = -c -Wall -I.
OBJECT_FILES = sample.o support.o

sample : $(OBJECT_FILES)
    $(CC) $^ -o $@

sample.o : sample.c support.h
    $(CC) $(CFLAGS) sample.c -o $@

support.o : support.c support.h
    $(CC) $(CFLAGS) support.c -o $@
# assemble + compile
gcc -S source.c   # produces `source.s`
as     source.s   # produces `source.o`

# compile
gcc -c source.c   # produces `source.o`

# link
gcc          source.c -o source   # produces executable file
gcc -g -Wall source.c -o source   # view warnings

Source File .c

Header File .h

Assembly File .s

Object File .o

  • contains re-locatable machine code produced by the compiler

  • generally, cannot be executed without some manipulation (e.g., via a linker)

  • may contain references to external symbols in other object files: linking will resolve external references

Library

  • a collection of object files

Object files and libraries are linked to produce an executable file.




Profiling#


gprof#

gprof measures a program’s performance and behavior

gcc -pg -o profiling profiling.c    # 1. compile with flag `-pg`
./profiling                         # 2. execute (generates file `gmon.out`)
gprof profiling                     # 3. run `gprof`
                                    # 4. review, optimize, re-profile

Amdahl’s Law

\(k\) is the percentage of total execution time spent in the optimized module(s)

\(s = \frac{\text{original execution time}}{\text{improved execution time}}\) is the execution time expressed in terms of n-factor speedup (2x, 3x)

The overall speedup of the program is expressed as

\( \begin{aligned} T = \frac{1}{(1 - k) + \frac{k}{s}} \end {aligned} \)

where \(1 - k\) is the part of the program that is unchanged and \(k/s\) is the ratio of altered program size to speedup

EXAMPLE

Consider an optimized program module \(A\) which represents \(45\%\) of the program run time. The optimization reduces the runtime of the module from \(750\text{ ms}\) to \(50\text{ ms}\). What is the speedup?

\( \begin{aligned} k &= 0.45 \\ s &= \frac{750}{50} = 15 \\ T &= \frac{1}{(1 - 0.45) + \frac{0.45}{15}} = \frac{1}{0.55 + 0.03} \approx 1.72 \end {aligned} \)

The program speedup is \(1.72\) times.

EXAMPLE

Memory operations take \(30\%\) of the total execution time. A new widget called a cache speeds up \(80\%\) of memory operations by a factor of \(4\). A second new widget called an L2 cache speeds up \(1/2\) the remaining \(20\%\) by a factor of \(2\). What is the total speedup?

\( \begin{aligned} k &= 0.3 \\ k_1 &= 0.3 \times 0.8 = 0.24 \\ s_1 &= 4 \\ k_2 &= 0.3 \times \frac{(1 - 0.8)}{2} = 0.03 \\ s_2 &= 2 \\ T &= \frac{1}{(1 - k_1 - k_2) + \frac{k_1}{s_1} + \frac{k_2}{s_2}} \\ &= \frac{1}{(1 - 0.24 - 0.03) + \frac{0.24}{4} + \frac{0.03}{2}} \\ &= \frac{1}{0.73 + 0.06 + 0.015} \\ &\approx 1.24 \\ \end {aligned} \)

EXAMPLE

Consider four modules that are measured prior to optimization and after optimization and suppose that the run time of the original execution is 2000 usec. What is the speedup?

module

before (usec)

after (usec)

A

200

60

B

450

11

C

1000

600

D

125

1


\( \begin{aligned} k_A &= \frac{200}{2000} = 0.1 \\ s_A &= \frac{200}{60} \approx 3.33 \\ T_A &= \frac{1}{(1 - k_A) + \frac{k_A}{s_A}} \\ &= \frac{1}{(1 - 0.1) + \frac{0.1}{3.33}} \\ &= \frac{1}{0.9 + 0.03} \\ &\approx 1.075 \\ \end {aligned} \)


\( \begin{aligned} k_B &= \frac{450}{2000} = 0.225 \\ s_B &= \frac{450}{11} \approx 40.91 \\ T_B &= \frac{1}{(1 - k_B) + \frac{k_B}{s_B}} \\ &= \frac{1}{(1 - 0.225) + \frac{0.225}{40.91}} \\ &= \frac{1}{0.775 + 0.0055} \\ &\approx 1.28 \\ \end {aligned} \)


\( \begin{aligned} k_C &= \frac{1000}{2000} = 0.5 \\ s_C &= \frac{1000}{600} \approx 1.67 \\ T_C &= \frac{1}{(1 - k_C) + \frac{k_C}{s_C}} \\ &= \frac{1}{(1 - 0.5) + \frac{0.5}{1.67}} \\ &= \frac{1}{0.5 + 0.299} \\ &\approx 1.25 \\ \end {aligned} \)


\( \begin{aligned} k_D &= \frac{125}{2000} = 0.0625 \\ s_D &= \frac{125}{1} = 125 \\ T_D &= \frac{1}{(1 - k_D) + \frac{k_D}{s_D}} \\ &= \frac{1}{(1 - 0.0625) + \frac{0.0625}{125}} \\ &= \frac{1}{0.9375 + 0.0005} \\ &\approx 1.066 \\ \end {aligned} \)


\(1.075 \times 1.28 \times 1.25 \times 1.066 \approx 1.8335\)

\( \begin{aligned} k_A &= \frac{ 200}{2000} = 0.1 \\ k_B &= \frac{ 450}{2000} = 0.225 \\ k_C &= \frac{1000}{2000} = 0.5 \\ k_D &= \frac{ 125}{2000} = 0.0625 \\ s_A &= \frac{ 200}{ 60} \approx 3.33 \\ s_B &= \frac{ 450}{ 11} \approx 40.91 \\ s_C &= \frac{1000}{ 600} \approx 1.67 \\ s_D &= \frac{ 125}{ 1} = 125 \\ T_D &= \frac{1}{(1 - k_A - k_B - k_C - k_D) + \frac{k_A}{s_A} + \frac{k_B}{s_B} + \frac{k_C}{s_C} + \frac{k_D}{s_D}} \\ &= \frac{1}{(1 - 0.1 - 0.225 - 0.5 - 0.0625) + \frac{0.1}{3.33} + \frac{0.225}{40.91} + \frac{0.5}{1.67} + \frac{0.0625}{125}} \\ &= \frac{1}{0.1125 + 0.03 + 0.0055 + 0.299 + 0.0005} \\ &= \frac{1}{0.4475} \\ &\approx 2.235 \\ \end {aligned} \)


Concurrency#

Sequential processing: processing a network connection as it arrives and fulfilling the exchange completely; connections are processed in sequence of their arrival.

listen_fd = Listen(port);

while (1) {
  client_fd = accept(listen_fd);
  buf = read(client_fd);
  write(client_fd, buf);
  close(client_fd);
}

the benefits of sequential processing

  • it is simple to implement

  • there is very little persistent state to maintain

  • there are few complex error conditions

the disadvantes of sequential processing

  • sometimes there is poor performance: one slow client causes all others to block; this is poor utilization of the network and the CPU

Concurrent processing: process multiple requests concurrently

Approaches to concurrent server design

  • asynchronous servers select()

  • multiprocess servers fork()

  • multithreaded servers pthreads

Concurrency with processes

The server process blocks on accept() and waits for a new client to connect. When a new connection arrives the parent calls fork() to create another process. The child process handles the new connection and exit()s when the connection terminates. Children become zombies after death: use wait() to reap the children.

fork creates a new process by duplicating the calling process: the new child process is an exact duplicate of the calling parent process except that it has its own PID and pending signal queue. fork returns 0 for the child process or the child’s PID in the parent code.

pid_t fork (void);
/* creates a new child by duplicating the calling parent process
 *       the new child process is an exact duplicate, except that it has its own PID and pending signal queue
 *
 * returns
 *     0 for the child process
 *     child's PID in the parent code
 */

void exit (int status);
/* causes normal process termination with a return value
 * status is sent to the calling parent process
 * `return` is a keyword which returns (a value) from a function
 * `exit` is a function which eventually calls the system call `_exit`: it terminates the process immediately and returns the status to the parent
 */

pid_t wait (int *status);
/*          ^^^^^^^^^^^ the return value that is set by the child process
 *
 * used to wait for state changes in a child of the calling process
 *
 * returns
 *     child PID
 */

Concurrency with threads

A single process handles all the connections, but a parent thread forks (dispatches) a new thread to handle each connection. The child thread handles the new connection and exits when the connection terminates. As many threads can be created as one wants, up to the limits of the system.

A thread is an independent stream of instructions that can be scheduled to run as such by the OS. To the software developer, a thread is a procedure that runs independently of its main program. Imagine a main program that contains a number of procedures, and imagine that all the procedures can be scheduled to run simultaneously with (or independently of) the OS. This is multi-threaded

  • accept

  • pthread_create

The indpendent flow of control is accomplished because a thread maintains its own

  • stack pointer

  • registers

  • scheduling properties (such as policy or priority)

  • set of pending and blocked signals

  • thread specific data

A thread exists within a process and uses the process’s resources. It has its own independent flow of control as long as its parent process exists and the OS supports it. A thread duplicates only the essential resources it needs to be independently schedulable. It may share process resources with other threads that act equally as independently. A thread dies if the parent process dies. A thread is lightweight because most of the overhead has already been accomplished through the creation of its process.

Since threads within the same process share resources changes made by one thread (such as closing a file) will be seen by all other threads; two pointers having the same value point to the same data; reading and writing to the same memory locations is possible and so requires explicit synchronization by the programmer. Shared data between threads can cause conflicts, deadlocks, etc.

Thread control

  • main

    • pthread_create() create a thread

    • pthread_join() wait for the thread to finish

  • thread

    • begins as a function pointer

    • runs until return or pthread_exit

  • library support

    • #include <pthread.h>

    • compile with option -lpthread to link with the pthread library

pthread_t pthread_self (void);
/* returns the ID of the calling thread
 */

int pthread_create (pthread_t *thread, const pthread_attr_t *attr, void *(*start_routine) (void *), void *arg);
/*                  ^^^^^^^^^^^^^^^^^                                                                           a pthread library structure which contains thread information
 *                                     ^^^^^^^^^^^^^^^^^^^^^^^^^^                                               a set of attributes to apply to the thread
 *                                                                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^              the thread function pointer
 *                                                                                                  ^^^^^^^^^   an opaque data pointer to pass to the thread
 *
 * starts a new thread in the calling process
 * the thread becomes alive, it may even run before pthread_create returns
 *
 * returns
 *     0                   on success
 *     non-zero error code on failure
 */

int pthread_join (pthread_t thread, void **retval);
/*                ^^^^^^^^^^^^^^^^                  a pthread library structure which contains thread info
 *                                  ^^^^^^^^^^^^^   a pointer to a pointer to a return value
 *
 * `pthread_join` waits for the thread to terminate
 *
 * returns
 *     0                   on success
 *     non-zero error code on failure
 */

void pthread_exit (void *retval);
/*                 ^^^^^^^^^^^^   a pointer to a return value
 *
 *  `pthread_exit` terminates the calling thread and returns a value
 */

Race Condition

A race condition occurs when the outcome of a program depends on the interleaving of the execution of multiple threads accessing a critical section.

A critical section is a piece of code that accesses a shared variable and must not be concurrently executed by more than one thread.

Each instruction is executed atomically. Multiple threads executing these instructions can result in different interleavings (and outcomes) due to uncontrollable OS scheduling.

mov  0x2e50(%rip),%eax  # 4014 <counter>
add  $0x1,%eax
mov  %eax,0x2e47(%rip)  # 4014 <counter>

Avoiding race conditions

To avoid a race condition it must be ensured that only a single thread can execute a critical section at any given time.

Atomics: modifying a variable results in a single CPU instruction

#include <stdatomic.h>

In general, a critical section may contain complex logic. Primitives for mutual exclusion are needed–a guarantee that only one thread is executing the critical section while others are prevented from doing so. One way to achieve mutual exclusion is via locks

lock_t mutex
lock(&mutex)
critical_section
unlock(&mutex)

Advantages of threads

  • much of the code is identical

  • parallel execution: good CPU and network utilization

  • lower overhead than processes

  • shared-memory communication is possible

Disadvantages

  • synchronization is complicated

  • shared fate within a process: one rogue thread can hurt you

  • relatively insecure, no isolation


Network connections are processed in the sequence of their arrival. Sequential processing is relatively simple to implement, has relatively little persistent state to maintain, and has relatively few complex error conditions; but may have poor performance if a slow client blocks all the others, and poorly utilizes the network or CPU.

listen_fd = Listen(port);
while (1) {
  client_fd = accept(listen_fd);
  buf       = read(client_fd);
  write(client_fd, buf);
  close(client_fd);
}

Multi Processing#

Multi Threading#

Multithreading: a single process handles all connections

  • a parent thread dispatches/forks a new, child thread to handle each connection

  • a child thread handles a connection and exits when the connection terminates

  • one can create as many threads as desired, up to system limits

  • “forking multiple threads of execution in one process”

  • multiple threads share the same code and the same heap (but have different registers and different stacks) (shared data structures?)

  • the independency of threads is maintained via its own stack pointer, registers, scheduling properties (policy, priority, etc.), set of pending and blocked signals, and thread-specific data

A thread is an independent stream of instructions that can be scheduled to run as such by the OS. From the perspective of the developer, a thread is a procedure that runs independently of its main program. A multi-threaded program can be viewed as a main program that consists of a collection of procedures which are scheduled to run independently, or simultaneously.

  • a thread exists within a process, and uses the process’s resources, which it may share with other threads; if a thread modifies shared process resources then the modifications are seen be all other threads

    • closing a file

    • two pointers with the same value point to the same data

    • it is possible to read/write to the same memory location

  • as long as its process exists, and the OS supports it, a thread has its own, independent control flow

  • a thread replicates only those resources which it requires to be indepentantly schedulable

  • a thread dies when its process dies

  • a thread is lightweight in the sense that most of its overhead has already been handled via the creation of its process

/* server */
accept();         // client 1 connects to parent thread
pthread_create(); // client 1 connection transfers to child thread 1
accept();         // client 2 connects to parent thread
pthread_create(); // client 2 connection transfers to child thread 2
/* thread.c */

#include <pthread.h>

int main (int argc, char** argv) {

  pthread_create (pthread_t* thread, const pthread_attr_t* attr, void* (*start_routine)(void *), void* arg)
  /*              ^^^^^^^^^^^^^^^^^                                                                         a pthread library struct, holding thread info
   *                                 ^^^^^^^^^^^^^^^^^^^^^^^^^^                                             a set of attributes to apply to the thread
   *                                                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^             the function pointer
   *                                                                                             ^^^^^^^^^  an opaque data pointer to pass to the thread
   *
   * pthread_create - start a new thread in the calling process
   * the thread starts at the function pointer, and may be executed before `pthread_create` returns or `pthread_exit`
   */

  pthread_self ()

  pthread_join (pthread_t* thread, void** retval)
  /*            ^^^^^^^^^^^^^^^^^                 a pthread library struct, holding thread info
   *                               ^^^^^^^^^^^^^  a pointer to a pointer to a return value
   *
   * wait for a thread to terminate (maybe)
   */

  pthread_exit (void* retval)
  /*            ^^^^^^^^^^^^  a pointer to a return value
   *
   * pthread_exit - terminates the calling thread, and returns a value
   */

  return 0;
}
gcc -lpthread -g -Wall -O0 thread.c -o thread

/* thread_no_args.c */

#include <pthread.h> /* pthread_create, pthread_join, pthread_self, pthread_t */
#include <stdio.h>   /* printf */

void* func (void* arg) {
  printf("Hello from thread %lx\n", pthread_self());
  return NULL;
}

int main (int argc, char** argv) {
  pthread_t t1, t2, t3;
  printf("main thread %lx starting a new thread\n", pthread_self());

  pthread_create(&t1, NULL, func, NULL);
  pthread_create(&t2, NULL, func, NULL);
  pthread_create(&t3, NULL, func, NULL);

  pthread_join(t1, NULL);
  pthread_join(t2, NULL);
  pthread_join(t3, NULL);

  return 0;
}

/* thread_1_arg.c */

#include <pthread.h> /* pthread_create, pthread_join, pthread_self, pthread_t */
#include <stdio.h>   /* printf */

void* func (void* arg) {
  char* s = (char *) arg;
  printf("Hello from thread %s\n", s);
  return NULL;
}

int main (int argc, char** argv) {
  pthread_t t1, t2, t3;

  pthread_create(&t1, NULL, func, "a");
  pthread_create(&t2, NULL, func, "b");
  pthread_create(&t3, NULL, func, "c");

  pthread_join(t1, NULL);
  pthread_join(t2, NULL);
  pthread_join(t3, NULL);

  return 0;
}

/* thread_multiple_args.c
 *
 *     this is, effectively, a procedure call - why?
 */

#include <pthread.h> /* pthread_create, pthread_join, pthread_self, pthread_t */
#include <stdio.h>   /* printf */

typedef struct {
  int num;
  const char* str;
} foo_t;

void* func (void* arg) {
  foo_t* val = arg;
  printf("thread %lx was passed values %d, %s\n", pthread_self(), val->num, val->str);
  return NULL;
}

int main (int argc, char** argv) {
  foo_t v = {5678, "bar"};

  pthread_t t;

  printf("main thread %lx starting a new thread\n", pthread_self());

  pthread_create(&t, NULL, func, &v);

  pthread_join(t, NULL);

  return 0;
}

/* thread_returning_values.c */

#include <ctype.h>   /* toupper */
#include <pthread.h> /* pthread_create, pthread_join, pthread_self, pthread_t */
#include <stdio.h>   /* printf */
#include <stdlib.h>  /* malloc */
#include <string.h>  /* strcpy, strlen */

typedef struct {
  int num;
  char* str;
} foo_t;

void* func (void* arg) {
  foo_t* a = arg;
  foo_t* b = malloc(sizeof(foo_t));
  b->num = a->num * 2;
  b->str = malloc(strlen(a->str) + 1);
  strcpy(b->str, a->str);
  for (char* p = b->str; *p; ++p) {
    *p = toupper(*p);
  }

  return b;
}

int main (int argc, char** argv) {
  foo_t v = {5678, "bar"};
  foo_t* p;

  pthread_t t;

  printf("main thread %lx starting a new thread\n", pthread_self());

  pthread_create(&t, NULL, func, &v);

  pthread_join(t, (void**) &p);

  printf("thread returned num = %d, str = %s\n", p->num, p->str);

  return 0;
}
/* thread_returning_values_bad.c */

#include <pthread.h> /* pthread_create, pthread_join, pthread_self, pthread_t */
#include <stdio.h>   /* printf */

typedef struct {
  int num;
  const char* str;
} foo_t;

void* func (void* arg) {
  foo_t p;
  // fill p
  return &p; //    <-- segfaults, bc returns a pointer to a stack-allocated local variable
}

int main (int argc, char** argv) {
  foo_t v = {5678, "bar"};
  foo_t* p;

  pthread_t t;

  printf("main thread %lx starting a new thread\n", pthread_self());

  pthread_create(&t, NULL, func, &v);
  pthread_join(t, (void**) &p);

  printf("thread returned num = %d, str = %s\n", p->num, p->str);

  return 0;
}
$ ./thread_returning_values_bad 
main thread 7fab984ec0 starting a new thread
Segmentation fault

Race Conditions and Mutual Exclusion#

A race condition occurs when the result of a program depends on the interleaving of the execution of multiple threads that are accessing a critical section.

A critical section is a piece of code that accesses a shared piece of data, and which must not be executed by more than one thread concurrently.

To avoid race conditions, it must be ensured that only a single thread can execute a critical section at a given time. Mutual exclusion primitives would guaranteee that only one thread is executing the critical section while others are prevented from doing so.

  • atomics

  • locks

/* threads_no_mutex.c */

#include <pthread.h> /* pthread_create, pthread_join, pthread_self, pthread_t */
#include <stdio.h>   /* printf */

static int counter = 0;

void* func (void* arg) {
  for (int i = 0; i < 5000; ++i) {
    ++counter;
  }
  return NULL;
}

int main (int argc, char** argv) {
  pthread_t t1, t2;

  printf("counter = %d\n", counter);

  pthread_create(&t1, NULL, func, NULL);
  pthread_create(&t2, NULL, func, NULL);
  pthread_join(t1, NULL);
  pthread_join(t2, NULL);

  printf("both threads completed, counter = %d\n", counter);

  return 0;
}

If more than one thread executes the following instructions, the program produces nondeterministic output (due to uncontrollable OS scheduling)

mov  0x2e50(%rip),%eax    # 4014 <counter>
add  0x1,%eax
mov  %eax,0x2e47(%rip)    # 4015 <counter>

one possible interleaving:

                             (values after the instruction)
OS          T1    T2         program_counter    eax    counter
(before critical section)    100                 0     50
            mov 8049a1c,%eax 105                50     50      <-- copy the value in variable `counter` (50) to register `eax`
            add $0x1,%eax    108                51     50      <-- add 1 to the value in register `eax` (51 = 1 + 50)
interrupt
save T1
restore T2                   100                 0     50
            mov 8049a1c,%eax 105                50     50      <-- copy the value in variable `counter` (50) to register `eax`
            add $0x1,%eax    108                51     50      <-- add 1 to the value in register `eax` (51 = 1 + 50)
            mov %eax,8049a1c 113                51     51      <-- copy the value in register `eax` (51) to variable `counter` (which is currently 50)
interrupt
save T2
restore T1                   108                51     51
            mov %eax,8049a1c 113                51     51      <-- copy the value in register `eax` (51) to variable `counter` (which is already 51)

atomics#

/* threads_mutex_via_atomics.c */

#include <pthread.h>   /* pthread_create, pthread_join, pthread_self, pthread_t */
#include <stdatomic.h> /* atomic_int - modifying a variable results in a single CPU instruction */
#include <stdio.h>     /* printf */

static atomic_int counter = 0;

void* func (void* arg) {
  for (int i = 0; i < 5000; ++i) {
    ++counter;
  }
  return NULL;
}

int main (int argc, char** argv) {
  pthread_t t1, t2;

  printf("counter = %d\n", counter);

  pthread_create(&t1, NULL, func, NULL);
  pthread_create(&t2, NULL, func, NULL);
  pthread_join(t1, NULL);
  pthread_join(t2, NULL);

  printf("both threads completed, counter = %d\n", counter);

  return 0;
}

Locks#

/* threads_mutex_via_locks.c */

#include <pthread.h>   /* pthread_create, pthread_join, PTHREAD_MUTEX_INITIALIZER, pthread_mutex_lock, pthread_mutex_t, pthread_mutex_unlock, pthread_self, pthread_t */
#include <stdio.h>     /* printf */

static int counter = 0;
pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

void* func (void* arg) {
  for (int i = 0; i < 5000; ++i) {
    pthread_mutex_lock(&lock);   //
    ++counter;                   //  <--  critical section
    pthread_mutex_unlock(&lock); //
  }
  return NULL;
}

int main (int argc, char** argv) {
  pthread_t t1, t2;

  printf("counter = %d\n", counter);

  pthread_create(&t1, NULL, func, NULL);
  pthread_create(&t2, NULL, func, NULL);
  pthread_join(t1, NULL);
  pthread_join(t2, NULL);

  printf("both threads completed, counter = %d\n", counter);

  return 0;
}



Resources#

Tools and Technologies

  • [ h ][ w ] Clang

  • [ h ][ w ] GCC

  • [ h ][ w ] GDB

  • [ h ][ w ] Valgrind

  • [ h ][ w ] VirtualBox

Courses

Online

YouTube#

Chris Kanich

  • [ y ] 12-02-2020 “what’s the difference between processes, threads, and io multiplexing?”

  • [ y ] 11-17-2020 “The Linux socket API explained”.

  • [ y ] 11-03-2020 “understanding mmap, the workhorse behind keeping memory access efficient in linux”.

  • [ y ] 10-08-2020 “What’s behind a file descriptor in Linux? Also, i/o redirection with dup2.”.

  • [ y ] 10-08-2020 “Interacting with files in Linux”.

  • [ y ] 09-30-2020 “Signal handling in Linux”.

  • [ y ] 09-23-2020 “Moving beyond fork() for process creation in Linux”.

  • [ y ] 09-16-2020 “Introduction to Processes in Linux”.

  • [ y ] 09-02-2020 “Linux Executable Symbol Relocation Explained”.

  • [ y ] 09-01-2020 “How do linkers resolve symbols? Systems Programming CS Lecture”.

freeCodeCamp

  • [ y ] 05-29-2024 “Learn C Programming and OOP with Dr. Chuck [feat. classic book by Kernighan and Ritchie]”.

  • [ y ] 08-18-2022 “Dr. Chuck reads C Programming (the classic book by Kernigan and Ritchie)”.

  • [ y ] 01-27-2021 “Data Structures - Full Course Using C and C++”.

  • [ y ] 08-15-2018 “C Programming Tutorial for Beginners”.

Jacob Sorber

  • [ y ] 06-03-2023. “How to get an IP address from a host name? (Example in C)”.

  • [ y ] 02-28-2023. “Sockets and Pipes Look Like Files (Unix/fdopen)”.

  • [ y ] 05-31-2022. “Detect Corruption with a Checksum”.

  • [ y ] 07-27-2021. “Highlighting a Student Programming Project (Code Review)”.

  • [ y ] 06-01-2021. “Why are Progress Bars Always Wrong? Progress Bar Example in C.”.

  • [ y ] 11-04-2020. “Easy Networking in C (libcurl)”.

  • [ y ] 02-18-2020. “How one thread listens to many sockets with select in C.”.

  • [ y ] 10-24-2019. “How to write a multithreaded webserver using condition variables (Part 3)”.

  • [ y ] 10-08-2019. “Multithreaded Server Part 2: Thread Pools”.

  • [ y ] 09-20-2019. “How to write a multithreaded server in C (threads, sockets)”.

  • [ y ] 01-30-2019. “Dealing with Endianness Issues in your Programs”.

  • [ y ] 12-04-2018. “Socket servers can get client addresses. (accept, inet_ntop)”.

  • [ y ] 11-20-2018. “Program your own web server in C. (sockets)”.

  • [ y ] 10-12-2018. “How to build a web client? (sockets)”.

More

  • [ y ] 05-29-2024 Cococry. “Make a GUI Task Management App in pure C (no bloat required)”.

  • [ y ] 04-11-2024 kimylamp. “Signals. I spent 2 years to understand this part.”.

  • [ y ] 10-17-2024 Core Dumped. “How a Single Bit Inside Your Processor Shields Your Operating System’s Integrity”.

[ h ][ w ] Berkeley Packet Filter (BPF)

  • [ y ] 02-23-2023 freeCodeCamp. “Introduction to Linux – Full Course for Beginners”. YouTube.

  • [ y ] 11-25-2023 Nir Lichtman. “Making Minimalist Web Server in C on Linux”.

  • [ y ] 03-15-2023 Tsoding Daily. “C is Just Better than Bash”.

  • [ y ] 05-28-2023 NeuralNine. “5 Awesome Linux Terminal Tools You Must Know”.

  • [ y ] 07-05-2024 typecraft. “This may be my favorite CLI tool ever”.

  • [ y ] 09-26-2024 Dave’s Garage. “Kernel Mode vs User Mode: Why it Matters, What You Need to Know”.

  • [ y ] 01-24-2017 _Drunk Engineer_. “OSI and TCP IP Models - Best Explanation”.

https://cs.brown.edu/courses/csci0300/2022/notes/l09.html

https://sites.google.com/site/embeddedmonologue/home/c-programming/logic-and-arithmetic-shift-and-divide-by-2

https://stackoverflow.com/questions/3952123/representation-of-negative-numbers-in-c

https://guyonbits.com/from-rodata-to-rwdata-introduction-to-memory-mapping-and-ld-scripts/

https://stackoverflow.com/questions/12102332/when-should-i-use-perror-and-fprintfstderr

https://www.cs.yale.edu/homes/aspnes/pinewiki/C(2f)Pointers.html

https://stackoverflow.com/questions/7672560/reading-in-a-variable-length-string-user-input-in-c

https://www.quora.com/Why-does-char-0-successfully-convert-a-char-to-int-in-C


Texts#

C

  • 2019 Amini, Kamran. Extreme C: Taking you to the limit in Concurrency, OOP, and the most advanced capabilities of C. Packt.

  • [ h ] 2015 Bryant, Randal E. & David R. O’Hallaron. Computer Systems: A Programmer’s Perspective. 3e.

  • [ h ][ g ] 2022 Deitel, Paul & Harvey Deitel. C How to Program. 9e.

  • 2019 Gustedt, Jens. Modern C. Manning.

  • 1996 Hanson, David R. C Interfaces and Implementations: Techniques for Creating Reusable Software. Addison-Wesley Professional.

  • 2020 Harwani, B. M. Practical C Programming: Solutions for modern C developers to create efficient and well-structured programs. Packt.

  • [ w ] 1988 Kernighan, Brian W. & Dennis M. Ritchie. C Programming Language. 2e. Pearson.

  • 2014 Klemens, Ben. 21st Century C: C Tips from the New School. 2e. O’Reilly.

  • 1999 Loudon, Kyle. Mastering Algorithms with C: Useful Techniques from Sorting to Encryption. O’Reilly.

  • 2021 Loy, Marc. Smaller C: Learn Code for Small Machines. O’Reilly.

  • 2022 Oualline, Stephen. Bare Metal C: Embedded Programming for the Real World. No Starch Press.

  • 2022 Preschern, Christopher. Fluent C: Principles, Practices, and Patters. O’Reilly.

  • 2013 Reese, Richard M. Understanding and Using C Pointers: Core Techniques for Memory Management. O’Reilly.

  • 2023 Sandler, Nora. Writing a C Compiler: Build a Real Programming Language from Scratch. No Starch Press.

  • 2020 Seacord, Robert C. Effective C: An Introduction to Professional C Programming. No Starch Press.

  • 1997 Sedgewick, Robert. “Algorithms in C, Parts 1-4: Fundamentals, Data Structures, Sorting, Searching”. 3e. Addison-Wesley Professional.

  • 2001 Sedgewick, Robert. “Algorithms in C, Part 5: Graph Algorithms”. 3e. Addison-Wesley Professional.

  • [ h ][ w ] 2013 Stevens, W. Richard & Stephen A. Rago. Advanced Programming in the UNIX Environment. 3e. Addison-Wesley Professional.

  • 2020 Szuhay, Jeff. Learn C Programming: A beginner’s guide to learning C programming the easy and disciplined way. Packt.

  • 1989 Tenenbaum, Aaron M.; Yedidyah Langsam; & Moshe J. Augenstein. “Data Structures Using C”. Prentice Hall.

  • 1994 Van Der Linden, Peter. Expert C Programming: Deep C Secrets. Pearson.

  • 2019 Van Winkle, Lewis. Hands-On Network Programming with C: Learn socket programming in C and write secure and optimized network code. Packt.

Systems

  • [ h ] Arpaci-Dusseau, Remzi h. & Andrea C. Arpaci-Dusseau. Operating Systems: Three Easy Pieces.

  • Gregg, Brendan. (2020). Systems Performance: Enterprise and the Cloud. 2nd Ed. Pearson.

  • Matthews, Suzanne J.; Tia Newhall, & Kevin C. Webb. (2022). Dive Into Systems: A Gentle Introduction to Computer Systems. No Starch Press.

Linux

  • [ h ][ g ] Albing, Carl & JP Vossen. (2022). bash Idioms: Write Powerful, FLexible, Readable Shell Scripts. O’Reilly.

  • [ h ][ g ] Albing, Carl & JP Vossen. (2017). bash Cookbook: Solutions and Example for bash Users. O’Reilly.

  • [ h ] Arpaci-Dusseau, Remzi H. & Andrea C. Arpaci-Dusseau. Operating Systems: Three Easy Pieces.

  • Barrett, Daniel J. (2024). Linux Pocket Guide: Essential Commands. 4e. O’Reilly.

  • [ h ][ g ] Barrett, Daniel J. (2022). Efficient Linux at the Command Line: Boost Your Command-Line Skills. O’Reilly.

  • [ h ] Bryant, Randal E. & David R. O’Hallaron. (2015). Computer Systems: A Programmer’s Perspective. 3e. Pearson.

  • Calavera, David & Lorenzo Fontana. (2019). Linux Observability with BPF: Advanced Programming for Performance Analysis and Networking. O’Reilly.

  • [ h ] Cooper, Mendel. (2014). Advanced Bash-Scripting Guide: An in-depth exploration of the art of shell scripting. The Linux Documentation Project.

  • [ h ] Garrels, Machtelt. (2008). Bash Guide for Beginners. The Linux Documentation Project.

  • Gregg, Brendan. (2019). BPF Performance Tools: Linux System and Application Observability. Addison-Wesley Professional.

  • [ h ][ g ] Hausenblas, Michael. (2022). Learning Modern Linux: A Handbook for the Cloud Native Practitioner. O’Reilly.

  • [ h ][ g ] Hess, Kenneth. (2023). Practical Linux System Administration: A Guide to Installation, Configuration, and Management. O’Reilly.

  • [ w ] Kernighan, Brian W. & Rob Pike. (1984). The Unix Programming Environment.

  • [ h ] Kerrisk, Michael. (2010). The Linux Programming Interface: A Linux and UNIX System Programming Handbook. No Starch Press.

  • [ h ] Matthews, Suzanne J.; Tia Newhall; & Kevin C. Webb. (2022). Dive Into Systems: A Gentle Introduction to Computer Systems. No Starch Press.

  • Messier, Ric. (2024). Learning Kali Linux: Security Testing, Penetration Testing, & Ethical Hacking. 2e. O’Reilly.

  • Nemeth, Evi et al. (2021). Unix and Linux System Administration Handbook. 5e. Pearson.

  • Nikkel, Bruce. (2021). Practical Linux Forensics: A Guide for Digital Investigators. No Starch Press.

  • [ h ] OccupyTheWeb. (2018). Linux Basics for Hackers: Getting Started with Networking, Scripting, and Security in Kali. No Starch Press.

  • [ w ] Raymond, Eric S. (2003). The Art of Unix Programming. Addison-Wesley Professional.

  • Robbins, Arnold. (2016). Bash Pocket Reference: Help for Power Users & Sys Admins. O’Reilly.

  • Schroder, Carla. (2021). Linux Cookbook: Essential Skills for Linux Users and System & Network Administrators. 2e. O’Reilly.

  • Shotts, William E. (2019). The Linux Command Line: A Complete Introduction. 2e. No Starch Press.

  • [ h ][ w ] Stevens, W. Richard & Stephen A. Rago. (2013). Advanced Programming in the UNIX Environment. 3e. Addison-Wesley Professional.

  • [ w ] Tanenbaum, Andrew S. Modern Operating Systems. 5e.

  • [ w ] Tanenbaum, Andrew S. Operating Systems: Design and Implementation. Pearson.

  • [ h ][ g ] Troncone, Paul & Carl Albing. (2019). Cybersecurity Ops with bash: Attack, Defend, and Analyze from the Command Line. O’Reilly.

  • Ward, Brian. (2021). How Linux Works: What Every Superuser Should Know. 3e. No Starch Press.

  • Weiss, Steward. (2025). Introduction to System Programming in Linux. No Starch Press.

2003 [ g ] Stevens, W. Richard; Bill Fenner; & Andrew M. Rudoff. The Sockets Networking API: UNIX Network Programming Vol. I. 3e. Addison-Wesley Professional.

2011 Fall, Kevin R. & W. Richard Stevens. TCP/IP Illustrated, Volume 1: The Protocols. 2e. Addison-Wesley Professional.

1995 Wright, Gary R. & W. Richard Stevens. TCP/IP Illustrated, Volume 2: The Implementation. Addison-Wesley Professional.

2014 Comer, Douglas. Internetworking with TCP/IP, Volume 1: Principles, Protocol, and Architecture. 6e. Pearson.

Networking

  • Chou, Eric. (2020). Mastering Python Networking. 3rd Ed. Packt.

  • [ g ][ g ] Collins, Michael. (2017). Network Security Through Data Analysis: From Data to Action. 2nd Ed. O’Reilly.

  • Fall, Kevin R. & W. Richard Stevens. (2011). TCP/IP Illustrated Vol I: The Protocols. 2nd Ed. Addison-Wesley Professional.

  • Fettig, Abe & Jessica McKellar. (2013). Twisted Network Programming Essentials: Event-Driven Network Programming with Python. 2nd Ed. O’Reilly.

  • Gilman, Evan & Doug Barth. (2017). Zero Trust Networks: Building Secure Systems in Untrusted Networks. O’Reilly.

  • Keshav, Srinivasan. (2012). Mathematical Foundations of Computer Networking. Addison-Wesley Professional.

  • Kozierok, Charles M. (2005). TCP/IP Guide: A Comprehensive, Illustrated Internet Protocols Reference. No Starch Press.

  • [ h ] Kurose, James & Keith Ross. (2016). Computer Networking: A Top-Down Approach. 8th Ed. Pearson.

  • Liu, Cricket & Paul Albitz. DNS and Bind: Help for System Administrators. 5th Ed. O’Reilly.

  • Nadeau, Thomas D. & Ken Gray. (2013). SDN Software Defined Networks: An Authoritative Review of Network Programmability Technologies. O’Reilly.

  • [ g ] Oswalt et al. (2023). Network Programmability and Automation: Skills for the Next-Generation Network Engineer. 2nd Ed. O’Reilly.

  • [ h ][ d ][ g ] Peterson, Larry & Bruce Davie. (2021). Computer Networks: A Systems Approach. The Morgan Kaufmann Series in Networking.

  • [ h ][ d ][ g ] Peterson, Larry et al. Software-Defined Networks: A Systems Approach.

  • Sanders, Chris. (2017). Practical Packet Analysis: Using Wireshark to Solve Real-World Network Problems. 3rd Ed. No Starch Press.

  • Tanenbaum, Andrew S.; Nick Feamster; & David Wehterall. (2021). Computer Networks. 6th Ed. Pearson.

  • White, Russ & Ethan Banks. (2017). Computer Networking Problems and Solutions: An Innovative Approach to Building Resilient, Modern Networks. Pearson.

Profiling and Performance

2020 Gregg, Brendan. Systems Performance: Enterprise and the Cloud. Addison-Wesley Professional.

[ h ][ pdf ] Arndt, Jörg. Matters Computational: Ideas, Algorithms, and Source Code.


Figures#

  • [ w ] 1937-2019 Cohen, Danny


Terms#

  • [ w ] Address Space

  • [ w ] ANSI C

  • [ w ] Bounds Checking

  • [ w ] BSS (Block Starting Symbol) Segment

  • [ w ] C11

  • [ w ] C17

  • [ w ] C23

  • [ w ] C99

  • [ w ] C Data Types

  • [ w ] C File I/O

  • [ w ] C POSIX Library

  • [ w ] C Preprocessor

  • [ w ] C

  • [ w ] C Standard Library

  • [ w ] C Syntax

  • [ w ] Call Stack

  • [ w ] Checksum

  • [ w ] Code/Text Segment

  • [ w ] Column-Major Order

  • [ w ] Data Segment

  • [ w ] Device Driver

  • [ w ] Directive

  • [ w ] Include Directive

  • [ w ] Instruction Scheduling

  • [ w ] Interrupt

  • [ w ] Interrupt Handler

  • [ w ] Linker

  • [ w ] Loader

  • [ w ] Make

  • [ w ] Memory Address

  • [ w ] Memory Management

  • [ w ] Object File

  • [ w ] Pointer

  • [ w ] Pragma

  • [ w ] printf

  • [ w ] Reference

  • [ w ] Relocation

  • [ w ] Row-Major Order

  • [ w ] scanf

  • [ w ] Shared Library

  • [ w ] sizeof

  • [ w ] Stack

  • [ w ] Stack-Based Memory Allocation

  • [ w ] struct

  • [ w ] Systems Programming

  • [ w ] typedef

  • [ w ] Union

  • [ w ] 64-bit computing

  • [ w ] Addressing Mode

  • [ w ] Berkeley Sockets

  • [ w ] Berkeley Software Distribution, history

  • [ w ] C Data Types

  • [ w ] C Standard Library

  • [ w ] Instruction Set Architecture

  • [ w ] Loader

  • [ w ] Network Socket

  • [ w ] 32-bit Computing

  • [ w ] Berkeley Sockets

  • [ w ] Berkeley Software Distribution (BSD)

  • [ w ] Bit Numbering

  • [ w ] Bitwise Operation

  • [ w ] Byte

  • [ w ] Byte Addressing

  • [ w ] Endianness

  • [ w ] File Descriptor

  • [ w ] Handle

  • [ w ] Inter Process Communication (IPC)

  • [ w ] Network Socket

  • [ w ] POSIX

  • [ w ] Standard Stream

  • [ w ] Unix Domain Socket

  • [ w ] Unix Philosophy

  • [ w ] Word

  • [ w ] Word Addressing

  • [ w ] Word Mark

Memory

  • [ w ] Address Space

  • [ w ] Bank Switching

  • [ w ] Cache

  • [ w ] CPU Cache

  • [ w ] Flat Memory Model

  • [ w ] Memory Address

  • [ w ] Memory Paging

Memory Segmentation

  • [ w ] .bss (Block Starting Symbol)

  • [ w ] .data

  • [ w ] .text

  • [ w ] x86 Memory Segmentation

C Standard Library

  • [ w ] errno.h

Programming

  • [ w ] Automatic Variable

  • [ w ] Continuation

  • [ w ] Continuation-Passing Style

  • [ w ] Control Flow

  • [ w ] Global Variable

  • [ w ] Local Variable

  • [ w ] Return Statement

  • [ w ] Re-entrancy

  • [ w ] Scope

  • [ w ] Static Variable

  • [ w ] Variable

Architecture

  • [ w ] 1-bit Computing

  • [ w ] 8-bit Computing

  • [ w ] 16-bit Computing

  • [ w ] 8008 (8-bit microprocessor)

  • [ w ] 8080 (8-bit microprocessor)

  • [ w ] 8086 (16-bit microprocessor)

  • [ w ] 8088

  • [ w ] 80186

  • [ w ] 80286 (16-bit microprocessor)

  • [ w ] 80386 (i386) (32-bit microprocessor)

  • [ w ] 80486 (i486) (32-bit microprocessor)

  • [ w ] Asynchronous Circuit

  • [ w ] Bus Interface Unit (BIU)

  • [ w ] Cache Prefetching

  • [ w ] Clock Generator

  • [ w ] Clock Signal

  • [ w ] Combinatorial Logic

  • [ w ] Complex Instruction Set Computer (CISC)

  • [ w ] Duty Cycle

  • [ w ] Execution Unit (EU)

  • [ w ] Flip-Flop

  • [ w ] Instruction Set Architecture (ISA)

  • [ w ] Instructions Per Second (IPS)

  • [ w ] Logic Gate

  • [ w ] Opcode

  • [ w ] Prefetch Input Queue (PIQ)

  • [ w ] Race Condition

  • [ w ] Reduced Instruction Set Computer (RISC)

  • [ w ] Sequential Logic

  • [ w ] Synchronous Circuit

  • [ w ] x86

  • [ w ] x86-64 (x64)

  • [ w ] x86 Memory Segmentation

  • [ w ] 32-bit x86

  • 64-bit x64