Everybody’s done a Hello World program before. But now that I’ve got a few years of experience with the language, I set out to ask one of the most pressing questions out there - how do we make Hello World in C as convoluted and hard to understand as possible? This post documents the final result of a sleep-deprived me trying to do exactly that.

I quickly realized I had to set a few ground rules first so there would be a sensible limit for lines of code that are “useless”:

  • Since we’re using pure C, we won’t have classes (so no HelloWorldFactoryFactoryFactorySingletons)
  • All #include directives should be essential to the program (so no chaining a billion .h files)
  • Each function must do something essential to the program (so no functions that just call another or useless #defines)
  • The program takes no input, and writes Hello World! exactly to the terminal (so no other calculations are done)

With that, let’s start by building our string in a separate function with malloc instead:

#include <stdio.h>
#include <stdlib.h>

char* generate_words() {
  char* ret = malloc(13);

  ret[0] = 'H';
  ret[1] = 'e';
  ret[2] = 'l';
  ret[3] = 'l';
  ret[4] = 'o';
  ret[5] = ' ';
  ret[6] = 'W';
  ret[7] = 'o';
  ret[8] = 'r';
  ret[9] = 'l';
  ret[10] = 'd';
  ret[11] = '!';
  ret[12] = '\0'; // Can't forget our null operator!

  return ret;
}

int main() {
  char *ret = generate_words();
  printf(ret);
  free(ret);
  return 0;
}

That’s already much more complicated than before, but we can make it even worse! Replacing our direct assignments of letters to bitwise operations gets us this monstrosity:

#include <stdio.h>
#include <stdlib.h>

char* generate_words() {
  char* ret = malloc(12);
  char c = 0x01;

  ret[0] = (c << 6) | (c << 3); // H is 0x48
  ret[1] = c | (c << 2) | (c << 5) | (c << 6); // 'e' is 101
  ret[2] = ((c | (c << 1)) << 2) | ((c | (c << 1)) << 5); // 'l' is 108
  ret[3] = ((c | (c << 1)) << 2) | ((c | (c << 1)) << 5); // 'l' is 108
  ret[4] = (c | (c << 1)) | ((c | (c << 1)) << 2) | ((c | (c << 1)) << 5); // 'o' is 111
  ret[5] = (c << 5); // ' ' is 32
  ret[6] = c | (c << 2) | (c << 1) | (c << 1) << 3 | (c << 1) << 5; // 'W' is 87
  ret[7] = (c | (c << 1)) | ((c | (c << 1)) << 2) | ((c | (c << 1)) << 5); // 'o' is 111
  ret[8] = (c << 1) | ((c << 1) | (c << 2) | c) << 4; // 'r' is 114
  ret[9] = ((c | (c << 1)) << 2) | ((c | (c << 1)) << 5); // 'l' is 108
  ret[10] = (c << 2) | (c << 5) | (c << 6); // 'd' is 100
  ret[11] = (c << 5) | c; // '!' is 33
  ret[12] = 0;
 
  return ret;
}

int main() {
  char *ret = generate_words();
  printf(ret);
  free(ret);
  return 0;
}

As you can see, I’ve added comments and some extra brackets for readability’s sake. However, what if we remove our c variable to save space and only depend on previously defined values in ret for our calculations?

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

volatile u_int8_t a = 0 - 1;
volatile u_int8_t * z = NULL;

u_int8_t* generate_words() {
  u_int8_t * ret = malloc(13);

  ret[0] = (1 << 6) | (1 << 3);
  ret[1] = (ret[0] & (1 << 6)) | (ret[0] >> 1) | (ret[0] >> 6);
  ret[2] = ret[0] | ret[1] & ~(ret[0] >> 6);
  ret[4] = ret[2] | ret[1] | (ret[0] >> 5);
  ret[5] = (ret[1] & ret[2] & ret[0]) >> 1;
  ret[6] = (ret[4] & ~ret[5] | (ret[0] >> 2)) & ~(ret[0] >> 4 << 1);
  ret[8] = (~ret[1] & ret[6]) | ret[5] | ret[5] << 1;
  ret[10] = ret[1] & ~(ret[5] >> 5);
  ret[11] = ret[5] | ((ret[10] & ret[8]) >> 6);
  ret[12] = 0;

  memcpy(ret + 3, ret + 2, 1);
  memcpy(ret + 9, ret + 3, 1);
  memcpy(ret + 7, ret + 4, 1);

  ret[12] = 0;

  return ret;
}

int main() {
  u_int8_t *ret = generate_words();
  printf("%s", ret);
  free(ret);
  return 0;
}

Now our code’s starting to look good (bad?). The only problem is that it takes way too long to run - my super-powerful laptop processor from two years ago took nearly 5 milliseconds to run it! Well, we could also optimize it to be cache-aligned for maximum performance:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <zconf.h>

u_int8_t* generate_words() {
    u_int8_t * ret = aligned_alloc(sysconf(_SC_LEVEL1_DCACHE_LINESIZE), 13);

    ret[0] = (1 << 6) | (1 << 3);
    ret[1] = (ret[0] & (1 << 6)) | (ret[0] >> 1) | (ret[0] >> 6);
    ret[2] = ret[0] | ret[1] & ~(ret[0] >> 6);
    ret[4] = ret[2] | ret[1] | (ret[0] >> 5);
    ret[5] = (ret[1] & ret[2] & ret[0]) >> 1;
    ret[6] = (ret[4] & ~ret[5] | (ret[0] >> 2)) & ~(ret[0] >> 4 << 1);
    ret[8] = (~ret[1] & ret[6]) | ret[5] | ret[5] << 1;
    ret[10] = ret[1] & ~(ret[5] >> 5);
    ret[11] = ret[5] | ((ret[10] & ret[8]) >> 6);
    ret[12] = 0;

    memcpy(ret + 3, ret + 2, 1);
    memcpy(ret + 9, ret + 3, 1);
    memcpy(ret + 7, ret + 4, 1);

    ret[12] = 0;

    return ret;
}

int main() {
    u_int8_t *ret = generate_words();
    printf("%s", ret);
    free(ret);
    return 0;
}

Well, now we’ve got way too many lines. Let’s trim down on it by getting rid of the function and putting our code into a for loop:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main() {
    u_int8_t * ret = aligned_alloc(sysconf(_SC_LEVEL1_DCACHE_LINESIZE), 13);

    for (__auto_type i = 0; i < 13; i++) {
        if (i == 0)
            ret[i] = (1 << (i + 6)) | (1 << (i + 3));
        else if (i == 1)
            ret[i] = (ret[i - 1] & (1 << (i + 5))) | (ret[i - 1] >> i) | (ret[i - 1] >> (i + 5));
        else if (i == 2)
            ret[i] = ret[i - 2] | ret[1] & ~(ret[i - 2] >> (i + 4));
        else if (i == 4)
            ret[i] = ret[i - 2] | ret[i - 3] | (ret[i - 4] >> (i + 1));
        else if (i == 5)
            ret[i] = (ret[i - 4] & ret[i - 3] & ret[i - 5]) >> (i - 4);
        else if (i == 6)
            ret[i] = (ret[i - 2] & ~ret[i - 1] | (ret[i - 6] >> (i - 4))) & ~(ret[i - 6] >> (i - 2) << (i - 5));
        else if (i == 8)
            ret[i] = (~ret[i - 7] & ret[i - 2]) | ret[i - 3] | ret[i - 3] << (i - 7);
        else if (i == 10)
            ret[i] = ret[i - 9] & ~(ret[i - 5] >> (i - 5));
        else if (i == 11)
            ret[i] = ret[i - 6] | ((ret[i - 1] & ret[i - 3]) >> (i - 5));
        else
            ret[i] = i - i;
    }

    memcpy(ret + 3, ret + 2, 1);
    memcpy(ret + 9, ret + 3, 1);
    memcpy(ret + 7, ret + 4, 1);

    ret[12] = 0;
    printf("%s", ret);
    free(ret);
    return 0;
}

Wait, that for loop we added there could be written in an even more convoluted way! Let’s do so:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

volatile unsigned int a = 0 - 1;
volatile u_int8_t * z = NULL;

int main() {
    switch(a) {
        case 0 - 1:
            z = aligned_alloc(sysconf(_SC_LEVEL1_DCACHE_LINESIZE), 1 * 10 + 3);
            break;
        case 1 * 10 + 3:
            memcpy(z + 3, z + 2, 1);
            memcpy(z + 9, z + 3, 1);
            memcpy(z + 7, z + 4, 1);
            printf("%s", z);
            free(z);
            return 0;
        default:
            z[a] = a == 0 ? z[a] = (1 << (a + 6)) | (1 << (a + 3)) : a == 1 ? (z[a - 1] & (1 << (a + 5))) | (z[a - 1] >> a) | (z[a - 1] >> (a + 5))  : a == 2 ? z[a - 2] | z[1] & ~(z[a - 2] >> (a + 4)) : a == 4 ? z[a - 2] | z[a - 3] | (z[a - 4] >> (a + 1)) : a == 5 ? (z[a - 4] & z[a - 3] & z[a - 5]) >> (a - 4) : a == 6 ? (z[a - 2] & ~z[a - 1] | (z[a - 6] >> (a - 4))) & ~(z[a - 6] >> (a - 2) << (a - 5)) : a == 8 ? (~z[a - 7] & z[a - 2]) | z[a - 3] | z[a - 3] << (a - 7) : a == 10 ? z[a - 9] & ~(z[a - 5] >> (a - 5)) : a == 11 ? z[a - 6] | ((z[a - 1] & z[a - 3]) >> (a - 5)) : a - a;
    }

    a = a + 1;
    main();
}

Well, our switch there is taking quite a bit of space, isn’t it? Let’s fix that and clean up the code a bit:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

volatile u_int8_t a = 0 - 1;
volatile u_int8_t * z = NULL;

int main() {
    a==0-1?z=aligned_alloc(sysconf(_SC_LEVEL1_DCACHE_LINESIZE),1*10+3):a!=1*10+3?z[a]=a==0?z[a]=(1<<(a+6))|(1<<(a+3)):a==1?(z[a-1]&(1<<(a+5)))|(z[a-1]>>a)|(z[a-1]>>(a+5)):a==2?z[a-2]|z[1]&~(z[a-2]>>(a+4)):a==4?z[a-2]|z[a-3]|(z[a-4]>>(a+1)):a==5?(z[a-4]&z[a-3]&z[a-5])>>(a-4):a==6?(z[a-2]&~z[a-1]|(z[a-6]>>(a-4)))&~(z[a-6]>>(a-2)<<(a-5)):a==8?(~z[a-7]&z[a-2])|z[a-3]|z[a-3]<<(a-7):a==10?z[a-9]&~(z[a-5]>>(a-5)):a==11?z[a-6]|((z[a-1]&z[a-3])>>(a-5)):a-a:0;if(a==1*10+3){memcpy(z+3,z+2,1);memcpy(z+9,z+3,1);memcpy(z+7,z+4,1);printf("%s",z);free(z);return 0;}a=a+1;main();
}

And there we go - a program that prints Hello World, in just a single line of logic!

I’m sure there’s more ways to improve the unreadability of the project, so if you see something that could be improved, please let me know!