This is What Peak Hello World Looks Like
Everybody’s done a Hello World program before. But now that I’ve got a few years of experience with the language, I set out to ask one of the most pressing questions out there - how do we make Hello World in C as convoluted and hard to understand as possible? This post documents the final result of a sleep-deprived me trying to do exactly that.
I quickly realized I had to set a few ground rules first so there would be a sensible limit for lines of code that are “useless”:
- Since we’re using pure C, we won’t have classes (so no
HelloWorldFactoryFactoryFactorySingleton
s) - All
#include
directives should be essential to the program (so no chaining a billion.h
files) - Each function must do something essential to the program (so no functions that just call another or useless
#defines
) - The program takes no input, and writes
Hello World!
exactly to the terminal (so no other calculations are done)
With that, let’s start by building our string in a separate function with malloc
instead:
#include <stdio.h>
#include <stdlib.h>
char* generate_words() {
char* ret = malloc(13);
ret[0] = 'H';
ret[1] = 'e';
ret[2] = 'l';
ret[3] = 'l';
ret[4] = 'o';
ret[5] = ' ';
ret[6] = 'W';
ret[7] = 'o';
ret[8] = 'r';
ret[9] = 'l';
ret[10] = 'd';
ret[11] = '!';
ret[12] = '\0'; // Can't forget our null operator!
return ret;
}
int main() {
char *ret = generate_words();
printf(ret);
free(ret);
return 0;
}
That’s already much more complicated than before, but we can make it even worse! Replacing our direct assignments of letters to bitwise operations gets us this monstrosity:
#include <stdio.h>
#include <stdlib.h>
char* generate_words() {
char* ret = malloc(12);
char c = 0x01;
ret[0] = (c << 6) | (c << 3); // H is 0x48
ret[1] = c | (c << 2) | (c << 5) | (c << 6); // 'e' is 101
ret[2] = ((c | (c << 1)) << 2) | ((c | (c << 1)) << 5); // 'l' is 108
ret[3] = ((c | (c << 1)) << 2) | ((c | (c << 1)) << 5); // 'l' is 108
ret[4] = (c | (c << 1)) | ((c | (c << 1)) << 2) | ((c | (c << 1)) << 5); // 'o' is 111
ret[5] = (c << 5); // ' ' is 32
ret[6] = c | (c << 2) | (c << 1) | (c << 1) << 3 | (c << 1) << 5; // 'W' is 87
ret[7] = (c | (c << 1)) | ((c | (c << 1)) << 2) | ((c | (c << 1)) << 5); // 'o' is 111
ret[8] = (c << 1) | ((c << 1) | (c << 2) | c) << 4; // 'r' is 114
ret[9] = ((c | (c << 1)) << 2) | ((c | (c << 1)) << 5); // 'l' is 108
ret[10] = (c << 2) | (c << 5) | (c << 6); // 'd' is 100
ret[11] = (c << 5) | c; // '!' is 33
ret[12] = 0;
return ret;
}
int main() {
char *ret = generate_words();
printf(ret);
free(ret);
return 0;
}
As you can see, I’ve added comments and some extra brackets for readability’s sake. However, what if we remove our c
variable to save space and only depend on previously defined values in ret
for our calculations?
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
volatile u_int8_t a = 0 - 1;
volatile u_int8_t * z = NULL;
u_int8_t* generate_words() {
u_int8_t * ret = malloc(13);
ret[0] = (1 << 6) | (1 << 3);
ret[1] = (ret[0] & (1 << 6)) | (ret[0] >> 1) | (ret[0] >> 6);
ret[2] = ret[0] | ret[1] & ~(ret[0] >> 6);
ret[4] = ret[2] | ret[1] | (ret[0] >> 5);
ret[5] = (ret[1] & ret[2] & ret[0]) >> 1;
ret[6] = (ret[4] & ~ret[5] | (ret[0] >> 2)) & ~(ret[0] >> 4 << 1);
ret[8] = (~ret[1] & ret[6]) | ret[5] | ret[5] << 1;
ret[10] = ret[1] & ~(ret[5] >> 5);
ret[11] = ret[5] | ((ret[10] & ret[8]) >> 6);
ret[12] = 0;
memcpy(ret + 3, ret + 2, 1);
memcpy(ret + 9, ret + 3, 1);
memcpy(ret + 7, ret + 4, 1);
ret[12] = 0;
return ret;
}
int main() {
u_int8_t *ret = generate_words();
printf("%s", ret);
free(ret);
return 0;
}
Now our code’s starting to look good (bad?). The only problem is that it takes way too long to run - my super-powerful laptop processor from two years ago took nearly 5 milliseconds to run it! Well, we could also optimize it to be cache-aligned for maximum performance:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <zconf.h>
u_int8_t* generate_words() {
u_int8_t * ret = aligned_alloc(sysconf(_SC_LEVEL1_DCACHE_LINESIZE), 13);
ret[0] = (1 << 6) | (1 << 3);
ret[1] = (ret[0] & (1 << 6)) | (ret[0] >> 1) | (ret[0] >> 6);
ret[2] = ret[0] | ret[1] & ~(ret[0] >> 6);
ret[4] = ret[2] | ret[1] | (ret[0] >> 5);
ret[5] = (ret[1] & ret[2] & ret[0]) >> 1;
ret[6] = (ret[4] & ~ret[5] | (ret[0] >> 2)) & ~(ret[0] >> 4 << 1);
ret[8] = (~ret[1] & ret[6]) | ret[5] | ret[5] << 1;
ret[10] = ret[1] & ~(ret[5] >> 5);
ret[11] = ret[5] | ((ret[10] & ret[8]) >> 6);
ret[12] = 0;
memcpy(ret + 3, ret + 2, 1);
memcpy(ret + 9, ret + 3, 1);
memcpy(ret + 7, ret + 4, 1);
ret[12] = 0;
return ret;
}
int main() {
u_int8_t *ret = generate_words();
printf("%s", ret);
free(ret);
return 0;
}
Well, now we’ve got way too many lines. Let’s trim down on it by getting rid of the function and putting our code into a for
loop:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
int main() {
u_int8_t * ret = aligned_alloc(sysconf(_SC_LEVEL1_DCACHE_LINESIZE), 13);
for (__auto_type i = 0; i < 13; i++) {
if (i == 0)
ret[i] = (1 << (i + 6)) | (1 << (i + 3));
else if (i == 1)
ret[i] = (ret[i - 1] & (1 << (i + 5))) | (ret[i - 1] >> i) | (ret[i - 1] >> (i + 5));
else if (i == 2)
ret[i] = ret[i - 2] | ret[1] & ~(ret[i - 2] >> (i + 4));
else if (i == 4)
ret[i] = ret[i - 2] | ret[i - 3] | (ret[i - 4] >> (i + 1));
else if (i == 5)
ret[i] = (ret[i - 4] & ret[i - 3] & ret[i - 5]) >> (i - 4);
else if (i == 6)
ret[i] = (ret[i - 2] & ~ret[i - 1] | (ret[i - 6] >> (i - 4))) & ~(ret[i - 6] >> (i - 2) << (i - 5));
else if (i == 8)
ret[i] = (~ret[i - 7] & ret[i - 2]) | ret[i - 3] | ret[i - 3] << (i - 7);
else if (i == 10)
ret[i] = ret[i - 9] & ~(ret[i - 5] >> (i - 5));
else if (i == 11)
ret[i] = ret[i - 6] | ((ret[i - 1] & ret[i - 3]) >> (i - 5));
else
ret[i] = i - i;
}
memcpy(ret + 3, ret + 2, 1);
memcpy(ret + 9, ret + 3, 1);
memcpy(ret + 7, ret + 4, 1);
ret[12] = 0;
printf("%s", ret);
free(ret);
return 0;
}
Wait, that for
loop we added there could be written in an even more convoluted way! Let’s do so:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
volatile unsigned int a = 0 - 1;
volatile u_int8_t * z = NULL;
int main() {
switch(a) {
case 0 - 1:
z = aligned_alloc(sysconf(_SC_LEVEL1_DCACHE_LINESIZE), 1 * 10 + 3);
break;
case 1 * 10 + 3:
memcpy(z + 3, z + 2, 1);
memcpy(z + 9, z + 3, 1);
memcpy(z + 7, z + 4, 1);
printf("%s", z);
free(z);
return 0;
default:
z[a] = a == 0 ? z[a] = (1 << (a + 6)) | (1 << (a + 3)) : a == 1 ? (z[a - 1] & (1 << (a + 5))) | (z[a - 1] >> a) | (z[a - 1] >> (a + 5)) : a == 2 ? z[a - 2] | z[1] & ~(z[a - 2] >> (a + 4)) : a == 4 ? z[a - 2] | z[a - 3] | (z[a - 4] >> (a + 1)) : a == 5 ? (z[a - 4] & z[a - 3] & z[a - 5]) >> (a - 4) : a == 6 ? (z[a - 2] & ~z[a - 1] | (z[a - 6] >> (a - 4))) & ~(z[a - 6] >> (a - 2) << (a - 5)) : a == 8 ? (~z[a - 7] & z[a - 2]) | z[a - 3] | z[a - 3] << (a - 7) : a == 10 ? z[a - 9] & ~(z[a - 5] >> (a - 5)) : a == 11 ? z[a - 6] | ((z[a - 1] & z[a - 3]) >> (a - 5)) : a - a;
}
a = a + 1;
main();
}
Well, our switch there is taking quite a bit of space, isn’t it? Let’s fix that and clean up the code a bit:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
volatile u_int8_t a = 0 - 1;
volatile u_int8_t * z = NULL;
int main() {
a==0-1?z=aligned_alloc(sysconf(_SC_LEVEL1_DCACHE_LINESIZE),1*10+3):a!=1*10+3?z[a]=a==0?z[a]=(1<<(a+6))|(1<<(a+3)):a==1?(z[a-1]&(1<<(a+5)))|(z[a-1]>>a)|(z[a-1]>>(a+5)):a==2?z[a-2]|z[1]&~(z[a-2]>>(a+4)):a==4?z[a-2]|z[a-3]|(z[a-4]>>(a+1)):a==5?(z[a-4]&z[a-3]&z[a-5])>>(a-4):a==6?(z[a-2]&~z[a-1]|(z[a-6]>>(a-4)))&~(z[a-6]>>(a-2)<<(a-5)):a==8?(~z[a-7]&z[a-2])|z[a-3]|z[a-3]<<(a-7):a==10?z[a-9]&~(z[a-5]>>(a-5)):a==11?z[a-6]|((z[a-1]&z[a-3])>>(a-5)):a-a:0;if(a==1*10+3){memcpy(z+3,z+2,1);memcpy(z+9,z+3,1);memcpy(z+7,z+4,1);printf("%s",z);free(z);return 0;}a=a+1;main();
}
And there we go - a program that prints Hello World
, in just a single line of logic!
I’m sure there’s more ways to improve the unreadability of the project, so if you see something that could be improved, please let me know!