After reading A Parallel Random Number Generator I tried to make a hash function based on the idea of multiplying two 32 bit values to produce a 64 bit value, then doing something interesting with the 64 bit value.

Using the full 64 bit value of the multiplication seems to be troublesome for MSVC and gcc (32 bits) to optimize. MSVC insists on saving the frame pointer unless the function is __forceinline'd and doesn't understand you can do 'mul dword ptr [edi]' (thus saving a mov), and gcc/icc dump values on to the stack in the inner-loop. MSVC generates good, but not optimal, code when it's forceinlined, but gcc/icc are so bad the asm needs to be hand coded. 64 bit gcc/icc do not have this problem.

Note that the seed avalanching is less than perfect under 5 bytes, but those keys are not at risk to forced collisions.




finline u32 fastcall Crap8( const u8 *key, u32 len, u32 seed ) {
#if !defined(__LP64__) && !defined(_MSC_VER) && ( defined(__i386__) || defined(__i486__) || defined(__i586__) || defined(__i686__) )
	u32 hash;
		"addl %%ecx, %%esi\n"
		"lea 0x97e1cc59(%%ecx), %%ebx\n"
		"cmpl $8, %%ecx\n"
		"jb DW%=\n"
		"imull $0x83d2e73b, %%esi, %%esi\n"
		"movl $0x83d2e73b, %%eax\n"
		"mull (%%edi)\n"
		"addl $-8, %%ecx\n"
		"xorl %%eax, %%ebx\n"
		"xorl %%edx, %%esi\n"
		"imull $0x83d2e73b, %%esi, %%esi\n"
		"movl $0x83d2e73b, %%eax\n"
		"mull 4(%%edi)\n"
		"addl $8, %%edi\n"
		"xorl %%eax, %%ebx\n"
		"xorl %%edx, %%esi\n"
		"cmpl $8, %%ecx\n"
		"jae QW%=\n"
		"cmpl $4, %%ecx\n"
		"jb B%=\n"
		"imull $0x83d2e73b, %%esi, %%esi\n"
		"movl $0x83d2e73b, %%eax\n"
		"mull (%%edi)\n"
		"addl $4, %%edi\n"
		"addl $-4, %%ecx\n"
		"xorl %%eax, %%ebx\n"
		"xorl %%edx, %%esi\n"
		"testl %%ecx, %%ecx\n"
		"jz F%=\n"
		"imull $0x83d2e73b, %%esi, %%esi\n"
		"shll $3, %%ecx\n"
		"movl $1, %%edx\n"
		"shll %%cl, %%edx\n"
		"movl $0x83d2e73b, %%eax\n"
		"addl $-1, %%edx\n"
		"andl (%%edi), %%edx\n"
		"mull %%edx\n"
		"xorl %%eax, %%ebx\n"
		"xorl %%edx, %%esi\n"
		"xorl %%ebx, %%esi\n"
		"movl $0x97e1cc59, %%eax\n"
		"mull %%esi\n"
		"xorl %%edx, %%eax\n"
		"xorl %%ebx, %%eax\n"
		: "=a"(hash), "=c"(len), "=S"(len), "=D"(key)
		: "c"(len), "S"(seed), "D"(key)
		: "%ebx", "%edx", "cc" 
	return hash;
	#define c8fold( a, b, y, z ) { p = (u32)(a) * (u64)(b); y ^= (u32)p; z ^= (u32)(p >> 32); }
	#define c8mix( in ) { h *= m; c8fold( in, m, k, h ); }

	const u32 m = 0x83d2e73b, n = 0x97e1cc59, *key4 = (const u32 *)key;
	u32 h = len + seed, k = n + len;
	u64 p;

	while ( len >= 8 ) { c8mix(key4[0]) c8mix(key4[1]) key4 += 2; len -= 8; }
	if ( len >= 4 ) { c8mix(key4[0]) key4 += 1; len -= 4; }
	if ( len ) { c8mix( key4[0] & ( ( 1 << ( len * 8 ) ) - 1 ) ) }
	c8fold( h ^ k, n, k, k )
	return k;