Monday, January 18, 2010

Spherical harmonics spoilers

Spherical harmonics seems to have some impenetrable level of difficulty, especially among the indie scene which has little to go off of other than a few presentations and whitepapers, some of which even contain incorrect information (i.e. one of the formulas in the Sony paper on the topic is incorrect), and most of which are still using ZYZ rotations because it's so hard to find how to do a matrix rotation.

Hao Chen and Xinguo Liu did a presentation at SIGGRAPH '08 and the slides from it contain a good deal of useful stuff, nevermind one of the ONLY easy-to-find rotate-by-matrix functions. It also treats the Z axis a bit awkwardly, so I patched the rotation code up a bit, and a pre-integrated cosine convolution filter so you can easily get SH coefs for directional light.

There was also gratuitous use of sqrt(3) multipliers, which can be completely eliminated by simply premultiplying or predividing coef #6 by it, which incidentally causes all of the constants and multipliers to resolve to rational numbers.

As always, you can include multiple lights by simply adding the SH coefs for them together. If you want specular, you can approximate a directional light by using the linear component to determine the direction, and constant component to determine the color. You can do this per-channel, or use the average values to determine the direction and do it once.

Here are the spoilers:

#define SH_AMBIENT_FACTOR   (0.25f)
#define SH_LINEAR_FACTOR (0.5f)
#define SH_QUADRATIC_FACTOR (0.3125f)

void LambertDiffuseToSHCoefs(const terVec3 &dir, float out[9])
{
// Constant
out[0] = 1.0f * SH_AMBIENT_FACTOR;

// Linear
out[1] = dir[1] * SH_LINEAR_FACTOR;
out[2] = dir[2] * SH_LINEAR_FACTOR;
out[3] = dir[0] * SH_LINEAR_FACTOR;

// Quadratics
out[4] = ( dir[0]*dir[1] ) * 3.0f*SH_QUADRATIC_FACTOR;
out[5] = ( dir[1]*dir[2] ) * 3.0f*SH_QUADRATIC_FACTOR;
out[6] = ( 1.5f*( dir[2]*dir[2] ) - 0.5f ) * SH_QUADRATIC_FACTOR;
out[7] = ( dir[0]*dir[2] ) * 3.0f*SH_QUADRATIC_FACTOR;
out[8] = 0.5f*( dir[0]*dir[0] - dir[1]*dir[1] ) * 3.0f*SH_QUADRATIC_FACTOR;
}


void RotateCoefsByMatrix(float outCoefs[9], const float pIn[9], const terMat3x3 &rMat)
{
// DC
outCoefs[0] = pIn[0];

// Linear
outCoefs[1] = rMat[1][0]*pIn[3] + rMat[1][1]*pIn[1] + rMat[1][2]*pIn[2];
outCoefs[2] = rMat[2][0]*pIn[3] + rMat[2][1]*pIn[1] + rMat[2][2]*pIn[2];
outCoefs[3] = rMat[0][0]*pIn[3] + rMat[0][1]*pIn[1] + rMat[0][2]*pIn[2];

// Quadratics
outCoefs[4] = (
( rMat[0][0]*rMat[1][1] + rMat[0][1]*rMat[1][0] ) * ( pIn[4] )
+ ( rMat[0][1]*rMat[1][2] + rMat[0][2]*rMat[1][1] ) * ( pIn[5] )
+ ( rMat[0][2]*rMat[1][0] + rMat[0][0]*rMat[1][2] ) * ( pIn[7] )
+ ( rMat[0][0]*rMat[1][0] ) * ( pIn[8] )
+ ( rMat[0][1]*rMat[1][1] ) * ( -pIn[8] )
+ ( rMat[0][2]*rMat[1][2] ) * ( 3.0f*pIn[6] )
);

outCoefs[5] = (
( rMat[1][0]*rMat[2][1] + rMat[1][1]*rMat[2][0] ) * ( pIn[4] )
+ ( rMat[1][1]*rMat[2][2] + rMat[1][2]*rMat[2][1] ) * ( pIn[5] )
+ ( rMat[1][2]*rMat[2][0] + rMat[1][0]*rMat[2][2] ) * ( pIn[7] )
+ ( rMat[1][0]*rMat[2][0] ) * ( pIn[8] )
+ ( rMat[1][1]*rMat[2][1] ) * ( -pIn[8] )
+ ( rMat[1][2]*rMat[2][2] ) * ( 3.0f*pIn[6] )
);

outCoefs[6] = (
( rMat[2][1]*rMat[2][0] ) * ( pIn[4] )
+ ( rMat[2][2]*rMat[2][1] ) * ( pIn[5] )
+ ( rMat[2][0]*rMat[2][2] ) * ( pIn[7] )
+ 0.5f*( rMat[2][0]*rMat[2][0] ) * ( pIn[8])
+ 0.5f*( rMat[2][1]*rMat[2][1] ) * ( -pIn[8])
+ 1.5f*( rMat[2][2]*rMat[2][2] ) * ( pIn[6] )
- 0.5f * ( pIn[6] )
);

outCoefs[7] = (
( rMat[0][0]*rMat[2][1] + rMat[0][1]*rMat[2][0] ) * ( pIn[4] )
+ ( rMat[0][1]*rMat[2][2] + rMat[0][2]*rMat[2][1] ) * ( pIn[5] )
+ ( rMat[0][2]*rMat[2][0] + rMat[0][0]*rMat[2][2] ) * ( pIn[7] )
+ ( rMat[0][0]*rMat[2][0] ) * ( pIn[8] )
+ ( rMat[0][1]*rMat[2][1] ) * ( -pIn[8] )
+ ( rMat[0][2]*rMat[2][2] ) * ( 3.0f*pIn[6] )
);

outCoefs[8] = (
( rMat[0][1]*rMat[0][0] - rMat[1][1]*rMat[1][0] ) * ( pIn[4] )
+ ( rMat[0][2]*rMat[0][1] - rMat[1][2]*rMat[1][1] ) * ( pIn[5] )
+ ( rMat[0][0]*rMat[0][2] - rMat[1][0]*rMat[1][2] ) * ( pIn[7] )
+0.5f*( rMat[0][0]*rMat[0][0] - rMat[1][0]*rMat[1][0] ) * ( pIn[8] )
+0.5f*( rMat[0][1]*rMat[0][1] - rMat[1][1]*rMat[1][1] ) * ( -pIn[8] )
+0.5f*( rMat[0][2]*rMat[0][2] - rMat[1][2]*rMat[1][2] ) * ( 3.0f*pIn[6] )
);
}


... and to sample it in the shader ...


float3 SampleSHQuadratic(float3 dir, float3 shVector[9])
{
float3 ds1 = dir.xyz*dir.xyz;
float3 ds2 = dir*dir.yzx; // xy, zy, xz

float3 v = shVector[0];

v += dir.y * shVector[1];
v += dir.z * shVector[2];
v += dir.x * shVector[3];

v += ds2.x * shVector[4];
v += ds2.y * shVector[5];
v += (ds1.z * 1.5 - 0.5) * shVector[6];
v += ds2.z * shVector[7];
v += (ds1.x - ds1.y) * 0.5 * shVector[8];

return v;
}


For Monte Carlo integration, take sampling points, feed direction "dir" to the following function to get multipliers for each coefficient, then multiply by the intensity in that direction. Divide the total by the number of sampling points:


void SHForDirection(const terVec3 &dir, float out[9])
{
// Constant
out[0] = 1.0f;

// Linear
out[1] = dir[1] * 3.0f;
out[2] = dir[2] * 3.0f;
out[3] = dir[0] * 3.0f;

// Quadratics
out[4] = ( dir[0]*dir[1] ) * 15.0f;
out[5] = ( dir[1]*dir[2] ) * 15.0f;
out[6] = ( 1.5f*( dir[2]*dir[2] ) - 0.5f ) * 5.0f;
out[7] = ( dir[0]*dir[2] ) * 15.0f;
out[8] = 0.5f*( dir[0]*dir[0] - dir[1]*dir[1] ) * 15.0f;
}


... and finally, for a uniformly-distributed random point on a sphere ...


terVec3 RandomDirection(int (*randomFunc)(), int randMax)
{
float u = (((float)randomFunc()) / (float)(randMax - 1))*2.0f - 1.0f;
float n = sqrtf(1.0f - u*u);

float theta = 2.0f * M_PI * (((float)randomFunc()) / (float)(randMax));

return terVec3(n * cos(theta), n * sin(theta), u);
}

2.0 gamma textures and full-range scalars in YCoCg DXT5

A few years back there was a publication on real-time YCoCg DXT5 texture compression. There are two improvements on the technique I feel I should present:

There's a pretty clear problem right off the bat: It's not particularly friendly to linear textures. If you simply attempt to convert sRGB values into linear space and store the result in YCoCg, you will experience severe banding owing largely to the loss of precision at lower values. Gamma space provides a lot of precision at lower intensity values where the human visual system is more sensitive.

sRGB texture modes exist as a method to cheaply convert from gamma space to linear, and are pretty fast since GPUs can just use a look-up table to get the linear values, but YCoCg can't be treated as an sRGB texture and doing sRGB decodes in the shader is fairly slow since it involves a divide, power raise, and conditional.

This can be resolved first by simply converting from a 2.2-ish sRGB gamma ramp to a 2.0 gamma ramp, which preserves most of the original gamut: 255 input values map to 240 output values, low intensity values maintain most of their precision, and they can be linearized by simply squaring the result in the shader.


Another concern, which isn't really one if you're aiming for speed and doing things real-time, but is if you're considering using such a technique for offline processing, is the limited scale factor. DXT5 provides enough resolution for 32 possible scale factor values, so there isn't any reason to limit it to 1, 2, or 4 if you don't have to. Using the full range gives you more color resolution to work with.


Here's some sample code:


unsigned char Linearize(unsigned char inByte)
{
float srgbVal = ((float)inByte) / 255.0f;
float linearVal;

if(srgbVal < 0.04045)
linearVal = srgbVal / 12.92f;
else
linearVal = pow( (srgbVal + 0.055f) / 1.055f, 2.4f);

return (unsigned char)(floor(sqrt(linearVal)* 255.0 + 0.5));
}

void ConvertBlockToYCoCg(const unsigned char inPixels[16*3], unsigned char outPixels[16*4])
{
unsigned char linearizedPixels[16*3]; // Convert to linear values

for(int i=0;i<16*3;i++)
linearizedPixels[i] = Linearize(inPixels[i]);

// Calculate Co and Cg extents
int extents = 0;
int n = 0;
int iY, iCo, iCg;
int blockCo[16];
int blockCg[16];
const unsigned char *px = linearizedPixels;
for(int i=0;i<16;i++)
{
iCo = (px[0]<<1) - (px[2]<<1);
iCg = (px[1]<<1) - px[0] - px[2];
if(-iCo > extents) extents = -iCo;
if( iCo > extents) extents = iCo;
if(-iCg > extents) extents = -iCg;
if( iCg > extents) extents = iCg;

blockCo[n] = iCo;
blockCg[n++] = iCg;

px += 3;
}

// Co = -510..510
// Cg = -510..510
float scaleFactor = 1.0f;
if(extents > 127)
scaleFactor = (float)extents * 4.0f / 510.0f;

// Convert to quantized scalefactor
unsigned char scaleFactorQuantized = (unsigned char)(ceil((scaleFactor - 1.0f) * 31.0f / 3.0f));

// Unquantize
scaleFactor = 1.0f + (float)(scaleFactorQuantized / 31.0f) * 3.0f;

unsigned char bVal = (unsigned char)((scaleFactorQuantized << 3) | (scaleFactorQuantized >> 2));

unsigned char *outPx = outPixels;

n = 0;
px = linearizedPixels;
for(i=0;i<16;i++)
{
// Calculate components
iY = ( px[0] + (px[1]<<1) + px[2] + 2 ) / 4;
iCo = ((blockCo[n] / scaleFactor) + 128);
iCg = ((blockCg[n] / scaleFactor) + 128);

if(iCo < 0) iCo = 0; else if(iCo > 255) iCo = 255;
if(iCg < 0) iCg = 0; else if(iCg > 255) iCg = 255;
if(iY < 0) iY = 0; else if(iY > 255) iY = 255;

px += 3;

outPx[0] = (unsigned char)iCo;
outPx[1] = (unsigned char)iCg;
outPx[2] = bVal;
outPx[3] = (unsigned char)iY;

outPx += 4;
}
}




.... And to decode it in the shader ...



float3 DecodeYCoCg(float4 inColor)
{
float3 base = inColor.arg + float3(0, -0.5, -0.5);
float scale = (inColor.b*0.75 + 0.25);
float4 multipliers = float4(1.0, 0.0, scale, -scale);
float3 result;

result.r = dot(base, multipliers.xzw);
result.g = dot(base, multipliers.xyz);
result.b = dot(base, multipliers.xww);

// Convert from 2.0 gamma to linear
return result*result;
}