GameLib is a collection of libraries for creating applications in Cakelisp.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

#### 285 lines 11 KiB Raw Blame History

 @property( syntax != glslvk )
    #version 430
@else
    #version 450
@end

//See GaussianBlurBase_cs for the original.
//This is a derived version which is used for filtering ESM (Exponential Shadow Maps).
//Normally ESM is in exponential space: exp( K * linearSpaceDepth );
//Filtering should be done in that space.
//However because of precision reasons, we store linearSpaceDepth instead. In order to perform
//correct filtering, we use the following formula:
//  exp( filteredDepth ) = w0 * exp( d0 ) + w1 * exp( d1 ) + w2 * exp( d2 ) + ...
//
//But this is not precision friendly. So we do instead:
//  = w0 * exp( d0 ) + w1 * exp( d1 ) + w2 * exp( d2 )
//  = exp( d0 ) * ( w0 + w1 * exp( d1 ) / exp( d0 ) + w2 * exp( d2 ) / exp( d0 ) )
//  = exp( d0 ) * ( w0 + w1 * exp( d1 - d0 ) + w2 * exp( d2 - d0 ) )
//  = exp( d0 ) * exp( log( w0 + w1 * exp( d1 - d0 ) + w2 * exp( d2 - d0 ) ) )
//  = exp( d0 + log( w0 + w1 * exp( d1 - d0 ) + w2 * exp( d2 - d0 ) ) )
//  exp( filteredDepth ) = exp( d0 + log( w0 + w1 * exp( d1 - d0 ) + w2 * exp( d2 - d0 ) ) )
//Almost final formula:
//  filteredDepth = d0 + log( w0 + w1 * exp( d1 - d0 ) + w2 * exp( d2 - d0 ) )
//
//The formula is actually:
//  exp( K * filteredDepth ) = w0 * exp( K * d0 ) + w1 * exp( K * d1 ) + w2 * exp( K * d2 ) + ...
//Final formula:
//  filteredDepth = d0 + log( w0 + w1 * exp( K * (d1 - d0) ) + w2 * exp( K * (d2 - d0) ) ) / K
//
//In the code below d0 is the centre sample (firstSmpl) and w0 is the centre weight
//C_WEIGHTS( kernel_radius ); every other tap contributes exp( K * (di - d0) ) * wi.

//Like in the original filter:
//  * Each thread works on 4 pixels at a time (for VLIW hardware, i.e. Radeon HD 5000 & 6000 series).
//  * 256 pixels per threadgroup. Each threadgroup works on 2 rows of 128 pixels each.
//    That means 32x2 threads = 64. 64 threads x 4 pixels per thread = 256

// For this shader to work, several pieces need to be defined:
//  data_type (i.e. vec3)
//  lds_data_type (i.e. vec3, uint)
//  lds_definition
//  image_store
//  image_sample
//  decode_lds (optional, i.e. when lds_data_type != data_type)
// Define the property "downscale" if you're doing a downsample.
// Define "downscale_lq" (must also define downscale) for SLIGHTLY lower quality downscale
// The script uses the template syntax to automatically set the num. of threadgroups
// based on the bound input texture.
// NOTE(review): comments added to this file deliberately avoid the template command
// character, since the template preprocessor may expand commands found inside comments.

vulkan( layout( ogre_s0 ) uniform sampler inputSampler );
vulkan_layout( ogre_t0 ) uniform texture2D inputImage;

layout( vulkan( ogre_u0 ) vk_comma @insertpiece(uav0_pf_type) )
uniform restrict writeonly image2D outputImage;

// 32 = 128 / 4
layout( local_size_x = 32,
        local_size_y = 2,
        local_size_z = 1 ) in;
// Mirror the local_size_* values above as template properties; they must be kept in sync by hand.
@pset( threads_per_group_x, 32 )
@pset( threads_per_group_y, 2 )
@pset( threads_per_group_z, 1 )

// pixelsPerRow = 32 threads * 4 pixels per thread = 128
@pmul( pixelsPerRow, threads_per_group_x, 4 )
@pset( rowsPerThreadGroup, threads_per_group_y )
@pset( num_thread_groups_z, 1 )

@set( input_width, uav0_width_with_lod )
@set( input_height, uav0_height_with_lod )

@property( horizontal_pass )
    // When downscaling, the size taken from the UAV is doubled along the filtered axis
    // before the thread group count is derived from it.
    @property( downscale ) @mul( input_width, 2 ) @end

    /// Calculate num_thread_groups_
    /// num_thread_groups_x = (texture0_width + pixelsPerRow - 1) / pixelsPerRow
    /// num_thread_groups_y = (texture0_height + rowsPerThreadGroup - 1) / rowsPerThreadGroup
    @add( num_thread_groups_x, input_width, pixelsPerRow )
    @sub( num_thread_groups_x, 1 )
    @div( num_thread_groups_x, pixelsPerRow )

    @add( num_thread_groups_y, input_height, rowsPerThreadGroup )
    @sub( num_thread_groups_y, 1 )
    @div( num_thread_groups_y, rowsPerThreadGroup )
@end @property( !horizontal_pass )
    // Vertical pass: same computation with the roles of x and y swapped.
    @property( downscale ) @mul( input_height, 2 ) @end

    /// Calculate num_thread_groups_
    /// num_thread_groups_x = (texture0_width + rowsPerThreadGroup - 1) / rowsPerThreadGroup
    /// num_thread_groups_y = (texture0_height + pixelsPerRow - 1) / pixelsPerRow
    @add( num_thread_groups_x, input_width, rowsPerThreadGroup )
    @sub( num_thread_groups_x, 1 )
    @div( num_thread_groups_x, rowsPerThreadGroup )

    @add( num_thread_groups_y, input_height, pixelsPerRow )
    @sub( num_thread_groups_y, 1 )
    @div( num_thread_groups_y, pixelsPerRow )
@end

/// shared vec3 g_f3LDS[ 2 ] [ @value( samples_per_threadgroup ) ];
@insertpiece( lds_definition )

// The weights are packed four per vec4: element x lives in c_weights[x / 4][x % 4].
#define C_WEIGHTS( x ) c_weights[(x) >> 2u][(x) & 3u]

// weights_array_count = ( kernel_radius + 1u + 3u ) / 4u
@padd( weights_array_count, kernel_radius, 4 )
@pdiv( weights_array_count, 4 )

vulkan( layout( ogre_P0 ) uniform Params { )
    uniform vec4 g_f4OutputSize;
    uniform vec4 c_weights[@value( weights_array_count )];
    @insertpiece( extra_params )
vulkan( }; )

// Fetches one input sample for the integer texel position i2Position.
// The position is moved to the texel centre (+0.5) and scaled by g_f4OutputSize.zw
// (presumably the reciprocal of the texture size - TODO confirm against the caller that
// fills Params). The actual fetch and the return statement come from the image_sample piece.
// f2Offset is not referenced in the code visible here; presumably it is consumed by the
// image_sample piece - TODO confirm.
@insertpiece( lds_data_type ) sampleTex( ivec2 i2Position , vec2 f2Offset )
{
    vec2 f2SamplePosition = vec2( i2Position ) + vec2( 0.5f, 0.5f );

    f2SamplePosition *= g_f4OutputSize.zw;
    ///return textureLod( inputImage, f2SamplePosition, 0 ).xyz;
    @insertpiece( image_sample )
}

// Filters the pixels handled by one thread using the samples previously stored in g_f3LDS.
//  iPixelOffset: index in the LDS row of the first sample belonging to this thread's window.
//  iLineOffset:  which LDS row to read.
//  i2Center, i2Inc: not referenced in the code visible here; presumably consumed by the
//                   image_store piece (see the commented-out imageStore below) - TODO confirm.
// With downscale_lq only 2 outputs are produced, taken from every other sample (RDI[0], RDI[2]).
void ComputeFilterKernel( int iPixelOffset, int iLineOffset, ivec2 i2Center, ivec2 i2Inc )
{
    @property( !downscale_lq )
        @insertpiece( data_type ) outColour[ 4 ];
        @insertpiece( data_type ) firstSmpl[ 4 ];
    @end @property( downscale_lq )
        @insertpiece( data_type ) outColour[ 2 ];
        @insertpiece( data_type ) firstSmpl[ 4 ];
    @end
    @insertpiece( data_type ) RDI[ 4 ] ;

    // Load the centre taps (offset by kernel_radius inside the LDS row).
    @foreach( 4, iPixel )
        RDI[ @iPixel ] = @insertpiece( decode_lds )( g_f3LDS[ iLineOffset ][ iPixelOffset + @value( kernel_radius ) + @iPixel ] );@end

    // firstSmpl is d0 in the formula from the file header; the accumulator starts at w0.
    @property( !downscale_lq )
        @foreach( 4, iPixel )
            firstSmpl[ @iPixel ].x = RDI[ @iPixel ];
            outColour[ @iPixel ].x = C_WEIGHTS( @value( kernel_radius ) );@end
    @end @property( downscale_lq )
        @foreach( 2, iPixel )
            firstSmpl[ @iPixel ].x = RDI[ @iPixel * 2 ];
            outColour[ @iPixel ].x = C_WEIGHTS( @value( kernel_radius ) );@end
    @end

    // Prime the 4-wide sliding window with the left-most samples of this thread's range.
    @foreach( 4, iPixel )
        RDI[ @iPixel ] = @insertpiece( decode_lds )( g_f3LDS[ iLineOffset ][ iPixelOffset + @iPixel ] );@end

    // From here on iPixelOffset points at the next sample to shift into the window.
    iPixelOffset += 4;

    /// Deal with taps to our left.
    /// for ( iIteration = 0; iIteration < radius; iIteration += 1 )
    @foreach( kernel_radius, iIteration )
        @property( !downscale_lq )
            @foreach( 4, iPixel )
                outColour[ @iPixel ].x += exp(@value(K)*(RDI[ @iPixel ] - firstSmpl[ @iPixel ].x)) * C_WEIGHTS( @iIteration );@end
        @end @property( downscale_lq )
            @foreach( 2, iPixel )
                outColour[ @iPixel ].x += exp(@value(K)*(RDI[ @iPixel * 2 ] - firstSmpl[ @iPixel ].x)) * C_WEIGHTS( @iIteration );@end
        @end
        // Slide the window one sample: shift RDI left and read one new sample into RDI[3].
        @foreach( 3, iPixel )
            RDI[ @iPixel ] = RDI[ @iPixel + ( 1 ) ];@end
        @foreach( 1, iPixel )
            RDI[ 4 - 1 + @iPixel ] = @insertpiece( decode_lds )( g_f3LDS[ iLineOffset ][ iPixelOffset + @iIteration + @iPixel ] );@end
    @end

    // Reload the window just past the centre taps (iPixelOffset was advanced by 4 above,
    // hence the "- 4"); the centre itself was already accounted for with w0.
    @foreach( 4, iPixel )
        RDI[ @iPixel ] = @insertpiece( decode_lds )( g_f3LDS[ iLineOffset ][ iPixelOffset - 4 + @value( kernel_radius ) + 1 + @iPixel ] );@end

    @padd( kernel_radius_plus1, kernel_radius, 1 )
    @pmul( kernel_radius2x_plus1, kernel_radius, 2 )
    @padd( kernel_radius2x_plus1, 1 )

    @pmul( kernel_radius2x, kernel_radius, 2 )

    /// Deal with taps to our right.
    /// for ( iIteration = radius + 1; iIteration < ( radius * 2 + 1 ); iIteration += 1 )
    // The weight index is mirrored (2 * radius - iIteration), i.e. the kernel is symmetric.
    @foreach( kernel_radius2x_plus1, iIteration, kernel_radius_plus1 )
        @property( !downscale_lq )
            @foreach( 4, iPixel )
                outColour[ @iPixel ].x += exp(@value(K)*(RDI[ @iPixel ] - firstSmpl[ @iPixel ].x)) * C_WEIGHTS( @value( kernel_radius2x ) - @iIteration );@end
        @end @property( downscale_lq )
            @foreach( 2, iPixel )
                outColour[ @iPixel ].x += exp(@value(K)*(RDI[ @iPixel * 2 ] - firstSmpl[ @iPixel ].x)) * C_WEIGHTS( @value( kernel_radius2x ) - @iIteration );@end
        @end
        // Slide the window one sample, same as in the left-tap loop.
        @foreach( 3, iPixel )
            RDI[ @iPixel ] = RDI[ @iPixel + ( 1 ) ];@end
        @foreach( 1, iPixel )
            RDI[ 4 - 1 + @iPixel ] = @insertpiece( decode_lds )( g_f3LDS[ iLineOffset ][ iPixelOffset + @iIteration + @iPixel ] );@end
    @end

    // Final formula from the file header: filteredDepth = d0 + log( sum ) / K
    @property( !downscale_lq )
        @foreach( 4, iPixel )
            outColour[ @iPixel ] = firstSmpl[ @iPixel ].x + log( outColour[ @iPixel ].x ) / @value(K);@end
    @end @property( downscale_lq )
        @foreach( 2, iPixel )
            outColour[ @iPixel ] = firstSmpl[ @iPixel ].x + log( outColour[ @iPixel ].x ) / @value(K);@end
    @end

    /*
    foreach( 4, iPixel )
        imageStore( outputImage, ivec2( i2Center + iPixel * i2Inc ), vec4( outColour[ iPixel ], 1.0 ) );end
    */
    @insertpiece( image_store )
}

// Entry point. Each thread group first fills g_f3LDS with the samples it needs (its 128 pixels
// per row plus kernel_radius extra samples on each side), synchronises, and then every thread
// whose first pixel is inside the output calls ComputeFilterKernel for its 4 pixels.
// The horizontal and vertical variants are identical except that x and y are swapped.
void main()
{
    /// samples_per_threadgroup = 128 + ( ( kernel_radius * 2 + 1 ) - 1 )
    /// samples_per_thread = ( 128 + ( ( kernel_radius * 2 + 1 ) - 1 ) ) / ( 128 / 4 )
    @padd( samples_per_threadgroup, 127, kernel_radius2x_plus1 )
    @pdiv( samples_per_thread, samples_per_threadgroup, 32 )

@property( horizontal_pass )
    int iSampleOffset = int( gl_LocalInvocationID.x * @value( samples_per_thread ) );
    int iLineOffset = int( gl_LocalInvocationID.y );

    // Group origin: 128 pixels per group along x (<< 7), shifted left by kernel_radius so the
    // left taps are covered; 2 rows per group along y (<< 1).
    ivec2 i2GroupCoord = ivec2( ( gl_WorkGroupID.x << 7u ) - @value( kernel_radius )u, gl_WorkGroupID.y << 1u );
    ivec2 i2Coord = ivec2( i2GroupCoord.x + iSampleOffset, i2GroupCoord.y );

    @foreach( samples_per_thread, i )
        g_f3LDS[ iLineOffset ][ iSampleOffset + @i ] = sampleTex( i2Coord + ivec2( @i, gl_LocalInvocationID.y ) , vec2( 0.5f, 0.0f ) );@end

    // samples_per_thread is an integer division, so some samples at the end of the row are left
    // over; the first threads of the row load those, indexing backwards from the last sample.
    if( gl_LocalInvocationID.x < @value( samples_per_threadgroup )u - 32u * @value( samples_per_thread )u )
    {
        g_f3LDS[ iLineOffset ][ @value(samples_per_threadgroup)u - 1u - gl_LocalInvocationID.x ] =
            sampleTex( i2GroupCoord + ivec2( @value(samples_per_threadgroup)u - 1u - gl_LocalInvocationID.x, gl_LocalInvocationID.y ), vec2( 0.5f, 0.0f ) );
    }

    //memoryBarrierShared ensures our write is visible to everyone else (must be done BEFORE the barrier)
    //barrier ensures every thread's execution reached here.
    memoryBarrierShared();
    barrier();

    int iPixelOffset = int( gl_LocalInvocationID.x << 2u ); //gl_LocalInvocationID.x * 4u
    i2Coord = ivec2( i2GroupCoord.x + iPixelOffset, i2GroupCoord.y );
    // Undo the kernel_radius shift applied to the group origin to get the real pixel position.
    i2Coord.x += @value( kernel_radius );

    // Only the first of the thread's 4 pixels is tested against the output size here.
    if( i2Coord.x < int(g_f4OutputSize.x) )
    {
        ivec2 i2Center = i2Coord + ivec2( 0, gl_LocalInvocationID.y );
        ivec2 i2Inc = ivec2 ( 1, 0 );

        @property( downscale )
            i2Center.x = int( uint( i2Center.x ) >> 1u );
        @end

        ComputeFilterKernel( iPixelOffset, iLineOffset, i2Center, i2Inc );
    }
@end @property( !horizontal_pass )
    int iSampleOffset = int( gl_LocalInvocationID.x * @value( samples_per_thread ) );
    int iLineOffset = int( gl_LocalInvocationID.y );

    // Group origin: 2 columns per group along x (<< 1); 128 pixels per group along y (<< 7),
    // shifted up by kernel_radius so the leading taps are covered.
    ivec2 i2GroupCoord = ivec2( gl_WorkGroupID.x << 1u, ( gl_WorkGroupID.y << 7u ) - @value( kernel_radius )u );
    ivec2 i2Coord = ivec2( i2GroupCoord.x, i2GroupCoord.y + iSampleOffset );

    @foreach( samples_per_thread, i )
        g_f3LDS[ iLineOffset ][ iSampleOffset + @i ] = sampleTex( i2Coord + ivec2( gl_LocalInvocationID.y, @i ) , vec2( 0.0f, 0.5f ) );@end

    // Left-over samples at the end of the column, loaded by the first threads (see horizontal pass).
    if( gl_LocalInvocationID.x < @value( samples_per_threadgroup )u - 32u * @value( samples_per_thread )u )
    {
        g_f3LDS[ iLineOffset ][ @value(samples_per_threadgroup)u - 1u - gl_LocalInvocationID.x ] =
            sampleTex( i2GroupCoord + ivec2( gl_LocalInvocationID.y, @value(samples_per_threadgroup)u - 1u - gl_LocalInvocationID.x ), vec2( 0.0f, 0.5f ) );
    }

    //memoryBarrierShared ensures our write is visible to everyone else (must be done BEFORE the barrier)
    //barrier ensures every thread's execution reached here.
    memoryBarrierShared();
    barrier();

    int iPixelOffset = int( gl_LocalInvocationID.x << 2u ); //gl_LocalInvocationID.x * 4u
    i2Coord = ivec2( i2GroupCoord.x, i2GroupCoord.y + iPixelOffset );
    // Undo the kernel_radius shift applied to the group origin to get the real pixel position.
    i2Coord.y += @value( kernel_radius );

    // Only the first of the thread's 4 pixels is tested against the output size here.
    if( i2Coord.y < int(g_f4OutputSize.y) )
    {
        ivec2 i2Center = i2Coord + ivec2( gl_LocalInvocationID.y, 0 );
        ivec2 i2Inc = ivec2 ( 0, 1 );

        @property( downscale )
            i2Center.y = int( uint( i2Center.y ) >> 1u );
        @end

        ComputeFilterKernel( iPixelOffset, iLineOffset, i2Center, i2Inc );
    }
@end
}