510 lines
22 KiB
Plaintext
510 lines
22 KiB
Plaintext
|
#pragma kernel StpSetup
|
||
|
|
||
|
#pragma multi_compile _ ENABLE_DEBUG_MODE
|
||
|
#pragma multi_compile _ ENABLE_STENCIL_RESPONSIVE
|
||
|
#pragma multi_compile _ ENABLE_LARGE_KERNEL
|
||
|
|
||
|
#pragma multi_compile _ UNITY_DEVICE_SUPPORTS_NATIVE_16BIT
|
||
|
|
||
|
// TODO: Re-enable support for wave reductions (usage of UNITY_DEVICE_SUPPORTS_WAVE_ANY keyword)
|
||
|
//
|
||
|
// We've run into many platform specific problems when trying to use wave operations for STP's reductions so they're being
|
||
|
// disabled for now. Enabling support for wave operations also causes us to use DXC on the 32-bit path on some Qualcomm Android
|
||
|
// devices and this triggers visual artifacts that we have no other way to work around at the moment.
|
||
|
|
||
|
#pragma multi_compile _ DISABLE_TEXTURE2D_X_ARRAY
|
||
|
|
||
|
#pragma only_renderers d3d11 playstation xboxone xboxseries vulkan metal switch
|
||
|
|
||
|
#include "Packages/com.unity.render-pipelines.core/ShaderLibrary/Common.hlsl"
|
||
|
#include "Packages/com.unity.render-pipelines.core/ShaderLibrary/Color.hlsl"
|
||
|
#include "Packages/com.unity.render-pipelines.core/ShaderLibrary/UnityInstancing.hlsl"
|
||
|
|
||
|
#define STP_PAT 1
|
||
|
|
||
|
#include "Packages/com.unity.render-pipelines.core/Runtime/STP/StpCommon.hlsl"
|
||
|
|
||
|
//
|
||
|
// Input
|
||
|
//
|
||
|
|
||
|
TEXTURE2D_X(_StpInputColor);
|
||
|
TEXTURE2D_X(_StpInputDepth);
|
||
|
TEXTURE2D_X(_StpInputMotion);
|
||
|
|
||
|
#if defined(ENABLE_STENCIL_RESPONSIVE)
|
||
|
TYPED_TEXTURE2D_X(uint2, _StpInputStencil);
|
||
|
#endif
|
||
|
|
||
|
//
|
||
|
// Intermediate Output
|
||
|
//
|
||
|
|
||
|
RW_TEXTURE2D_X(float4, _StpIntermediateColor);
|
||
|
RW_TEXTURE2D_X(float, _StpIntermediateConvergence);
|
||
|
|
||
|
//
|
||
|
// History Input/Output
|
||
|
//
|
||
|
|
||
|
TYPED_TEXTURE2D_X(uint, _StpPriorDepthMotion);
|
||
|
RW_TEXTURE2D_X(uint, _StpDepthMotion);
|
||
|
|
||
|
TEXTURE2D_X(_StpPriorLuma);
|
||
|
RW_TEXTURE2D_X(float2, _StpLuma);
|
||
|
|
||
|
TEXTURE2D_X(_StpPriorConvergence);
|
||
|
|
||
|
TEXTURE2D_X(_StpPriorFeedback);
|
||
|
|
||
|
#define STP_SETUP_PER_VIEW_CONSTANTS_STEREO_OFFSET (SLICE_ARRAY_INDEX * STPSETUPPERVIEWCONSTANTS_COUNT)
|
||
|
|
||
|
#if defined(SHADER_API_PSSL) || defined(SHADER_API_SWITCH) || (defined(SHADER_API_METAL) && !defined(SHADER_API_MOBILE))
|
||
|
// Force usage of the 32-bit reduction path even in 16-bit environments
|
||
|
#define STP_FORCE_32BIT_REDUCTION
|
||
|
#endif
|
||
|
|
||
|
#if defined(SHADER_API_PSSL) || defined(SHADER_API_GAMECORE) || defined(SHADER_API_METAL) || (defined(SHADER_API_VULKAN) && defined(SHADER_API_MOBILE))
|
||
|
// Force usage of group shared memory instead of using wave operations
|
||
|
#define STP_FORCE_GROUPSHARED
|
||
|
#endif
|
||
|
|
||
|
// Enable the use of wave operations when they're supported by the current hardware and usage of groupshared hasn't been forced.
|
||
|
#if defined(UNITY_HW_SUPPORTS_WAVE) && !defined(STP_FORCE_GROUPSHARED)
|
||
|
#define STP_ENABLE_WAVEOPS
|
||
|
#endif
|
||
|
|
||
|
// STP needs a 4x4 reduction, implemented either with wave operations or with group shared memory.
// The scratch storage is only declared when wave operations are not in use.
#if !defined(STP_ENABLE_WAVEOPS)
    #if !defined(STP_16BIT) || defined(STP_FORCE_32BIT_REDUCTION)
        // 32-bit reduction: two float4 regions of STP_GROUP_SIZE entries each.
        groupshared float4 gs_StpScratch[STP_GROUP_SIZE * 2];
    #else
        // 16-bit reduction: one uint4 entry per lane.
        groupshared uint4 gs_StpScratch[STP_GROUP_SIZE];
    #endif
#endif
|
||
|
|
||
|
// In some cases, we have to expose the 32-bit reduction code in the 16-bit path
|
||
|
#if defined(STP_32BIT) || defined(STP_FORCE_32BIT_REDUCTION)
|
||
|
// Component-wise max reduction of the eight floats in `a` and `b` over each aligned run of
// 16 consecutive lanes; on return every lane of a run holds the run's maximum.
//   i    - calling thread's lane index. Only the groupshared path reads it (to address
//          gs_StpScratch); the wave path identifies lanes through WaveGetLaneIndex().
//   a, b - in: this lane's values. out: the reduced maxima.
void StpPat4x4MaxF8(StpMU1 i, inout StpF4 a, inout StpF4 b)
{
#if defined(STP_ENABLE_WAVEOPS)
    // XOR butterfly: fold in the lane whose index differs in bit 0, then bit 1, bit 2 and bit 3.
    // Each scalar is converted with StpU1_F1 for the exchange and back with StpF1_U1 afterwards.
    uint peer = WaveGetLaneIndex() ^ 1;
    a.x = max(a.x, StpF1_U1(WaveReadLaneAt(StpU1_F1(a.x), peer)));
    a.y = max(a.y, StpF1_U1(WaveReadLaneAt(StpU1_F1(a.y), peer)));
    a.z = max(a.z, StpF1_U1(WaveReadLaneAt(StpU1_F1(a.z), peer)));
    a.w = max(a.w, StpF1_U1(WaveReadLaneAt(StpU1_F1(a.w), peer)));
    b.x = max(b.x, StpF1_U1(WaveReadLaneAt(StpU1_F1(b.x), peer)));
    b.y = max(b.y, StpF1_U1(WaveReadLaneAt(StpU1_F1(b.y), peer)));
    b.z = max(b.z, StpF1_U1(WaveReadLaneAt(StpU1_F1(b.z), peer)));
    b.w = max(b.w, StpF1_U1(WaveReadLaneAt(StpU1_F1(b.w), peer)));

    peer = WaveGetLaneIndex() ^ 2;
    a.x = max(a.x, StpF1_U1(WaveReadLaneAt(StpU1_F1(a.x), peer)));
    a.y = max(a.y, StpF1_U1(WaveReadLaneAt(StpU1_F1(a.y), peer)));
    a.z = max(a.z, StpF1_U1(WaveReadLaneAt(StpU1_F1(a.z), peer)));
    a.w = max(a.w, StpF1_U1(WaveReadLaneAt(StpU1_F1(a.w), peer)));
    b.x = max(b.x, StpF1_U1(WaveReadLaneAt(StpU1_F1(b.x), peer)));
    b.y = max(b.y, StpF1_U1(WaveReadLaneAt(StpU1_F1(b.y), peer)));
    b.z = max(b.z, StpF1_U1(WaveReadLaneAt(StpU1_F1(b.z), peer)));
    b.w = max(b.w, StpF1_U1(WaveReadLaneAt(StpU1_F1(b.w), peer)));

    peer = WaveGetLaneIndex() ^ 4;
    a.x = max(a.x, StpF1_U1(WaveReadLaneAt(StpU1_F1(a.x), peer)));
    a.y = max(a.y, StpF1_U1(WaveReadLaneAt(StpU1_F1(a.y), peer)));
    a.z = max(a.z, StpF1_U1(WaveReadLaneAt(StpU1_F1(a.z), peer)));
    a.w = max(a.w, StpF1_U1(WaveReadLaneAt(StpU1_F1(a.w), peer)));
    b.x = max(b.x, StpF1_U1(WaveReadLaneAt(StpU1_F1(b.x), peer)));
    b.y = max(b.y, StpF1_U1(WaveReadLaneAt(StpU1_F1(b.y), peer)));
    b.z = max(b.z, StpF1_U1(WaveReadLaneAt(StpU1_F1(b.z), peer)));
    b.w = max(b.w, StpF1_U1(WaveReadLaneAt(StpU1_F1(b.w), peer)));

    peer = WaveGetLaneIndex() ^ 8;
    a.x = max(a.x, StpF1_U1(WaveReadLaneAt(StpU1_F1(a.x), peer)));
    a.y = max(a.y, StpF1_U1(WaveReadLaneAt(StpU1_F1(a.y), peer)));
    a.z = max(a.z, StpF1_U1(WaveReadLaneAt(StpU1_F1(a.z), peer)));
    a.w = max(a.w, StpF1_U1(WaveReadLaneAt(StpU1_F1(a.w), peer)));
    b.x = max(b.x, StpF1_U1(WaveReadLaneAt(StpU1_F1(b.x), peer)));
    b.y = max(b.y, StpF1_U1(WaveReadLaneAt(StpU1_F1(b.y), peer)));
    b.z = max(b.z, StpF1_U1(WaveReadLaneAt(StpU1_F1(b.z), peer)));
    b.w = max(b.w, StpF1_U1(WaveReadLaneAt(StpU1_F1(b.w), peer)));
#else
    // `a` is published at gs_StpScratch[i], `b` at gs_StpScratch[i + STP_GROUP_SIZE].

    // Stage 1 (2x2): combine with the other three lanes of this lane's aligned group of 4.
    gs_StpScratch[i] = a;
    gs_StpScratch[i + STP_GROUP_SIZE] = b;

    GroupMemoryBarrierWithGroupSync();

    {
        StpMU1 quadBase = (i & ~StpMU1(3));

        StpMU1 n0 = quadBase + ((i + StpMU1(1)) & StpMU1(3));
        StpMU1 n1 = quadBase + ((i + StpMU1(2)) & StpMU1(3));
        StpMU1 n2 = quadBase + ((i + StpMU1(3)) & StpMU1(3));

        float4 aN0 = gs_StpScratch[n0];
        float4 aN1 = gs_StpScratch[n1];
        float4 aN2 = gs_StpScratch[n2];

        float4 bN0 = gs_StpScratch[n0 + STP_GROUP_SIZE];
        float4 bN1 = gs_StpScratch[n1 + STP_GROUP_SIZE];
        float4 bN2 = gs_StpScratch[n2 + STP_GROUP_SIZE];

        // All reads must complete before the scratch is overwritten for stage 2.
        GroupMemoryBarrierWithGroupSync();

        a = max(max(max(a, aN0), aN1), aN2);
        b = max(max(max(b, bN0), bN1), bN2);
    }

    // Stage 2 (4x4): combine with the lanes 4, 8 and 12 positions away inside the aligned group of 16.
    gs_StpScratch[i] = a;
    gs_StpScratch[i + STP_GROUP_SIZE] = b;

    GroupMemoryBarrierWithGroupSync();

    {
        StpMU1 tileBase = (i & ~StpMU1(15));

        StpMU1 n0 = tileBase + ((i + StpMU1(4)) & StpMU1(15));
        StpMU1 n1 = tileBase + ((i + StpMU1(8)) & StpMU1(15));
        StpMU1 n2 = tileBase + ((i + StpMU1(12)) & StpMU1(15));

        float4 aN0 = gs_StpScratch[n0];
        float4 aN1 = gs_StpScratch[n1];
        float4 aN2 = gs_StpScratch[n2];

        float4 bN0 = gs_StpScratch[n0 + STP_GROUP_SIZE];
        float4 bN1 = gs_StpScratch[n1 + STP_GROUP_SIZE];
        float4 bN2 = gs_StpScratch[n2 + STP_GROUP_SIZE];

        // Keeps the scratch intact until every lane has finished reading it.
        GroupMemoryBarrierWithGroupSync();

        a = max(max(max(a, aN0), aN1), aN2);
        b = max(max(max(b, bN0), bN1), bN2);
    }
#endif
}
|
||
|
// Component-wise sum reduction of the four floats in `a` over each aligned run of 16 consecutive
// lanes; on return every lane of a run holds a sum over the run.
//   i - calling thread's lane index. Only the groupshared path reads it (to address
//       gs_StpScratch); the wave path identifies lanes through WaveGetLaneIndex().
//   a - in: this lane's values. out: the reduced sums.
void StpPat4x4SumF4(StpMU1 i, inout StpF4 a)
{
#if defined(STP_ENABLE_WAVEOPS)
    // XOR butterfly: add the lane whose index differs in bit 0, then bit 1, bit 2 and bit 3.
    uint peer = WaveGetLaneIndex() ^ 1;
    a.x += StpF1_U1(WaveReadLaneAt(StpU1_F1(a.x), peer));
    a.y += StpF1_U1(WaveReadLaneAt(StpU1_F1(a.y), peer));
    a.z += StpF1_U1(WaveReadLaneAt(StpU1_F1(a.z), peer));
    a.w += StpF1_U1(WaveReadLaneAt(StpU1_F1(a.w), peer));

    peer = WaveGetLaneIndex() ^ 2;
    a.x += StpF1_U1(WaveReadLaneAt(StpU1_F1(a.x), peer));
    a.y += StpF1_U1(WaveReadLaneAt(StpU1_F1(a.y), peer));
    a.z += StpF1_U1(WaveReadLaneAt(StpU1_F1(a.z), peer));
    a.w += StpF1_U1(WaveReadLaneAt(StpU1_F1(a.w), peer));

    peer = WaveGetLaneIndex() ^ 4;
    a.x += StpF1_U1(WaveReadLaneAt(StpU1_F1(a.x), peer));
    a.y += StpF1_U1(WaveReadLaneAt(StpU1_F1(a.y), peer));
    a.z += StpF1_U1(WaveReadLaneAt(StpU1_F1(a.z), peer));
    a.w += StpF1_U1(WaveReadLaneAt(StpU1_F1(a.w), peer));

    peer = WaveGetLaneIndex() ^ 8;
    a.x += StpF1_U1(WaveReadLaneAt(StpU1_F1(a.x), peer));
    a.y += StpF1_U1(WaveReadLaneAt(StpU1_F1(a.y), peer));
    a.z += StpF1_U1(WaveReadLaneAt(StpU1_F1(a.z), peer));
    a.w += StpF1_U1(WaveReadLaneAt(StpU1_F1(a.w), peer));
#else
    // Stage 1 (2x2): add the other three lanes of this lane's aligned group of 4.
    gs_StpScratch[i] = a;

    GroupMemoryBarrierWithGroupSync();

    {
        StpMU1 quadBase = (i & ~StpMU1(3));

        StpMU1 n0 = quadBase + ((i + StpMU1(1)) & StpMU1(3));
        StpMU1 n1 = quadBase + ((i + StpMU1(2)) & StpMU1(3));
        StpMU1 n2 = quadBase + ((i + StpMU1(3)) & StpMU1(3));

        float4 v0 = gs_StpScratch[n0];
        float4 v1 = gs_StpScratch[n1];
        float4 v2 = gs_StpScratch[n2];

        // All reads must complete before the scratch is overwritten for stage 2.
        GroupMemoryBarrierWithGroupSync();

        a = a + v0 + v1 + v2;
    }

    // Stage 2 (4x4): add the lanes 4, 8 and 12 positions away inside the aligned group of 16.
    gs_StpScratch[i] = a;

    GroupMemoryBarrierWithGroupSync();

    {
        StpMU1 tileBase = (i & ~StpMU1(15));

        StpMU1 n0 = tileBase + ((i + StpMU1(4)) & StpMU1(15));
        StpMU1 n1 = tileBase + ((i + StpMU1(8)) & StpMU1(15));
        StpMU1 n2 = tileBase + ((i + StpMU1(12)) & StpMU1(15));

        float4 v0 = gs_StpScratch[n0];
        float4 v1 = gs_StpScratch[n1];
        float4 v2 = gs_StpScratch[n2];

        // Keeps the scratch intact until every lane has finished reading it.
        GroupMemoryBarrierWithGroupSync();

        a = a + v0 + v1 + v2;
    }
#endif
}
|
||
|
#endif
|
||
|
|
||
|
#if defined(STP_16BIT)
|
||
|
// 16-bit variant of the 4x4 max reduction: component-wise max of the eight halves in `a` and `b`
// over each aligned run of 16 consecutive lanes.
//   i    - calling thread's lane index (addresses gs_StpScratch on the groupshared path).
//   a, b - in: this lane's values. out: the reduced maxima.
// With STP_FORCE_32BIT_REDUCTION the work is forwarded to StpPat4x4MaxF8. Otherwise halves travel
// two at a time: StpU1_H2 turns a half pair into one uint, StpH2_U1 turns it back.
void StpPat4x4MaxH8(StpW1 i, inout StpH4 a, inout StpH4 b)
{
#if defined(STP_FORCE_32BIT_REDUCTION)
    StpPat4x4MaxF8(i, a, b);
#elif defined(STP_ENABLE_WAVEOPS)
    // XOR butterfly over lane-index bits 0..3.
    uint peer = WaveGetLaneIndex() ^ 1;
    a.xy = max(a.xy, StpH2_U1(WaveReadLaneAt(StpU1_H2(a.xy), peer)));
    a.zw = max(a.zw, StpH2_U1(WaveReadLaneAt(StpU1_H2(a.zw), peer)));
    b.xy = max(b.xy, StpH2_U1(WaveReadLaneAt(StpU1_H2(b.xy), peer)));
    b.zw = max(b.zw, StpH2_U1(WaveReadLaneAt(StpU1_H2(b.zw), peer)));

    peer = WaveGetLaneIndex() ^ 2;
    a.xy = max(a.xy, StpH2_U1(WaveReadLaneAt(StpU1_H2(a.xy), peer)));
    a.zw = max(a.zw, StpH2_U1(WaveReadLaneAt(StpU1_H2(a.zw), peer)));
    b.xy = max(b.xy, StpH2_U1(WaveReadLaneAt(StpU1_H2(b.xy), peer)));
    b.zw = max(b.zw, StpH2_U1(WaveReadLaneAt(StpU1_H2(b.zw), peer)));

    peer = WaveGetLaneIndex() ^ 4;
    a.xy = max(a.xy, StpH2_U1(WaveReadLaneAt(StpU1_H2(a.xy), peer)));
    a.zw = max(a.zw, StpH2_U1(WaveReadLaneAt(StpU1_H2(a.zw), peer)));
    b.xy = max(b.xy, StpH2_U1(WaveReadLaneAt(StpU1_H2(b.xy), peer)));
    b.zw = max(b.zw, StpH2_U1(WaveReadLaneAt(StpU1_H2(b.zw), peer)));

    peer = WaveGetLaneIndex() ^ 8;
    a.xy = max(a.xy, StpH2_U1(WaveReadLaneAt(StpU1_H2(a.xy), peer)));
    a.zw = max(a.zw, StpH2_U1(WaveReadLaneAt(StpU1_H2(a.zw), peer)));
    b.xy = max(b.xy, StpH2_U1(WaveReadLaneAt(StpU1_H2(b.xy), peer)));
    b.zw = max(b.zw, StpH2_U1(WaveReadLaneAt(StpU1_H2(b.zw), peer)));
#else
    // Scratch layout per lane: x = a.xy, y = a.zw, z = b.xy, w = b.zw.

    // Stage 1 (2x2): combine with the other three lanes of this lane's aligned group of 4.
    gs_StpScratch[i] = StpU4(StpU1_H2(a.xy), StpU1_H2(a.zw), StpU1_H2(b.xy), StpU1_H2(b.zw));

    GroupMemoryBarrierWithGroupSync();

    {
        StpW1 quadBase = (i & ~StpW1(3));

        StpW1 n0 = quadBase + ((i + StpW1(1)) & StpW1(3));
        StpW1 n1 = quadBase + ((i + StpW1(2)) & StpW1(3));
        StpW1 n2 = quadBase + ((i + StpW1(3)) & StpW1(3));

        uint4 p0 = gs_StpScratch[n0];
        uint4 p1 = gs_StpScratch[n1];
        uint4 p2 = gs_StpScratch[n2];

        // All reads must complete before the scratch is overwritten for stage 2.
        GroupMemoryBarrierWithGroupSync();

        a.xy = max(max(max(a.xy, StpH2_U1(p0.x)), StpH2_U1(p1.x)), StpH2_U1(p2.x));
        a.zw = max(max(max(a.zw, StpH2_U1(p0.y)), StpH2_U1(p1.y)), StpH2_U1(p2.y));
        b.xy = max(max(max(b.xy, StpH2_U1(p0.z)), StpH2_U1(p1.z)), StpH2_U1(p2.z));
        b.zw = max(max(max(b.zw, StpH2_U1(p0.w)), StpH2_U1(p1.w)), StpH2_U1(p2.w));
    }

    // Stage 2 (4x4): combine with the lanes 4, 8 and 12 positions away inside the aligned group of 16.
    gs_StpScratch[i] = StpU4(StpU1_H2(a.xy), StpU1_H2(a.zw), StpU1_H2(b.xy), StpU1_H2(b.zw));

    GroupMemoryBarrierWithGroupSync();

    {
        StpW1 tileBase = (i & ~StpW1(15));

        StpW1 n0 = tileBase + ((i + StpW1(4)) & StpW1(15));
        StpW1 n1 = tileBase + ((i + StpW1(8)) & StpW1(15));
        StpW1 n2 = tileBase + ((i + StpW1(12)) & StpW1(15));

        uint4 p0 = gs_StpScratch[n0];
        uint4 p1 = gs_StpScratch[n1];
        uint4 p2 = gs_StpScratch[n2];

        // Keeps the scratch intact until every lane has finished reading it.
        GroupMemoryBarrierWithGroupSync();

        a.xy = max(max(max(a.xy, StpH2_U1(p0.x)), StpH2_U1(p1.x)), StpH2_U1(p2.x));
        a.zw = max(max(max(a.zw, StpH2_U1(p0.y)), StpH2_U1(p1.y)), StpH2_U1(p2.y));
        b.xy = max(max(max(b.xy, StpH2_U1(p0.z)), StpH2_U1(p1.z)), StpH2_U1(p2.z));
        b.zw = max(max(max(b.zw, StpH2_U1(p0.w)), StpH2_U1(p1.w)), StpH2_U1(p2.w));
    }
#endif
}
|
||
|
// 16-bit variant of the 4x4 sum reduction: component-wise sum of the four halves in `a` over
// each aligned run of 16 consecutive lanes.
//   i - calling thread's lane index (addresses gs_StpScratch on the groupshared path).
//   a - in: this lane's values. out: the reduced sums.
// With STP_FORCE_32BIT_REDUCTION the work is forwarded to StpPat4x4SumF4. Otherwise halves travel
// two at a time: StpU1_H2 turns a half pair into one uint, StpH2_U1 turns it back.
void StpPat4x4SumH4(StpW1 i, inout StpH4 a)
{
#if defined(STP_FORCE_32BIT_REDUCTION)
    StpPat4x4SumF4(i, a);
#elif defined(STP_ENABLE_WAVEOPS)
    // XOR butterfly over lane-index bits 0..3.
    uint peer = WaveGetLaneIndex() ^ 1;
    a.xy += StpH2_U1(WaveReadLaneAt(StpU1_H2(a.xy), peer));
    a.zw += StpH2_U1(WaveReadLaneAt(StpU1_H2(a.zw), peer));

    peer = WaveGetLaneIndex() ^ 2;
    a.xy += StpH2_U1(WaveReadLaneAt(StpU1_H2(a.xy), peer));
    a.zw += StpH2_U1(WaveReadLaneAt(StpU1_H2(a.zw), peer));

    peer = WaveGetLaneIndex() ^ 4;
    a.xy += StpH2_U1(WaveReadLaneAt(StpU1_H2(a.xy), peer));
    a.zw += StpH2_U1(WaveReadLaneAt(StpU1_H2(a.zw), peer));

    peer = WaveGetLaneIndex() ^ 8;
    a.xy += StpH2_U1(WaveReadLaneAt(StpU1_H2(a.xy), peer));
    a.zw += StpH2_U1(WaveReadLaneAt(StpU1_H2(a.zw), peer));
#else
    // Only the .xy half of each scratch entry is used: x = a.xy, y = a.zw.

    // Stage 1 (2x2): add the other three lanes of this lane's aligned group of 4.
    gs_StpScratch[i].xy = StpU2(StpU1_H2(a.xy), StpU1_H2(a.zw));

    GroupMemoryBarrierWithGroupSync();

    {
        StpW1 quadBase = (i & ~StpW1(3));

        StpW1 n0 = quadBase + ((i + StpW1(1)) & StpW1(3));
        StpW1 n1 = quadBase + ((i + StpW1(2)) & StpW1(3));
        StpW1 n2 = quadBase + ((i + StpW1(3)) & StpW1(3));

        uint2 p0 = gs_StpScratch[n0].xy;
        uint2 p1 = gs_StpScratch[n1].xy;
        uint2 p2 = gs_StpScratch[n2].xy;

        // All reads must complete before the scratch is overwritten for stage 2.
        GroupMemoryBarrierWithGroupSync();

        a.xy = a.xy + StpH2_U1(p0.x) + StpH2_U1(p1.x) + StpH2_U1(p2.x);
        a.zw = a.zw + StpH2_U1(p0.y) + StpH2_U1(p1.y) + StpH2_U1(p2.y);
    }

    // Stage 2 (4x4): add the lanes 4, 8 and 12 positions away inside the aligned group of 16.
    gs_StpScratch[i].xy = StpU2(StpU1_H2(a.xy), StpU1_H2(a.zw));

    GroupMemoryBarrierWithGroupSync();

    {
        StpW1 tileBase = (i & ~StpW1(15));

        StpW1 n0 = tileBase + ((i + StpW1(4)) & StpW1(15));
        StpW1 n1 = tileBase + ((i + StpW1(8)) & StpW1(15));
        StpW1 n2 = tileBase + ((i + StpW1(12)) & StpW1(15));

        uint2 p0 = gs_StpScratch[n0].xy;
        uint2 p1 = gs_StpScratch[n1].xy;
        uint2 p2 = gs_StpScratch[n2].xy;

        // Keeps the scratch intact until every lane has finished reading it.
        GroupMemoryBarrierWithGroupSync();

        a.xy = a.xy + StpH2_U1(p0.x) + StpH2_U1(p1.x) + StpH2_U1(p2.x);
        a.zw = a.zw + StpH2_U1(p0.y) + StpH2_U1(p1.y) + StpH2_U1(p2.y);
    }
#endif
}
|
||
|
// Samples _StpPriorConvergence at coordinate `p` (s_linear_clamp_sampler, LOD 0), cast to StpH1.
StpH1 StpPatPriConH(StpF2 p)
{
    return (StpH1)SAMPLE_TEXTURE2D_X_LOD(_StpPriorConvergence, s_linear_clamp_sampler, p, 0);
}
|
||
|
|
||
|
// These are separate to support inline operation (pass merged instead of loads).
|
||
|
// Loads the .xy of _StpInputMotion at pixel `o`, mip 0.
StpF2 StpPatDatMotH(StpW2 o)
{
    return LOAD_TEXTURE2D_X_LOD(_StpInputMotion, o, 0).xy;
}
|
||
|
// Loads the .rgb of _StpInputColor at pixel `o`, mip 0, cast to StpH3.
StpH3 StpPatDatColH(StpW2 o)
{
    return (StpH3)LOAD_TEXTURE2D_X_LOD(_StpInputColor, o, 0).rgb;
}
|
||
|
// Loads the .x of _StpInputDepth at pixel `o`, mip 0.
StpF1 StpPatDatZH(StpW2 o)
{
    return LOAD_TEXTURE2D_X_LOD(_StpInputDepth, o, 0).x;
}
|
||
|
// Hook for converting Z from depth to linear when the value is actually loaded rather than inlined.
// Returns 1.0 / (STP_ZBUFFER_PARAMS_Z * z + STP_ZBUFFER_PARAMS_W).
StpF1 StpPatFixZH(StpF1 z)
{
    return 1.0 / (STP_ZBUFFER_PARAMS_Z * z + STP_ZBUFFER_PARAMS_W);
}
|
||
|
// Fetches the per-pixel input for the "responsive" feature at pixel `o`.
//   ENABLE_STENCIL_RESPONSIVE defined: GetStencilValue() of the .xy of _StpInputStencil at `o`, mip 0.
//   otherwise:                         0.
StpU1 StpPatDatRH(StpW2 o)
{
#if defined(ENABLE_STENCIL_RESPONSIVE)
    return GetStencilValue(LOAD_TEXTURE2D_X_LOD(_StpInputStencil, o, 0).xy);
#else
    // Kept in the #else branch so each variant compiles exactly one return
    // (no unreachable statement after the stencil return).
    return StpU1_(0);
#endif // defined(ENABLE_STENCIL_RESPONSIVE)
}
|
||
|
// Converts the value produced by StpPatDatRH into a 0/1 factor.
// Returns 1.0 only when the history is valid and, with ENABLE_STENCIL_RESPONSIVE, `v` has no bit
// in common with the stencil mask decoded from STP_COMMON_CONSTANT; returns 0.0 otherwise.
StpH1 StpPatFixRH(StpU1 v)
{
    // The "responsive" feature is activated when we don't have valid history textures.
    bool historyIsValid = DecodeHasValidHistory(STP_COMMON_CONSTANT);

    bool stencilExcluded = false;
#if defined(ENABLE_STENCIL_RESPONSIVE)
    stencilExcluded = (v & DecodeStencilMask(STP_COMMON_CONSTANT)) != 0;
#endif // defined(ENABLE_STENCIL_RESPONSIVE)

    bool useHistory = historyIsValid && !stencilExcluded;
    if (useHistory)
    {
        return StpH1_(1.0);
    }
    else
    {
        return StpH1_(0.0);
    }
}
|
||
|
|
||
|
// Dither value for pixel `o`; forwards to StpDitH1.
StpH1 StpPatDitH(StpW2 o)
{
    return StpDitH1(o);
}
|
||
|
// Samples _StpPriorFeedback at coordinate `p` (s_linear_clamp_sampler, LOD 0), cast to StpH4.
StpH4 StpPatPriFedH(StpF2 p)
{
    return (StpH4)SAMPLE_TEXTURE2D_X_LOD(_StpPriorFeedback, s_linear_clamp_sampler, p, 0);
}
|
||
|
// Gathers the red channel of _StpPriorFeedback around `p` (s_point_clamp_sampler), cast to StpH4.
StpH4 StpPatPriFedR4H(StpF2 p)
{
    return (StpH4)GATHER_RED_TEXTURE2D_X(_StpPriorFeedback, s_point_clamp_sampler, p);
}
|
||
|
// Gathers the green channel of _StpPriorFeedback around `p` (s_point_clamp_sampler), cast to StpH4.
StpH4 StpPatPriFedG4H(StpF2 p)
{
    return (StpH4)GATHER_GREEN_TEXTURE2D_X(_StpPriorFeedback, s_point_clamp_sampler, p);
}
|
||
|
// Gathers the blue channel of _StpPriorFeedback around `p` (s_point_clamp_sampler), cast to StpH4.
StpH4 StpPatPriFedB4H(StpF2 p)
{
    return (StpH4)GATHER_BLUE_TEXTURE2D_X(_StpPriorFeedback, s_point_clamp_sampler, p);
}
|
||
|
// Samples _StpPriorLuma at coordinate `p` (s_linear_clamp_sampler, LOD 0), cast to StpH2.
StpH2 StpPatPriLumH(StpF2 p)
{
    return (StpH2)SAMPLE_TEXTURE2D_X_LOD(_StpPriorLuma, s_linear_clamp_sampler, p, 0);
}
|
||
|
// Gathers the red channel of _StpPriorDepthMotion around `p` (s_point_clamp_sampler).
StpU4 StpPatPriMot4H(StpF2 p)
{
    return GATHER_RED_TEXTURE2D_X(_StpPriorDepthMotion, s_point_clamp_sampler, p);
}
|
||
|
|
||
|
// Writes `v` to _StpDepthMotion at pixel `p`.
void StpPatStMotH(StpW2 p, StpU1 v)
{
    _StpDepthMotion[COORD_TEXTURE2D_X(p)] = v;
}
|
||
|
// Writes `v` to _StpIntermediateColor at pixel `p`.
void StpPatStColH(StpW2 p, StpH4 v)
{
    _StpIntermediateColor[COORD_TEXTURE2D_X(p)] = v;
}
|
||
|
// Writes `v` to _StpLuma at pixel `p`.
void StpPatStLumH(StpW2 p, StpH2 v)
{
    _StpLuma[COORD_TEXTURE2D_X(p)] = v;
}
|
||
|
// Writes `v` to _StpIntermediateConvergence at `p >> 2`, i.e. one texel per 4x4 block of pixel positions.
void StpPatStCnvH(StpW2 p, StpH1 v)
{
    _StpIntermediateConvergence[COORD_TEXTURE2D_X(p >> StpW1(2))] = v;
}
|
||
|
#endif
|
||
|
|
||
|
#if defined(STP_32BIT)
|
||
|
// Samples _StpPriorConvergence at coordinate `p` (s_linear_clamp_sampler, LOD 0), cast to StpMF1.
StpMF1 StpPatPriConF(StpF2 p)
{
    return (StpMF1)SAMPLE_TEXTURE2D_X_LOD(_StpPriorConvergence, s_linear_clamp_sampler, p, 0);
}
|
||
|
|
||
|
// These are separate to support inline operation (pass merged instead of loads).
|
||
|
// Loads the .xy of _StpInputMotion at pixel `o`, mip 0.
StpF2 StpPatDatMotF(StpMU2 o)
{
    return LOAD_TEXTURE2D_X_LOD(_StpInputMotion, o, 0).xy;
}
|
||
|
// Loads the .rgb of _StpInputColor at pixel `o`, mip 0, cast to StpMF3.
StpMF3 StpPatDatColF(StpMU2 o)
{
    return (StpMF3)LOAD_TEXTURE2D_X_LOD(_StpInputColor, o, 0).rgb;
}
|
||
|
// Loads the .x of _StpInputDepth at pixel `o`, mip 0.
StpF1 StpPatDatZF(StpMU2 o)
{
    return LOAD_TEXTURE2D_X_LOD(_StpInputDepth, o, 0).x;
}
|
||
|
// Hook for converting Z from depth to linear when the value is actually loaded rather than inlined.
// Returns 1.0 / (STP_ZBUFFER_PARAMS_Z * z + STP_ZBUFFER_PARAMS_W).
StpF1 StpPatFixZF(StpF1 z)
{
    return 1.0 / (STP_ZBUFFER_PARAMS_Z * z + STP_ZBUFFER_PARAMS_W);
}
|
||
|
// Fetches the per-pixel input for the "responsive" feature at pixel `o`.
//   ENABLE_STENCIL_RESPONSIVE defined: GetStencilValue() of the .xy of _StpInputStencil at `o`, mip 0.
//   otherwise:                         0.
StpU1 StpPatDatRF(StpMU2 o)
{
#if defined(ENABLE_STENCIL_RESPONSIVE)
    return GetStencilValue(LOAD_TEXTURE2D_X_LOD(_StpInputStencil, o, 0).xy);
#else
    // Kept in the #else branch so each variant compiles exactly one return
    // (no unreachable statement after the stencil return).
    return StpU1_(0);
#endif // defined(ENABLE_STENCIL_RESPONSIVE)
}
|
||
|
// Converts the value produced by StpPatDatRF into a 0/1 factor.
// Returns 1.0 only when the history is valid and, with ENABLE_STENCIL_RESPONSIVE, `v` has no bit
// in common with the stencil mask decoded from STP_COMMON_CONSTANT; returns 0.0 otherwise.
StpMF1 StpPatFixRF(StpU1 v)
{
    // The "responsive" feature is activated when we don't have valid history textures.
    bool historyIsValid = DecodeHasValidHistory(STP_COMMON_CONSTANT);

    bool stencilExcluded = false;
#if defined(ENABLE_STENCIL_RESPONSIVE)
    stencilExcluded = (v & DecodeStencilMask(STP_COMMON_CONSTANT)) != 0;
#endif // defined(ENABLE_STENCIL_RESPONSIVE)

    bool useHistory = historyIsValid && !stencilExcluded;
    if (useHistory)
    {
        return StpMF1_(1.0);
    }
    else
    {
        return StpMF1_(0.0);
    }
}
|
||
|
|
||
|
// Dither value for pixel `o`; forwards to StpDitF1 and casts the result to StpMF1.
StpMF1 StpPatDitF(StpMU2 o)
{
    return (StpMF1)StpDitF1(o);
}
|
||
|
// Samples _StpPriorFeedback at coordinate `p` (s_linear_clamp_sampler, LOD 0), cast to StpMF4.
StpMF4 StpPatPriFedF(StpF2 p)
{
    return (StpMF4)SAMPLE_TEXTURE2D_X_LOD(_StpPriorFeedback, s_linear_clamp_sampler, p, 0);
}
|
||
|
// Gathers the red channel of _StpPriorFeedback around `p` (s_point_clamp_sampler), cast to StpMF4.
StpMF4 StpPatPriFedR4F(StpF2 p)
{
    return (StpMF4)GATHER_RED_TEXTURE2D_X(_StpPriorFeedback, s_point_clamp_sampler, p);
}
|
||
|
// Gathers the green channel of _StpPriorFeedback around `p` (s_point_clamp_sampler), cast to StpMF4.
StpMF4 StpPatPriFedG4F(StpF2 p)
{
    return (StpMF4)GATHER_GREEN_TEXTURE2D_X(_StpPriorFeedback, s_point_clamp_sampler, p);
}
|
||
|
// Gathers the blue channel of _StpPriorFeedback around `p` (s_point_clamp_sampler), cast to StpMF4.
StpMF4 StpPatPriFedB4F(StpF2 p)
{
    return (StpMF4)GATHER_BLUE_TEXTURE2D_X(_StpPriorFeedback, s_point_clamp_sampler, p);
}
|
||
|
// Samples _StpPriorLuma at coordinate `p` (s_linear_clamp_sampler, LOD 0), cast to StpMF2.
StpMF2 StpPatPriLumF(StpF2 p)
{
    return (StpMF2)SAMPLE_TEXTURE2D_X_LOD(_StpPriorLuma, s_linear_clamp_sampler, p, 0);
}
|
||
|
// Gathers the red channel of _StpPriorDepthMotion around `p` (s_point_clamp_sampler).
StpU4 StpPatPriMot4F(StpF2 p)
{
    return GATHER_RED_TEXTURE2D_X(_StpPriorDepthMotion, s_point_clamp_sampler, p);
}
|
||
|
|
||
|
// Writes `v` to _StpDepthMotion at pixel `p`.
void StpPatStMotF(StpMU2 p, StpU1 v)
{
    _StpDepthMotion[COORD_TEXTURE2D_X(p)] = v;
}
|
||
|
// Writes `v` to _StpIntermediateColor at pixel `p`.
void StpPatStColF(StpMU2 p, StpMF4 v)
{
    _StpIntermediateColor[COORD_TEXTURE2D_X(p)] = v;
}
|
||
|
// Writes `v` to _StpLuma at pixel `p`.
void StpPatStLumF(StpMU2 p, StpMF2 v)
{
    _StpLuma[COORD_TEXTURE2D_X(p)] = v;
}
|
||
|
// Writes `v` to _StpIntermediateConvergence at `p >> 2`, i.e. one texel per 4x4 block of pixel positions.
void StpPatStCnvF(StpMU2 p, StpMF1 v)
{
    _StpIntermediateConvergence[COORD_TEXTURE2D_X(p >> StpMU1(2))] = v;
}
|
||
|
#endif
|
||
|
|
||
|
#define THREADING_BLOCK_SIZE STP_GROUP_SIZE
|
||
|
#include "Packages/com.unity.render-pipelines.core/ShaderLibrary/Threading.hlsl"
|
||
|
|
||
|
// Kernel entry point of the STP setup pass; each thread group runs STP_GROUP_SIZE threads.
// Assigns the XR view index from groupID.z, derives this thread's lane and pixel position, then
// calls StpPatH (STP_16BIT) or StpPatF with the six global setup constants and the eight
// per-view constants of the current view.
[numthreads(STP_GROUP_SIZE, 1, 1)]
void StpSetup(Threading::Group group)
{
    UNITY_XR_ASSIGN_VIEW_INDEX(group.groupID.z);

#if defined(STP_16BIT)
    StpW1 laneId = StpW1_(group.groupIndex);
    StpW2 groupOrigin = ComputeGroupPos(StpW2(group.groupID.xy));
    StpW2 pixelPos = groupOrigin + StpRemapLaneTo8x16H(laneId);
#else
    StpMU1 laneId = StpMU1_(group.groupIndex);
    StpMU2 groupOrigin = ComputeGroupPos(StpMU2(group.groupID.xy));
    StpMU2 pixelPos = groupOrigin + StpRemapLaneTo8x16F(laneId);
#endif

    // Index of the first per-view constant belonging to the current view.
    uint viewBase = STP_SETUP_PER_VIEW_CONSTANTS_STEREO_OFFSET;

    // The opening of the call depends on the precision path; the constant arguments are shared.
#if defined(STP_16BIT)
    StpPatH(
#else
    StpPatF(
#endif
        laneId,
        pixelPos,

        asuint(_StpSetupConstants0),
        asuint(_StpSetupConstants1),
        asuint(_StpSetupConstants2),
        asuint(_StpSetupConstants3),
        asuint(_StpSetupConstants4),
        asuint(_StpSetupConstants5),

        asuint(_StpSetupPerViewConstants[viewBase + 0]),
        asuint(_StpSetupPerViewConstants[viewBase + 1]),
        asuint(_StpSetupPerViewConstants[viewBase + 2]),
        asuint(_StpSetupPerViewConstants[viewBase + 3]),
        asuint(_StpSetupPerViewConstants[viewBase + 4]),
        asuint(_StpSetupPerViewConstants[viewBase + 5]),
        asuint(_StpSetupPerViewConstants[viewBase + 6]),
        asuint(_StpSetupPerViewConstants[viewBase + 7])
    );
}
|
||
|
|