Function execute
Synopsis
#include <Source/Falcor/Utils/Algorithm/ComputeParallelReduction.h>
template <typename T>
bool execute(RenderContext *pRenderContext, const Texture::SharedPtr &pInput, Type operation, T *pResult=nullptr, Buffer::SharedPtr pResultBuffer=nullptr, uint64_t resultOffset=0)
Description
Perform parallel reduction. The computations are performed in type T, which must be compatible with the texture format:
- float4 for floating-point texture formats (float, snorm, unorm).
- uint4 for unsigned integer texture formats.
- int4 for signed integer texture formats.
For the Sum operation, unused components are set to zero if the texture format has fewer than 4 components.
For performance reasons, it is advisable to store the result in a buffer on the GPU, and then issue an asynchronous readback in user code to avoid a full GPU flush.
The size of the result buffer depends on the executed operation:
- Sum needs 16B
- MinMax needs 32B
Parameters
[ in ]
pRenderContext
- The render context.
[ in ]
pInput
- Input texture.
[ in ]
operation
- Reduction operation.
[ out ]
pResult
- (Optional) The result of the reduction operation is stored here if non-nullptr. Note that this requires a GPU flush!
[ out ]
pResultBuffer
- (Optional) Buffer on the GPU to which the result is copied (16B or 32B).
[ in ]
resultOffset
- (Optional) Byte offset into pResultBuffer at which the result is stored.
Returns
- True if successful, false if an error occurred.
Source
Lines 73-206 in Source/Falcor/Utils/Algorithm/ComputeParallelReduction.cpp. Line 87 in Source/Falcor/Utils/Algorithm/ComputeParallelReduction.h.
/** Perform a parallel reduction over all pixels of a texture.

    The reduction runs in two stages: an initial pass reduces tiles of pixels of the input
    texture into an intermediate buffer, and then zero or more final passes repeatedly reduce
    that buffer until a single element remains.

    The computations are performed in type T, whose value_type must be a 32-bit type matching
    the texture format (floating-point for float/unorm/snorm formats, signed integer for sint
    formats, unsigned integer for uint formats).

    The result occupies 16 bytes for Type::Sum and 32 bytes for Type::MinMax.

    \param[in] pRenderContext The render context used to dispatch the passes and copy the result.
    \param[in] pInput Input texture. Must have exactly one array slice, one mip level and one sample.
    \param[in] operation Reduction operation.
    \param[out] pResult (Optional) If non-null, the result is read back to the CPU and copied here.
        The destination must have room for the full result size (16 or 32 bytes).
    \param[out] pResultBuffer (Optional) If non-null, the result is copied into this GPU buffer.
    \param[in] resultOffset (Optional) Byte offset into pResultBuffer at which the result is stored.
    \return True if successful, false if an error occurred. All argument validation is done before
        any GPU work is recorded, so a false return means nothing was dispatched.
*/
template<typename T>
bool ComputeParallelReduction::execute(RenderContext* pRenderContext, const Texture::SharedPtr& pInput, Type operation, T* pResult, Buffer::SharedPtr pResultBuffer, uint64_t resultOffset)
{
    PROFILE("ComputeParallelReduction::execute");

    assert(pRenderContext);

    // Reject a null input up front; everything below dereferences it.
    if (!pInput)
    {
        logError("ComputeParallelReduction::execute() - Input texture is null. Aborting.");
        return false;
    }

    // Check texture array/mip/sample count.
    if (pInput->getArraySize() != 1 || pInput->getMipCount() != 1 || pInput->getSampleCount() != 1)
    {
        logError("ComputeParallelReduction::execute() - Input texture is unsupported. Aborting.");
        return false;
    }

    // Check texture format.
    uint32_t formatType = FORMAT_TYPE_UNKNOWN;
    switch (getFormatType(pInput->getFormat()))
    {
    case FormatType::Float:
    case FormatType::Unorm:
    case FormatType::Snorm:
        formatType = FORMAT_TYPE_FLOAT;
        break;
    case FormatType::Sint:
        formatType = FORMAT_TYPE_SINT;
        break;
    case FormatType::Uint:
        formatType = FORMAT_TYPE_UINT;
        break;
    default:
        logError("ComputeParallelReduction::execute() - Input texture format unsupported. Aborting.");
        return false;
    }

    // Check that reduction type T is compatible with the resource format.
    // T::value_type is a dependent type, so it needs the 'typename' disambiguator to be
    // well-formed on conforming compilers.
    using ValueType = typename T::value_type;
    if (sizeof(ValueType) != 4 || // The shader is written for 32-bit types
        (formatType == FORMAT_TYPE_FLOAT && !std::is_floating_point<ValueType>::value) ||
        (formatType == FORMAT_TYPE_SINT && (!std::is_integral<ValueType>::value || !std::is_signed<ValueType>::value)) ||
        (formatType == FORMAT_TYPE_UINT && (!std::is_integral<ValueType>::value || !std::is_unsigned<ValueType>::value)))
    {
        logError("ComputeParallelReduction::execute() - Template type T is not compatible with resource format. Aborting.");
        return false;
    }

    // Select the reduction and the number of 16-byte elements it produces.
    uint32_t reductionType = REDUCTION_TYPE_UNKNOWN;
    uint32_t elementSize = 0;
    switch (operation)
    {
    case Type::Sum:
        reductionType = REDUCTION_TYPE_SUM;
        elementSize = 1;
        break;
    case Type::MinMax:
        reductionType = REDUCTION_TYPE_MINMAX;
        elementSize = 2;
        break;
    default:
        logError("ComputeParallelReduction::execute() - Unknown reduction type. Aborting.");
        return false;
    }
    assert(elementSize > 0);

    // Size of the final result in bytes (16 bytes per element).
    const size_t resultSize = static_cast<size_t>(elementSize) * 16;

    // Validate the destination buffer before recording any GPU work.
    // The comparison is arranged so that a large resultOffset cannot wrap around.
    if (pResultBuffer)
    {
        const uint64_t bufferSize = pResultBuffer->getSize();
        if (resultSize > bufferSize || resultOffset > bufferSize - resultSize)
        {
            logError("ComputeParallelReduction::execute() - Results buffer is too small. Aborting.");
            return false;
        }
    }

    // Allocate intermediate buffers if needed.
    const uint2 resolution = uint2(pInput->getWidth(), pInput->getHeight());
    assert(resolution.x > 0 && resolution.y > 0);
    const uint2 numTiles = div_round_up(resolution, uint2(mpInitialProgram->getReflector()->getThreadGroupSize()));
    allocate(numTiles.x * numTiles.y, elementSize);
    assert(mpBuffers[0]);
    assert(mpBuffers[1]);

    // Configure program.
    const uint32_t channelCount = getFormatChannelCount(pInput->getFormat());
    assert(channelCount >= 1 && channelCount <= 4);

    Program::DefineList defines;
    defines.add("REDUCTION_TYPE", std::to_string(reductionType));
    defines.add("FORMAT_CHANNELS", std::to_string(channelCount));
    defines.add("FORMAT_TYPE", std::to_string(formatType));

    mpInitialProgram->addDefines(defines);
    mpFinalProgram->addDefines(defines);

    // Initial pass: Reduction over tiles of pixels in input texture.
    mpVars["PerFrameCB"]["gResolution"] = resolution;
    mpVars["PerFrameCB"]["gNumTiles"] = numTiles;
    mpVars["gInput"] = pInput;
    mpVars->setBuffer("gResult", mpBuffers[0]);

    mpState->setProgram(mpInitialProgram);
    const uint3 numInitialGroups = div_round_up(uint3(resolution.x, resolution.y, 1), mpInitialProgram->getReflector()->getThreadGroupSize());
    pRenderContext->dispatch(mpState.get(), mpVars.get(), numInitialGroups);

    // Final pass(es): Reduction by a factor N for each pass.
    // The two intermediate buffers alternate roles: each pass reads from one and writes to the
    // other, and inputsBufferIndex always names the buffer holding the latest results.
    uint elems = numTiles.x * numTiles.y;
    uint inputsBufferIndex = 0;

    while (elems > 1)
    {
        mpVars["PerFrameCB"]["gElems"] = elems;
        mpVars->setBuffer("gInputBuffer", mpBuffers[inputsBufferIndex]);
        mpVars->setBuffer("gResult", mpBuffers[1 - inputsBufferIndex]);

        mpState->setProgram(mpFinalProgram);
        const uint32_t numFinalGroups = div_round_up(elems, mpFinalProgram->getReflector()->getThreadGroupSize().x);
        pRenderContext->dispatch(mpState.get(), mpVars.get(), { numFinalGroups, 1, 1 });

        inputsBufferIndex = 1 - inputsBufferIndex;
        elems = numFinalGroups;
    }

    // Copy the result to GPU buffer (bounds were validated above).
    if (pResultBuffer)
    {
        pRenderContext->copyBufferRegion(pResultBuffer.get(), resultOffset, mpBuffers[inputsBufferIndex].get(), 0, resultSize);
    }

    // Read back the result to the CPU.
    if (pResult)
    {
        const T* pBuf = static_cast<const T*>(mpBuffers[inputsBufferIndex]->map(Buffer::MapType::Read));
        assert(pBuf);
        std::memcpy(pResult, pBuf, resultSize);
        mpBuffers[inputsBufferIndex]->unmap();
    }

    return true;
}