Function execute

Synopsis

#include <Source/Falcor/Utils/Algorithm/ComputeParallelReduction.h>

template <typename T>
bool execute(RenderContext *pRenderContext, const Texture::SharedPtr &pInput, Type operation, T *pResult=nullptr, Buffer::SharedPtr pResultBuffer=nullptr, uint64_t resultOffset=0)

Description

Perform parallel reduction. The computations are performed in type T, which must be compatible with the texture format:

  • float4 for floating-point texture formats (float, snorm, unorm).
  • uint4 for unsigned integer texture formats.
  • int4 for signed integer texture formats.

For the Sum operation, unused components are set to zero if the texture format has fewer than 4 components.

For performance reasons, it is advisable to store the result in a buffer on the GPU, and then issue an asynchronous readback in user code to avoid a full GPU flush.

The size of the result buffer depends on the executed operation:

  • Sum needs 16B
  • MinMax needs 32B
Parameters:

[ in ] pRenderContext - The render context.

[ in ] pInput - Input texture.

[ in ] operation - Reduction operation.

[ out ] pResult - (Optional) The result of the reduction operation is stored here if non-nullptr. Note that this requires a GPU flush!

[ out ] pResultBuffer - (Optional) Buffer on the GPU to which the result is copied (16B or 32B).

[ in ] resultOffset - (Optional) Byte offset into pResultBuffer to where the result should be stored.

Return
True if successful, false if an error occurred.

Source

Lines 73-206 in Source/Falcor/Utils/Algorithm/ComputeParallelReduction.cpp. Line 87 in Source/Falcor/Utils/Algorithm/ComputeParallelReduction.h.

/** Perform a parallel reduction over the pixels of a texture.

    The input must have exactly one array slice, one mip level and one sample.
    The reduction runs in type T, whose component type (T::value_type) must be
    32 bits wide and match the texture format class (floating-point, signed
    integer or unsigned integer).

    The result occupies 16 bytes per element: one element (16B) for Sum and
    two elements (32B) for MinMax.

    \param[in] pRenderContext The render context used to dispatch the passes.
    \param[in] pInput Input texture.
    \param[in] operation Reduction operation.
    \param[out] pResult (Optional) CPU destination for the result. Must have room for the
        full result (16B for Sum, 32B for MinMax). Reading back maps the internal buffer.
    \param[out] pResultBuffer (Optional) GPU buffer the result is copied into.
    \param[in] resultOffset (Optional) Byte offset into pResultBuffer at which the result is stored.
    \return True if successful, false if an error occurred.
*/
template<typename T>
bool ComputeParallelReduction::execute(RenderContext* pRenderContext, const Texture::SharedPtr& pInput, Type operation, T* pResult, Buffer::SharedPtr pResultBuffer, uint64_t resultOffset)
{
    PROFILE("ComputeParallelReduction::execute");

    assert(pRenderContext);

    // Reject a null texture explicitly instead of crashing on the first dereference below.
    if (!pInput)
    {
        logError("ComputeParallelReduction::execute() - Input texture is null. Aborting.");
        return false;
    }

    // Check texture array/mip/sample count.
    if (pInput->getArraySize() != 1 || pInput->getMipCount() != 1 || pInput->getSampleCount() != 1)
    {
        logError("ComputeParallelReduction::execute() - Input texture is unsupported. Aborting.");
        return false;
    }

    // Check texture format. Float, unorm and snorm formats are all reduced as floating-point.
    uint32_t formatType = FORMAT_TYPE_UNKNOWN;
    switch (getFormatType(pInput->getFormat()))
    {
    case FormatType::Float:
    case FormatType::Unorm:
    case FormatType::Snorm:
        formatType = FORMAT_TYPE_FLOAT;
        break;
    case FormatType::Sint:
        formatType = FORMAT_TYPE_SINT;
        break;
    case FormatType::Uint:
        formatType = FORMAT_TYPE_UINT;
        break;
    default:
        logError("ComputeParallelReduction::execute() - Input texture format unsupported. Aborting.");
        return false;
    }

    // Check that reduction type T is compatible with the resource format.
    // T::value_type is a dependent name, so it needs 'typename' to be valid standard C++.
    using ValueType = typename T::value_type;
    const bool isFloat = std::is_floating_point<ValueType>::value;
    const bool isSignedInt = std::is_integral<ValueType>::value && std::is_signed<ValueType>::value;
    const bool isUnsignedInt = std::is_integral<ValueType>::value && std::is_unsigned<ValueType>::value;
    if (sizeof(ValueType) != 4 ||     // The shader is written for 32-bit types
        (formatType == FORMAT_TYPE_FLOAT && !isFloat) ||
        (formatType == FORMAT_TYPE_SINT && !isSignedInt) ||
        (formatType == FORMAT_TYPE_UINT && !isUnsignedInt))
    {
        logError("ComputeParallelReduction::execute() - Template type T is not compatible with resource format. Aborting.");
        return false;
    }

    // Translate the operation into the shader's reduction type and the number of
    // 16-byte elements that make up one result.
    uint32_t reductionType = REDUCTION_TYPE_UNKNOWN;
    uint32_t elementSize = 0;
    switch (operation)
    {
    case Type::Sum:
        reductionType = REDUCTION_TYPE_SUM;
        elementSize = 1;
        break;
    case Type::MinMax:
        reductionType = REDUCTION_TYPE_MINMAX;
        elementSize = 2;
        break;
    default:
        logError("ComputeParallelReduction::execute() - Unknown reduction type. Aborting.");
        return false;
    }
    assert(elementSize > 0);
    const size_t resultSize = static_cast<size_t>(elementSize) * 16;

    // Validate the destination buffer up front so no GPU work is issued for a call that
    // cannot succeed. The comparison is written to avoid unsigned wrap-around of
    // resultOffset + resultSize.
    if (pResultBuffer)
    {
        const uint64_t bufferSize = pResultBuffer->getSize();
        if (resultOffset > bufferSize || resultSize > bufferSize - resultOffset)
        {
            logError("ComputeParallelReduction::execute() - Results buffer is too small. Aborting.");
            return false;
        }
    }

    // Allocate intermediate buffers if needed.
    const uint2 resolution = uint2(pInput->getWidth(), pInput->getHeight());
    assert(resolution.x > 0 && resolution.y > 0);
    const uint2 numTiles = div_round_up(resolution, uint2(mpInitialProgram->getReflector()->getThreadGroupSize()));
    allocate(numTiles.x * numTiles.y, elementSize);
    assert(mpBuffers[0]);
    assert(mpBuffers[1]);

    // Configure program.
    const uint32_t channelCount = getFormatChannelCount(pInput->getFormat());
    assert(channelCount >= 1 && channelCount <= 4);

    Program::DefineList defines;
    defines.add("REDUCTION_TYPE", std::to_string(reductionType));
    defines.add("FORMAT_CHANNELS", std::to_string(channelCount));
    defines.add("FORMAT_TYPE", std::to_string(formatType));

    mpInitialProgram->addDefines(defines);
    mpFinalProgram->addDefines(defines);

    // Initial pass: Reduction over tiles of pixels in input texture.
    mpVars["PerFrameCB"]["gResolution"] = resolution;
    mpVars["PerFrameCB"]["gNumTiles"] = numTiles;
    mpVars["gInput"] = pInput;
    mpVars->setBuffer("gResult", mpBuffers[0]);

    mpState->setProgram(mpInitialProgram);
    const uint3 numInitialGroups = div_round_up(uint3(resolution.x, resolution.y, 1), mpInitialProgram->getReflector()->getThreadGroupSize());
    pRenderContext->dispatch(mpState.get(), mpVars.get(), numInitialGroups);

    // Final pass(es): Reduction by a factor N for each pass.
    // The two intermediate buffers alternate roles: each pass reads the buffer written by
    // the previous pass and writes the other one. inputsBufferIndex always names the
    // buffer holding the most recent output.
    uint elems = numTiles.x * numTiles.y;
    uint inputsBufferIndex = 0;
    while (elems > 1)
    {
        mpVars["PerFrameCB"]["gElems"] = elems;
        mpVars->setBuffer("gInputBuffer", mpBuffers[inputsBufferIndex]);
        mpVars->setBuffer("gResult", mpBuffers[1 - inputsBufferIndex]);

        mpState->setProgram(mpFinalProgram);
        const uint32_t numFinalGroups = div_round_up(elems, mpFinalProgram->getReflector()->getThreadGroupSize().x);
        pRenderContext->dispatch(mpState.get(), mpVars.get(), { numFinalGroups, 1, 1 });

        inputsBufferIndex = 1 - inputsBufferIndex;
        elems = numFinalGroups;
    }

    // Copy the result to GPU buffer (size was validated before dispatching).
    if (pResultBuffer)
    {
        pRenderContext->copyBufferRegion(pResultBuffer.get(), resultOffset, mpBuffers[inputsBufferIndex].get(), 0, resultSize);
    }

    // Read back the result to the CPU.
    if (pResult)
    {
        const T* pBuf = static_cast<const T*>(mpBuffers[inputsBufferIndex]->map(Buffer::MapType::Read));
        assert(pBuf);
        std::memcpy(pResult, pBuf, resultSize);
        mpBuffers[inputsBufferIndex]->unmap();
    }

    return true;
}





Add Discussion as Guest

Log in to DocsForge