/** 
 * @file    kCudaUtils.x.h
 *
 * @internal
 * Copyright (C) 2019-2022 by LMI Technologies Inc.  All rights reserved.
 */
#ifndef K_FIRESYNC_CUDA_UTILS_X_H
#define K_FIRESYNC_CUDA_UTILS_X_H

#include <kFireSync/Cuda/kCudaDevice.h>
#include <kFireSync/Cuda/kCudaStream.h>

//Classification of a memory address, as returned by xkCudaUtils_MemoryType. 
//Intended to hold one of the xkCUDA_MEMORY_TYPE_* constants defined below.
typedef k32s xkCudaMemoryType; 

#define xkCUDA_MEMORY_TYPE_UNKNOWN          (0)         //Unrecognized, e.g., unregistered host address
#define xkCUDA_MEMORY_TYPE_PINNED           (1)         //Pinned host memory address
#define xkCUDA_MEMORY_TYPE_DEVICE           (2)         //Device memory address
#define xkCUDA_MEMORY_TYPE_MANAGED          (3)         //Managed memory address


/* 
*  Forward declarations
*
*  Prototypes for kCudaUtils functions that are called by the inline and template helpers 
*  defined later in this header; the functions themselves are not defined in this file. 
*  For example, kCudaUtils_AttachToStream and kCudaUtils_Copy are used by xkCudaUtils_AllocValue 
*  and xkCudaUtils_CloneValue, while kCudaUtils_IsArrayDeviceAccessible and kCudaUtils_AssignArray 
*  are used by kCudaUtils_ExportDeviceArray.
*/

kInlineFx(kStatus) kCudaUtils_AttachToStream(kCudaStream stream, void* pointer, kCudaSync sync);

kFsFx(kStatus) kCudaUtils_Copy(kCudaStream stream, void* dest, const void* src, kSize size, kCudaSync sync);

kFsFx(kStatus) kCudaUtils_CloneObject(kCudaStream stream, kObject* object, kObject source, kAlloc objectAllocator, kAlloc valueAllocator, kCudaSync sync);

kFsFx(kStatus) kCudaUtils_AssignArray(kCudaStream stream, kArrayProvider destination, kArrayProvider source, kCudaSync sync);

kInlineFx(kBool) kCudaUtils_IsArrayDeviceAccessible(kArrayProvider array, kBool isExclusive);

kFsFx(kStatus) kCudaUtils_ConstructArrayFromTemplate(kCudaStream stream, kArrayProvider* outputArray, kArrayProvider templateArray, kAlloc objectAlloc, kAlloc valueAlloc, kCudaSync sync);

kFsFx(kStatus) kCudaUtils_PrepareDeviceArray(kCudaStream stream, kArrayProvider* outputArray, kArrayProvider inputArray, kBool copyData, kAlloc objectAlloc, kAlloc valueAlloc, kBool inputExclusive, kCudaSync sync);


/* 
*  Private methods
*/

//Classifies the given address and returns the result as an xkCudaMemoryType. 
//The inline helpers below compare the result against the xkCUDA_MEMORY_TYPE_* constants.
kFsFx(xkCudaMemoryType) xkCudaUtils_MemoryType(const void* pointer);

//Reports whether xkCudaUtils_MemoryType classifies the address as xkCUDA_MEMORY_TYPE_DEVICE.
//
//@param   pointer     Address to be classified.
//@return              True if the address is classified as device memory; false otherwise.
kInlineFx(kBool) xkCudaUtils_IsDeviceMemory(const void* pointer)
{
    const xkCudaMemoryType memoryType = xkCudaUtils_MemoryType(pointer);

    return (memoryType == xkCUDA_MEMORY_TYPE_DEVICE);
}

//Reports whether xkCudaUtils_MemoryType classifies the address as xkCUDA_MEMORY_TYPE_MANAGED.
//
//@param   pointer     Address to be classified.
//@return              True if the address is classified as managed memory; false otherwise.
kInlineFx(kBool) xkCudaUtils_IsManagedMemory(const void* pointer)
{
    const xkCudaMemoryType memoryType = xkCudaUtils_MemoryType(pointer);

    return (memoryType == xkCUDA_MEMORY_TYPE_MANAGED);
}

//Allocates storage for a single value of type T and attaches it to a stream.
//
//Requests sizeof(T) bytes from the allocator via kAlloc_Get, then passes the new block to 
//kCudaUtils_AttachToStream. Only storage is obtained here; no constructor for T is invoked. 
//The output pointer is written only after both steps succeed. If either step fails, the 
//block is handed back to the allocator via kAlloc_Free and the failure status is passed to 
//kEndCatch. 
//
//@param   stream      Stream forwarded to kCudaUtils_AttachToStream.
//@param   allocator   Allocator used to obtain (and, on failure, release) the storage.
//@param   pointer     Receives the address of the allocated storage on success.
//@param   sync        Synchronization mode forwarded to kCudaUtils_AttachToStream.
//@return              Operation status.
template<class T>
kInlineFx(kStatus) xkCudaUtils_AllocValue(kCudaStream stream, kAlloc allocator, T** pointer, kCudaSync sync)
{
    T* allocation = kNULL;
    kStatus failure;

    kTry
    {
        kTest(kAlloc_Get(allocator, sizeof(T), &allocation));
        kTest(kCudaUtils_AttachToStream(stream, allocation, sync));

        //publish the result only once every step has succeeded
        *pointer = allocation;
    }
    kCatch(&failure)
    {
        //allocation is still kNULL here if kAlloc_Get itself failed
        kAlloc_Free(allocator, allocation);
        kEndCatch(failure);
    }

    return kOK;
}

//Allocates storage for a single value of type T and fills it with a copy of an existing value.
//
//Storage is obtained with xkCudaUtils_AllocValue, after which sizeof(T) bytes are transferred 
//from the source by kCudaUtils_Copy. The output pointer is written only after both steps 
//succeed. If either step fails, the new storage is handed back to the allocator via kAlloc_Free 
//and the failure status is passed to kEndCatch. 
//
//@param   stream      Stream forwarded to the allocation and copy helpers.
//@param   destAlloc   Allocator used to obtain (and, on failure, release) the new storage.
//@param   dest        Receives the address of the cloned value on success.
//@param   source      Value to be copied.
//@param   sync        Synchronization mode forwarded to the allocation and copy helpers.
//@return              Operation status.
template<class T>
kInlineFx(kStatus) xkCudaUtils_CloneValue(kCudaStream stream, kAlloc destAlloc, T** dest, const T* source, kCudaSync sync)
{
    T* clone = kNULL;
    kStatus failure;

    kTry
    {
        kTest(xkCudaUtils_AllocValue(stream, destAlloc, &clone, sync));
        kTest(kCudaUtils_Copy(stream, clone, source, sizeof(T), sync));

        //publish the result only once every step has succeeded
        *dest = clone;
    }
    kCatch(&failure)
    {
        //clone is still kNULL here if the allocation step failed
        kAlloc_Free(destAlloc, clone);
        kEndCatch(failure);
    }

    return kOK;
}

/* 
* Deprecated (Stage 1): not recommended for further use, but not yet announced via kDeprecate
*/

//[Deprecated] No direct replacement; discuss requirements with FSS to determine if an additional method is needed to replace this call.
//
//Assigns the content of a device-accessible source array to another array, using 
//kCudaUtils_AssignArray with the kCUDA_SYNC_WAIT_HOST synchronization mode. 
//
//The source must satisfy kCudaUtils_IsArrayDeviceAccessible(source, kTRUE); otherwise the 
//call is rejected by kCheckArgs before any copy is attempted. 
//
//@param   stream      Stream on which the assignment is performed; forwarded to kCudaUtils_AssignArray.
//@param   array       Destination array.
//@param   source      Source array; must be device-accessible.
//@return              Operation status.
kInlineFx(kStatus) kCudaUtils_ExportDeviceArray(kCudaStream stream, kArrayProvider array, kArrayProvider source)
{
    kCheckArgs(kCudaUtils_IsArrayDeviceAccessible(source, kTRUE));

    //forward the caller's stream (previously ignored in favour of kNULL), so that the copy is 
    //issued on the same stream as the work that produced the source; callers that pass kNULL 
    //get the same behaviour as before
    kCheck(kCudaUtils_AssignArray(stream, array, source, kCUDA_SYNC_WAIT_HOST));

    return kOK;
}

//[Deprecated] Refactor code to use kCudaStream_Synchronize; note, kCudaStream_Synchronize does not accept a null stream.
//
//Resolves the given stream with xkCudaStream_Fallback and synchronizes the resulting stream 
//via kCudaStream_Synchronize. 
//
//@param   cudaStream  Stream to synchronize; defaults to kNULL, which is passed to xkCudaStream_Fallback.
//@return              Status reported by kCudaStream_Synchronize.
kInlineFx(kStatus) kCudaUtils_Synchronize(kCudaStream cudaStream = kNULL)
{
    const kStatus outcome = kCudaStream_Synchronize(xkCudaStream_Fallback(cudaStream));

    return outcome;
}

//Clones a single value of type T into newly allocated storage, via xkCudaUtils_CloneValue.
//
//@param   stream      Stream forwarded to xkCudaUtils_CloneValue.
//@param   dest        Receives the address of the cloned value on success.
//@param   source      Value to be cloned.
//@param   destAlloc   Allocator for the clone; if kNULL, kCudaDeviceAlloc_Instance() is used.
//@param   sync        Synchronization mode forwarded to xkCudaUtils_CloneValue.
//@return              Operation status.
template<class T>
kInlineFx(kStatus) kCudaUtils_PrepareDeviceValue(kCudaStream stream, T** dest, T* source, kAlloc destAlloc = kNULL, kCudaSync sync = kCUDA_SYNC_WAIT)
{
    kAlloc cloneAlloc = destAlloc;

    if (cloneAlloc == kNULL)
    {
        //no allocator supplied; fall back to the device allocator instance
        cloneAlloc = kCudaDeviceAlloc_Instance();
    }

    return xkCudaUtils_CloneValue(stream, cloneAlloc, dest, source, sync);
}

//Allocates storage for a single value of type T, via xkCudaUtils_AllocValue.
//
//@param   stream      Stream forwarded to xkCudaUtils_AllocValue.
//@param   pointer     Receives the address of the allocated storage on success.
//@param   allocator   Allocator for the storage; if kNULL, kCudaDeviceAlloc_Instance() is used.
//@param   sync        Synchronization mode forwarded to xkCudaUtils_AllocValue.
//@return              Operation status.
template<class T>
kInlineFx(kStatus) kCudaUtils_ConstructValueFromType(kCudaStream stream, T** pointer, kAlloc allocator = kNULL, kCudaSync sync = kCUDA_SYNC_WAIT)
{
    kAlloc valueAlloc = allocator;

    if (valueAlloc == kNULL)
    {
        //no allocator supplied; fall back to the device allocator instance
        valueAlloc = kCudaDeviceAlloc_Instance();
    }

    return xkCudaUtils_AllocValue(stream, valueAlloc, pointer, sync);
}


//Copies a single value of type T from source to dest, using kCudaUtils_Copy with the 
//kCUDA_SYNC_WAIT_HOST synchronization mode. 
//
//@param   stream      Stream forwarded to kCudaUtils_Copy.
//@param   dest        Destination for the copied value.
//@param   source      Value to be copied.
//@return              Status reported by kCudaUtils_Copy.
template<class T>
kInlineFx(kStatus) kCudaUtils_ExportDeviceValue(kCudaStream stream, T* dest, T* source)
{
    const kSize valueSize = sizeof(T);

    return kCudaUtils_Copy(stream, dest, source, valueSize, kCUDA_SYNC_WAIT_HOST);
}

//[Deprecated] Refactor to use kCudaUtils_FreeValueRef or kAlloc_FreeRef.
//
//Note, this method has been deprecated because it encourages a suboptimal coding style. In general, it 
//will lead to safer code if a reference to the allocator associated with a memory allocation is cached
//at allocation time and then used when freeing the memory. This will prevent potential bugs that could 
//arise if the allocator instance is changed without updating the corresponding deallocation call.  
//
//Releases the memory referenced by the given pointer, via kAlloc_FreeRef.
//
//@param   pointer     Reference to the memory to be freed; forwarded to kAlloc_FreeRef.
//@param   allocator   Allocator used to free the memory; if kNULL, kCudaDeviceAlloc_Instance() is used.
//@return              Status reported by kAlloc_FreeRef.
template<class T>
kInlineFx(kStatus) kCudaUtils_FreeDeviceRef(T** pointer, kAlloc allocator = kNULL)
{
    kAlloc owningAlloc = allocator;

    if (owningAlloc == kNULL)
    {
        //no allocator supplied; fall back to the device allocator instance
        owningAlloc = kCudaDeviceAlloc_Instance();
    }

    return kAlloc_FreeRef(owningAlloc, pointer);
}

#endif
