diff --git a/include/clad/Differentiator/BuiltinDerivatives.h b/include/clad/Differentiator/BuiltinDerivatives.h
index 3467486cf..62296ab92 100644
--- a/include/clad/Differentiator/BuiltinDerivatives.h
+++ b/include/clad/Differentiator/BuiltinDerivatives.h
@@ -91,7 +91,7 @@ __global__ void atomicAdd_kernel(T* destPtr, T* srcPtr, size_t N) {
 }
 
 template <typename T>
-void cudaMemcpy_pullback(T* destPtr, T* srcPtr, size_t count,
+void cudaMemcpy_pullback(T* destPtr, const T* srcPtr, size_t count,
                          cudaMemcpyKind kind, T* d_destPtr, T* d_srcPtr,
                          size_t* d_count, cudaMemcpyKind* d_kind)
     __attribute__((host)) {
diff --git a/include/clad/Differentiator/CladUtils.h b/include/clad/Differentiator/CladUtils.h
index fa14e5629..071a0e516 100644
--- a/include/clad/Differentiator/CladUtils.h
+++ b/include/clad/Differentiator/CladUtils.h
@@ -128,6 +128,16 @@ namespace clad {
                                         clang::DeclContext* DC1,
                                         clang::DeclContext* DC2);
 
+    /// Finds the qualified name `name` in the declaration context `DC`.
+    ///
+    /// \param[in] name The name to look up.
+    /// \param[in] S The Sema instance used to perform the lookup.
+    /// \param[in] DC The context to search; defaults to the translation unit.
+    /// \returns The lookup result.
+    clang::LookupResult LookupQualifiedName(llvm::StringRef name,
+                                            clang::Sema& S,
+                                            clang::DeclContext* DC = nullptr);
+
     /// Finds namespace 'namespc` under the declaration context `DC` or the
     /// translation unit declaration if `DC` is null.
    ///
diff --git a/include/clad/Differentiator/Compatibility.h b/include/clad/Differentiator/Compatibility.h
index efd3d629c..b9901e20f 100644
--- a/include/clad/Differentiator/Compatibility.h
+++ b/include/clad/Differentiator/Compatibility.h
@@ -178,7 +178,7 @@ static inline IfStmt* IfStmt_Create(const ASTContext &Ctx,
 #endif
 }
 
-// Compatibility helper function for creation CallExpr.
+// Compatibility helper function for creation CallExpr and CUDAKernelCallExpr.
 // Clang 12 and above use one extra param.
 
 #if CLANG_VERSION_MAJOR < 12
@@ -188,6 +188,15 @@ static inline CallExpr* CallExpr_Create(const ASTContext &Ctx, Expr *Fn, ArrayRe
 {
    return CallExpr::Create(Ctx, Fn, Args, Ty, VK, RParenLoc, MinNumArgs, UsesADL);
 }
+
+static inline CUDAKernelCallExpr*
+CUDAKernelCallExpr_Create(const ASTContext& Ctx, Expr* Fn, CallExpr* Config,
+                          ArrayRef<Expr*> Args, QualType Ty, ExprValueKind VK,
+                          SourceLocation RParenLoc, unsigned MinNumArgs = 0,
+                          CallExpr::ADLCallKind UsesADL = CallExpr::NotADL) {
+  return CUDAKernelCallExpr::Create(Ctx, Fn, Config, Args, Ty, VK, RParenLoc,
+                                    MinNumArgs);
+}
 #elif CLANG_VERSION_MAJOR >= 12
 static inline CallExpr* CallExpr_Create(const ASTContext &Ctx, Expr *Fn, ArrayRef< Expr *> Args, QualType Ty, ExprValueKind VK, SourceLocation RParenLoc, FPOptionsOverride FPFeatures,
@@ -195,6 +204,16 @@ static inline CallExpr* CallExpr_Create(const ASTContext &Ctx, Expr *Fn, ArrayRe
 {
    return CallExpr::Create(Ctx, Fn, Args, Ty, VK, RParenLoc, FPFeatures, MinNumArgs, UsesADL);
 }
+
+static inline CUDAKernelCallExpr*
+CUDAKernelCallExpr_Create(const ASTContext& Ctx, Expr* Fn, CallExpr* Config,
+                          ArrayRef<Expr*> Args, QualType Ty, ExprValueKind VK,
+                          SourceLocation RParenLoc,
+                          FPOptionsOverride FPFeatures, unsigned MinNumArgs = 0,
+                          CallExpr::ADLCallKind UsesADL = CallExpr::NotADL) {
+  return CUDAKernelCallExpr::Create(Ctx, Fn, Config, Args, Ty, VK, RParenLoc,
+                                    FPFeatures, MinNumArgs);
+}
 #endif // Clang 12 and above use one extra param.
diff --git a/include/clad/Differentiator/Differentiator.h b/include/clad/Differentiator/Differentiator.h
index d5e51c4c0..a4d450aff 100644
--- a/include/clad/Differentiator/Differentiator.h
+++ b/include/clad/Differentiator/Differentiator.h
@@ -126,7 +126,7 @@ CUDA_HOST_DEVICE T push(tape<T>& to, ArgsT... val) {
 #if defined(__CUDACC__) && !defined(__CUDA_ARCH__)
     if (CUDAkernel) {
       constexpr size_t totalArgs = sizeof...(args) + sizeof...(Rest);
-      std::array<void*, totalArgs> argPtrs = {static_cast<void*>(&args)...,
+      std::array<void*, totalArgs> argPtrs = {(void*)(&args)...,
                                               static_cast<void*>(nullptr)...};
 
       void* null_param = nullptr;
diff --git a/include/clad/Differentiator/StmtClone.h b/include/clad/Differentiator/StmtClone.h
index 00c901cfa..83d91599c 100644
--- a/include/clad/Differentiator/StmtClone.h
+++ b/include/clad/Differentiator/StmtClone.h
@@ -104,6 +104,7 @@ namespace utils {
   DECLARE_CLONE_FN(ExtVectorElementExpr)
   DECLARE_CLONE_FN(UnaryExprOrTypeTraitExpr)
   DECLARE_CLONE_FN(CallExpr)
+  DECLARE_CLONE_FN(CUDAKernelCallExpr)
   DECLARE_CLONE_FN(ShuffleVectorExpr)
   DECLARE_CLONE_FN(ExprWithCleanups)
   DECLARE_CLONE_FN(CXXOperatorCallExpr)
diff --git a/lib/Differentiator/CladUtils.cpp b/lib/Differentiator/CladUtils.cpp
index 350eeea07..fc3bb0b02 100644
--- a/lib/Differentiator/CladUtils.cpp
+++ b/lib/Differentiator/CladUtils.cpp
@@ -251,6 +251,18 @@ namespace clad {
       return DC->getPrimaryContext();
     }
 
+    LookupResult LookupQualifiedName(llvm::StringRef name, clang::Sema& S,
+                                     clang::DeclContext* DC) {
+      ASTContext& C = S.getASTContext();
+      DeclarationName declName = &C.Idents.get(name);
+      LookupResult Result(S, declName, SourceLocation(),
+                          Sema::LookupOrdinaryName);
+      if (!DC)
+        DC = C.getTranslationUnitDecl();
+      S.LookupQualifiedName(Result, DC);
+      return Result;
+    }
+
     NamespaceDecl* LookupNSD(Sema& S, llvm::StringRef namespc, bool shouldExist,
                              DeclContext* DC) {
       ASTContext& C = S.getASTContext();
diff --git a/lib/Differentiator/ReverseModeVisitor.cpp b/lib/Differentiator/ReverseModeVisitor.cpp
index 026164498..19b902ce1 100644
--- a/lib/Differentiator/ReverseModeVisitor.cpp
+++ b/lib/Differentiator/ReverseModeVisitor.cpp
@@ -1911,7 +1911,73 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
       QualType dArgTy = getNonConstType(arg->getType(), m_Context, m_Sema);
       VarDecl* dArgDecl = BuildVarDecl(dArgTy, "_r", getZeroInit(dArgTy));
       PreCallStmts.push_back(BuildDeclStmt(dArgDecl));
-      CallArgDx.push_back(BuildDeclRef(dArgDecl));
+      DeclRefExpr* dArgRef = BuildDeclRef(dArgDecl);
+      if (isa<CUDAKernelCallExpr>(CE)) {
+        // Create variables to be allocated and initialized on the device, and
+        // then be passed to the kernel pullback.
+        //
+        // These need to be pointers because cudaMalloc expects a
+        // pointer-to-pointer as an arg.
+        // The memory addresses they point to are initialized to zero through
+        // cudaMemset.
+        // After the pullback call, their values will be copied back to the
+        // corresponding _r variables on the host and the device variables
+        // will be freed.
+        //
+        // Example of the generated code:
+        //
+        // double _r0 = 0;
+        // double* _r1 = nullptr;
+        // cudaMalloc(&_r1, sizeof(double));
+        // cudaMemset(_r1, 0, 8);
+        // kernel_pullback<<<...>>>(..., _r1);
+        // cudaMemcpy(&_r0, _r1, 8, cudaMemcpyDeviceToHost);
+        // cudaFree(_r1);
+
+        // Create a literal for the size of the type
+        Expr* sizeLiteral = ConstantFolder::synthesizeLiteral(
+            m_Context.IntTy, m_Context, m_Context.getTypeSize(dArgTy) / 8);
+        dArgTy = m_Context.getPointerType(dArgTy);
+        VarDecl* dArgDeclCUDA =
+            BuildVarDecl(dArgTy, "_r", getZeroInit(dArgTy));
+
+        // Create the cudaMemcpyDeviceToHost argument
+        LookupResult deviceToHostResult =
+            utils::LookupQualifiedName("cudaMemcpyDeviceToHost", m_Sema);
+        if (deviceToHostResult.empty()) {
+          diag(DiagnosticsEngine::Error, CE->getEndLoc(),
+               "Failed to create cudaMemcpy call; cudaMemcpyDeviceToHost not "
+               "found. Creating kernel pullback aborted.");
+          return StmtDiff(Clone(CE));
+        }
+        CXXScopeSpec SS;
+        Expr* deviceToHostExpr =
+            m_Sema
+                .BuildDeclarationNameExpr(SS, deviceToHostResult,
+                                          /*ADL=*/false)
+                .get();
+
+        // Add calls to cudaMalloc, cudaMemset, cudaMemcpy, and cudaFree
+        PreCallStmts.push_back(BuildDeclStmt(dArgDeclCUDA));
+        Expr* refOp = BuildOp(UO_AddrOf, BuildDeclRef(dArgDeclCUDA));
+        llvm::SmallVector<Expr*, 2> mallocArgs = {refOp, sizeLiteral};
+        PreCallStmts.push_back(GetFunctionCall("cudaMalloc", "", mallocArgs));
+        llvm::SmallVector<Expr*, 3> memsetArgs = {
+            BuildDeclRef(dArgDeclCUDA), getZeroInit(m_Context.IntTy),
+            sizeLiteral};
+        PreCallStmts.push_back(GetFunctionCall("cudaMemset", "", memsetArgs));
+        llvm::SmallVector<Expr*, 4> cudaMemcpyArgs = {
+            BuildOp(UO_AddrOf, dArgRef), BuildDeclRef(dArgDeclCUDA),
+            sizeLiteral, deviceToHostExpr};
+        PostCallStmts.push_back(
+            GetFunctionCall("cudaMemcpy", "", cudaMemcpyArgs));
+        llvm::SmallVector<Expr*, 1> freeArgs = {BuildDeclRef(dArgDeclCUDA)};
+        PostCallStmts.push_back(GetFunctionCall("cudaFree", "", freeArgs));
+
+        // Update arg to be passed to pullback call
+        dArgRef = BuildDeclRef(dArgDeclCUDA);
+      }
+      CallArgDx.push_back(dArgRef);
       // Visit using uninitialized reference.
       argDiff = Visit(arg, BuildDeclRef(dArgDecl));
     }
@@ -2040,7 +2106,8 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
         Expr* gradArgExpr = nullptr;
         QualType paramTy = FD->getParamDecl(idx)->getType();
         if (!argDerivative || utils::isArrayOrPointerType(paramTy) ||
-            isCladArrayType(argDerivative->getType()))
+            isCladArrayType(argDerivative->getType()) ||
+            isa<CUDAKernelCallExpr>(CE))
           gradArgExpr = argDerivative;
         else
           gradArgExpr =
@@ -2228,6 +2295,9 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
       m_ExternalSource->ActBeforeFinalizingVisitCallExpr(
           CE, OverloadedDerivedFn, DerivedCallArgs, CallArgDx, asGrad);
 
+    if (isa<CUDAKernelCallExpr>(CE))
+      return StmtDiff(Clone(CE));
+
     Expr* call = nullptr;
 
     QualType returnType = FD->getReturnType();
diff --git a/lib/Differentiator/StmtClone.cpp b/lib/Differentiator/StmtClone.cpp
index 15b32aebe..81413ba95 100644
--- a/lib/Differentiator/StmtClone.cpp
+++ b/lib/Differentiator/StmtClone.cpp
@@ -313,13 +313,30 @@ Stmt* StmtClone::VisitUnaryExprOrTypeTraitExpr(UnaryExprOrTypeTraitExpr* Node) {
 }
 
 Stmt* StmtClone::VisitCallExpr(CallExpr* Node) {
+  llvm::SmallVector<Expr*, 4> clonedArgs;
+  for (Expr* arg : Node->arguments())
+    clonedArgs.push_back(Clone(arg));
+
   CallExpr* result = clad_compat::CallExpr_Create(
-      Ctx, Clone(Node->getCallee()), llvm::ArrayRef<Expr*>(),
+      Ctx, Clone(Node->getCallee()), clonedArgs, CloneType(Node->getType()),
+      Node->getValueKind(),
+      Node->getRParenLoc() CLAD_COMPAT_CLANG8_CallExpr_ExtraParams);
+
+  // Copy Value and Type dependent
+  clad_compat::ExprSetDeps(result, Node);
+
+  return result;
+}
+
+Stmt* StmtClone::VisitCUDAKernelCallExpr(CUDAKernelCallExpr* Node) {
+  llvm::SmallVector<Expr*, 4> clonedArgs;
+  for (Expr* arg : Node->arguments())
+    clonedArgs.push_back(Clone(arg));
+
+  CUDAKernelCallExpr* result = clad_compat::CUDAKernelCallExpr_Create(
+      Ctx, Clone(Node->getCallee()), Clone(Node->getConfig()), clonedArgs,
       CloneType(Node->getType()), Node->getValueKind(),
       Node->getRParenLoc() CLAD_COMPAT_CLANG8_CallExpr_ExtraParams);
 
-  result->setNumArgsUnsafe(Node->getNumArgs());
-  for (unsigned i = 0, e = Node->getNumArgs(); i < e; ++i)
-    result->setArg(i, Clone(Node->getArg(i)));
   // Copy Value and Type dependent
   clad_compat::ExprSetDeps(result, Node);
@@ -352,11 +369,6 @@ Stmt* StmtClone::VisitCXXOperatorCallExpr(CXXOperatorCallExpr* Node) {
       Node->getFPFeatures() CLAD_COMPAT_CLANG11_CXXOperatorCallExpr_Create_ExtraParamsUse);
 
-  //### result->setNumArgs(Ctx, Node->getNumArgs());
-  result->setNumArgsUnsafe(Node->getNumArgs());
-  for (unsigned i = 0, e = Node->getNumArgs(); i < e; ++i)
-    result->setArg(i, Clone(Node->getArg(i)));
-
   // Copy Value and Type dependent
   clad_compat::ExprSetDeps(result, Node);
@@ -364,16 +376,15 @@ Stmt* StmtClone::VisitCXXOperatorCallExpr(CXXOperatorCallExpr* Node) {
 }
 
 Stmt* StmtClone::VisitCXXMemberCallExpr(CXXMemberCallExpr * Node) {
+  llvm::SmallVector<Expr*, 4> clonedArgs;
+  for (Expr* arg : Node->arguments())
+    clonedArgs.push_back(Clone(arg));
+
   CXXMemberCallExpr* result = clad_compat::CXXMemberCallExpr_Create(
-      Ctx, Clone(Node->getCallee()), {}, CloneType(Node->getType()),
+      Ctx, Clone(Node->getCallee()), clonedArgs, CloneType(Node->getType()),
       Node->getValueKind(), Node->getRParenLoc()
       /*FP*/ CLAD_COMPAT_CLANG12_CastExpr_GetFPO(Node));
 
-  // ### result->setNumArgs(Ctx, Node->getNumArgs());
-  result->setNumArgsUnsafe(Node->getNumArgs());
-
-  for (unsigned i = 0, e = Node->getNumArgs(); i < e; ++i)
-    result->setArg(i, Clone(Node->getArg(i)));
   // Copy Value and Type dependent
   clad_compat::ExprSetDeps(result, Node);
diff --git a/test/CUDA/GradientKernels.cu b/test/CUDA/GradientKernels.cu
index fcbae3b5b..328a1f50d 100644
--- a/test/CUDA/GradientKernels.cu
+++ b/test/CUDA/GradientKernels.cu
@@ -503,6 +503,71 @@ double fn_memory(double *out, double *in) {
 //CHECK-NEXT: cudaFree(_d_in_dev);
 //CHECK-NEXT:}
 
+void launch_add_kernel_4(int *out, int *in, const int N) {
+  int *in_dev = nullptr;
+  cudaMalloc(&in_dev, N * sizeof(int));
+  cudaMemcpy(in_dev, in, N * sizeof(int), cudaMemcpyHostToDevice);
+  int *out_dev = nullptr;
+  cudaMalloc(&out_dev, N * sizeof(int));
+  cudaMemcpy(out_dev, out, N * sizeof(int), cudaMemcpyHostToDevice);
+
+  add_kernel_4<<<1, 5>>>(out_dev, in_dev, N);
+
+  cudaMemcpy(out, out_dev, N * sizeof(int), cudaMemcpyDeviceToHost);
+  cudaFree(in_dev);
+  cudaFree(out_dev);
+}
+
+// CHECK: void launch_add_kernel_4_grad_0_1(int *out, int *in, const int N, int *_d_out, int *_d_in) {
+//CHECK-NEXT: int _d_N = 0;
+//CHECK-NEXT: int *_d_in_dev = nullptr;
+//CHECK-NEXT: int *in_dev = nullptr;
+//CHECK-NEXT: cudaMalloc(&_d_in_dev, N * sizeof(int));
+//CHECK-NEXT: cudaMemset(_d_in_dev, 0, N * sizeof(int));
+//CHECK-NEXT: cudaMalloc(&in_dev, N * sizeof(int));
+//CHECK-NEXT: cudaMemcpy(in_dev, in, N * sizeof(int), cudaMemcpyHostToDevice);
+//CHECK-NEXT: int *_d_out_dev = nullptr;
+//CHECK-NEXT: int *out_dev = nullptr;
+//CHECK-NEXT: cudaMalloc(&_d_out_dev, N * sizeof(int));
+//CHECK-NEXT: cudaMemset(_d_out_dev, 0, N * sizeof(int));
+//CHECK-NEXT: cudaMalloc(&out_dev, N * sizeof(int));
+//CHECK-NEXT: cudaMemcpy(out_dev, out, N * sizeof(int), cudaMemcpyHostToDevice);
+//CHECK-NEXT: add_kernel_4<<<1, 5>>>(out_dev, in_dev, N);
+//CHECK-NEXT: cudaMemcpy(out, out_dev, N * sizeof(int), cudaMemcpyDeviceToHost);
+//CHECK-NEXT: {
+//CHECK-NEXT: unsigned long _r6 = 0UL;
+//CHECK-NEXT: cudaMemcpyKind _r7 = static_cast<cudaMemcpyKind>(0U);
+//CHECK-NEXT: clad::custom_derivatives::cudaMemcpy_pullback(out, out_dev, N * sizeof(int), cudaMemcpyDeviceToHost, _d_out, _d_out_dev, &_r6, &_r7);
+//CHECK-NEXT: _d_N += _r6 * sizeof(int);
+//CHECK-NEXT: }
+//CHECK-NEXT: {
+//CHECK-NEXT: int _r4 = 0;
+//CHECK-NEXT: int *_r5 = nullptr;
+//CHECK-NEXT: cudaMalloc(&_r5, 4);
+//CHECK-NEXT: cudaMemset(_r5, 0, 4);
+//CHECK-NEXT: add_kernel_4_pullback<<<1, 5>>>(out_dev, in_dev, N, _d_out_dev, _d_in_dev, _r5);
+//CHECK-NEXT: cudaMemcpy(&_r4, _r5, 4, cudaMemcpyDeviceToHost);
+//CHECK-NEXT: cudaFree(_r5);
+//CHECK-NEXT: _d_N += _r4;
+//CHECK-NEXT: }
+//CHECK-NEXT: {
+//CHECK-NEXT: unsigned long _r2 = 0UL;
+//CHECK-NEXT: cudaMemcpyKind _r3 = static_cast<cudaMemcpyKind>(0U);
+//CHECK-NEXT: clad::custom_derivatives::cudaMemcpy_pullback(out_dev, out, N * sizeof(int), cudaMemcpyHostToDevice, _d_out_dev, _d_out, &_r2, &_r3);
+//CHECK-NEXT: _d_N += _r2 * sizeof(int);
+//CHECK-NEXT: }
+//CHECK-NEXT: {
+//CHECK-NEXT: unsigned long _r0 = 0UL;
+//CHECK-NEXT: cudaMemcpyKind _r1 = static_cast<cudaMemcpyKind>(0U);
+//CHECK-NEXT: clad::custom_derivatives::cudaMemcpy_pullback(in_dev, in, N * sizeof(int), cudaMemcpyHostToDevice, _d_in_dev, _d_in, &_r0, &_r1);
+//CHECK-NEXT: _d_N += _r0 * sizeof(int);
+//CHECK-NEXT: }
+//CHECK-NEXT: cudaFree(in_dev);
+//CHECK-NEXT: cudaFree(_d_in_dev);
+//CHECK-NEXT: cudaFree(out_dev);
+//CHECK-NEXT: cudaFree(_d_out_dev);
+//CHECK-NEXT:}
+
 // CHECK: __attribute__((device)) void device_fn_pullback_1(double in, double val, double _d_y, double *_d_in, double *_d_val) {
 //CHECK-NEXT: {
 //CHECK-NEXT: *_d_in += _d_y;
@@ -548,6 +613,66 @@ double fn_memory(double *out, double *in) {
 //CHECK-NEXT: }
 //CHECK-NEXT:}
 
+// CHECK: __attribute__((global)) void add_kernel_4_pullback(int *out, int *in, int N, int *_d_out, int *_d_in, int *_d_N) {
+//CHECK-NEXT: bool _cond0;
+//CHECK-NEXT: int _d_sum = 0;
+//CHECK-NEXT: int sum = 0;
+//CHECK-NEXT: unsigned long _t2;
+//CHECK-NEXT: int _d_i = 0;
+//CHECK-NEXT: int i = 0;
+//CHECK-NEXT: clad::tape<int> _t3 = {};
+//CHECK-NEXT: clad::tape<int> _t4 = {};
+//CHECK-NEXT: int _t5;
+//CHECK-NEXT: unsigned int _t1 = blockIdx.x;
+//CHECK-NEXT: unsigned int _t0 = blockDim.x;
+//CHECK-NEXT: int _d_index = 0;
+//CHECK-NEXT: int index0 = threadIdx.x + _t1 * _t0;
+//CHECK-NEXT: {
+//CHECK-NEXT: _cond0 = index0 < N;
+//CHECK-NEXT: if (_cond0) {
+//CHECK-NEXT: sum = 0;
+//CHECK-NEXT: _t2 = 0UL;
+//CHECK-NEXT: for (i = index0; ; clad::push(_t3, i) , (i += warpSize)) {
+//CHECK-NEXT: {
+//CHECK-NEXT: if (!(i < N))
+//CHECK-NEXT: break;
+//CHECK-NEXT: }
+//CHECK-NEXT: _t2++;
+//CHECK-NEXT: clad::push(_t4, sum);
+//CHECK-NEXT: sum += in[i];
+//CHECK-NEXT: }
+//CHECK-NEXT: _t5 = out[index0];
+//CHECK-NEXT: out[index0] = sum;
+//CHECK-NEXT: }
+//CHECK-NEXT: }
+//CHECK-NEXT: if (_cond0) {
+//CHECK-NEXT: {
+//CHECK-NEXT: out[index0] = _t5;
+//CHECK-NEXT: int _r_d2 = _d_out[index0];
+//CHECK-NEXT: _d_out[index0] = 0;
+//CHECK-NEXT: _d_sum += _r_d2;
+//CHECK-NEXT: }
+//CHECK-NEXT: {
+//CHECK-NEXT: for (;; _t2--) {
+//CHECK-NEXT: {
+//CHECK-NEXT: if (!_t2)
+//CHECK-NEXT: break;
+//CHECK-NEXT: }
+//CHECK-NEXT: {
+//CHECK-NEXT: i = clad::pop(_t3);
+//CHECK-NEXT: int _r_d0 = _d_i;
+//CHECK-NEXT: }
+//CHECK-NEXT: {
+//CHECK-NEXT: sum = clad::pop(_t4);
+//CHECK-NEXT: int _r_d1 = _d_sum;
+//CHECK-NEXT: atomicAdd(&_d_in[i], _r_d1);
+//CHECK-NEXT: }
+//CHECK-NEXT: }
+//CHECK-NEXT: _d_index += _d_i;
+//CHECK-NEXT: }
+//CHECK-NEXT: }
+//CHECK-NEXT:}
+
 // CHECK: __attribute__((device)) void device_fn_4_pullback_0_1_3(double *in, double val, double _d_y, double *_d_in, double *_d_val) {
 //CHECK-NEXT: unsigned int _t1 = blockIdx.x;
 //CHECK-NEXT: unsigned int _t0 = blockDim.x;
@@ -816,9 +941,23 @@ int main(void) {
   test_memory.execute(dummy_out_double, fives, d_out_double, zeros);
   printf("%0.2f, %0.2f, %0.2f\n", zeros[0], zeros[1], zeros[2]); // CHECK-EXEC: 60.00, 0.00, 0.00
 
+  auto launch_kernel_4_test = clad::gradient(launch_add_kernel_4, "out, in");
+  int *out_res = (int*)malloc(10 * sizeof(int));
+  int *in_res = (int*)calloc(10, sizeof(int));
+  int *zeros_int = (int*)calloc(10, sizeof(int));
+  int *fives_int = (int*)malloc(10 * sizeof(int));
+  for(int i = 0; i < 10; i++) { fives_int[i] = 5; out_res[i] = 5; }
+
+  launch_kernel_4_test.execute(zeros_int, fives_int, 10, out_res, in_res);
+  printf("%d, %d, %d\n", in_res[0], in_res[1], in_res[2]); // CHECK-EXEC: 5, 5, 5
+
   free(res);
   free(fives);
   free(zeros);
+  free(fives_int);
+  free(zeros_int);
+  free(out_res);
+  free(in_res);
   cudaFree(d_out_double);
   cudaFree(d_in_double);
   cudaFree(val);