diff --git a/include/clang/Basic/Builtins.def b/include/clang/Basic/Builtins.def index e65f240edf19..bd5bc5aacd63 100644 --- a/include/clang/Basic/Builtins.def +++ b/include/clang/Basic/Builtins.def @@ -872,6 +872,8 @@ BUILTIN(cm_scoreboard_bti, "Ui","ni") BUILTIN(cm_scoreboard_deltas, "Q16c", "ni") BUILTIN(cm_scoreboard_depcnt, "UiUi", "ni") BUILTIN(cm_sbarrier, "vUi", "ni") +BUILTIN(cm_nbarrier_init, "vUc", "ni") +BUILTIN(cm_nbarrier_wait, "vUc", "ni") BUILTIN(cm_yield, "v", "ni") BUILTIN(cm_print_buffer, "LUi", "ni") BUILTIN(cm_print_format_index, "icC*", "ni") diff --git a/include/clang/Basic/TargetOptions.h b/include/clang/Basic/TargetOptions.h index 0ae1db9d67c2..b3109ff0605a 100644 --- a/include/clang/Basic/TargetOptions.h +++ b/include/clang/Basic/TargetOptions.h @@ -39,6 +39,9 @@ class TargetOptions { /// If given, the name of the target CPU to generate code for. std::string CPU; + /// If given, the name of the stepping to generate code for. + std::string Stepping; + // If given, revision id to pass to ocloc int RevId; diff --git a/include/clang/Driver/CMOptions.td b/include/clang/Driver/CMOptions.td index b5765b41821d..42aa3f8ef4e2 100644 --- a/include/clang/Driver/CMOptions.td +++ b/include/clang/Driver/CMOptions.td @@ -238,6 +238,13 @@ def dump_stats_cwd : Flag<["-"], "dump-stats">, Group, HelpText<"Alias for -dump-stats=cwd">, Alias, AliasArgs<["cwd"]>, Flags<[CMOption, CC1Option, CC1AsOption]>; +def Qxcm_doubleGRF : CMIgnoredFlag<"Qxcm_doubleGRF">, + HelpText<"increase total GRF number to 256">; + +def Qxcm_stepping: CMJoined<"Qxcm_stepping">, + HelpText<"Generate code taking platform stepping into account. " + "Valid only for some platforms">; + def Qxcm_revid: CMCC1Joined<"Qxcm_revid=">, HelpText<"Pass specified revision ID to libocloc " "Valid only for some platforms">; diff --git a/lib/Basic/Targets/GenX.cpp b/lib/Basic/Targets/GenX.cpp index 98dc00d4c242..f4421337bca6 100644 --- a/lib/Basic/Targets/GenX.cpp +++ b/lib/Basic/Targets/GenX.cpp @@ -61,6 +61,9 @@ bool GenXTargetInfo::handleTargetFeatures(std::vector &Features, .Case("RKL", false) .Case("TGLLP", false) .Case("DG1", false) + .Case("ADLS", false) + .Case("ADLP", false) + .Case("DG2", false) .Default(true); NativeDoubleSupport = llvm::StringSwitch(CPU) @@ -68,6 +71,9 @@ bool GenXTargetInfo::handleTargetFeatures(std::vector &Features, .Case("TGLLP", false) .Case("RKL", false) .Case("DG1", false) + .Case("ADLS", false) + .Case("ADLP", false) + .Case("DG2", false) .Default(true); // OCL runtime specific headers support @@ -92,6 +98,11 @@ bool GenXTargetInfo::setCPU(const std::string &Name) { .Case("RKL", true) .Case("DG1", true) .Case("XEHP_SDV", true) + .Case("ADLP", true) + .Case("ADLS", true) + .Case("DG2", true) + .Case("PVC", true) + .Case("PVCXT", true) .Default(false); if (CPUKnown) diff --git a/lib/CodeGen/CGBuiltin.cpp b/lib/CodeGen/CGBuiltin.cpp index 3bd738b460ba..af5589a86194 100644 --- a/lib/CodeGen/CGBuiltin.cpp +++ b/lib/CodeGen/CGBuiltin.cpp @@ -2295,6 +2295,8 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BIcm_scoreboard_deltas: case Builtin::BIcm_scoreboard_depcnt: case Builtin::BIcm_sbarrier: + case Builtin::BIcm_nbarrier_init: + case Builtin::BIcm_nbarrier_wait: case Builtin::BIcm_yield : case Builtin::BIcm_print_buffer : case Builtin::BIcm_print_format_index: diff --git a/lib/CodeGen/CGCM.cpp b/lib/CodeGen/CGCM.cpp index c3dac5c09b4b..0d78a9b02fad 100644 --- a/lib/CodeGen/CGCM.cpp +++ b/lib/CodeGen/CGCM.cpp @@ -1423,7 +1423,9 @@ void 
CGCMRuntime::EmitCMKernelMetadata(const FunctionDecl *FD, getMD(llvm::ConstantInt::getNullValue(I32Ty)), getMD(llvm::ConstantInt::getNullValue(I32Ty)), // placeholder for arg offsets IOKinds, - ArgDescs + ArgDescs, + getMD(llvm::ConstantInt::getNullValue(I32Ty)), + getMD(llvm::ConstantInt::getNullValue(I32Ty)) }; // Add this kernel to the root. diff --git a/lib/CodeGen/CGCM.h b/lib/CodeGen/CGCM.h index aa367a44f4e9..4552c0aa0205 100644 --- a/lib/CodeGen/CGCM.h +++ b/lib/CodeGen/CGCM.h @@ -634,12 +634,31 @@ class CGCMRuntime { /// \brief Postprocess cm_bfn implementation builtin llvm::Value *HandleBuiltinBFNImpl(CMCallInfo &CallInfo, CMBuiltinKind Kind); + + /// \brief Postprocess cm_srnd implementation builtin + llvm::Value *HandleBuiltinSRNDImpl(CMCallInfo &CallInfo, CMBuiltinKind Kind); /// \brief Postprocess dpas implementation builtin. llvm::Value *HandleBuiltinDPASImpl(CMCallInfo &CallInfo, CMBuiltinKind Kind); /// \brief Postprocess dpas2 implementation builtin. llvm::Value *HandleBuiltinDPAS2Impl(CMCallInfo &CallInfo, CMBuiltinKind Kind); /// \brief Postprocess builtin cm_bf_cvt. llvm::Value *HandleBuiltinBFCVTImpl(CMCallInfo &CallInfo, CMBuiltinKind Kind); + /// \brief Postprocess builtin cm_qf_cvt. + llvm::Value *HandleBuiltinQFCVTImpl(CMCallInfo &CallInfo, CMBuiltinKind Kind); + /// \brief Postprocess builtin cm_tf32_cvt. + llvm::Value *HandleBuiltinTF32CVTImpl(CMCallInfo &CallInfo, CMBuiltinKind Kind); + /// \brief Postprocess block 2d builtins load/store/prefetch. + llvm::Value *HandleBuiltinLSC2dImpl(CMCallInfo &CallInfo, CMBuiltinKind Kind); + + /// \brief Postprocess wave 2.5 load/store + llvm::Value *HandleBuiltinLSCWaveImpl(CMCallInfo &CallInfo, CMBuiltinKind Kind); + + /// \brief Postprocess BTI-based load/store/prefetch + llvm::Value *HandleBuiltinLSCImpl(CMCallInfo &CallInfo, CMBuiltinKind Kind); + + /// \brief Postprocess cm_lsc_fence implementation builtins. + llvm::Value *HandleBuiltinLscFenceImpl(CMCallInfo &CallInfo, + CMBuiltinKind Kind); /// \brief Emit 1D/2D select expression. LValue EmitSelect(CodeGenFunction &CGF, const CMSelectExpr *E, LValue Base); @@ -712,6 +731,8 @@ class CGCMRuntime { /// Emit cm_slm_free builtin call. llvm::Value *EmitBuiltinSLMFree(CodeGenFunction &CGF, const CallExpr *E); + /// Emit cm_nbarrier_init builtin call. + void EmitBuiltinNBarrierInit(CodeGenFunction &CGF, const CallExpr *E); /// \brief Emit one of scatter_scaled, scatter4_scaled. 
llvm::CallInst *EmitScatterScaled(CodeGenFunction &CGF, unsigned IntrinsicID, diff --git a/lib/CodeGen/CGCMBuiltin.cpp b/lib/CodeGen/CGCMBuiltin.cpp index 77853911031a..d49fe62ed1c2 100644 --- a/lib/CodeGen/CGCMBuiltin.cpp +++ b/lib/CodeGen/CGCMBuiltin.cpp @@ -233,6 +233,40 @@ CMBuiltinKind CGCMRuntime::getCMBuiltinKind(StringRef MangledName) const { .StartsWith("__cm_intrinsic_impl_dpas_nosrc0", CMBK_cm_dpas_nosrc0_impl) .StartsWith("__cm_intrinsic_impl_dpas", CMBK_cm_dpas2_impl) .StartsWith("__cm_intrinsic_impl_bf_cvt", CMBK_cm_bf_cvt_impl) + .StartsWith("__cm_intrinsic_impl_tf32_cvt", CMBK_cm_tf32_cvt_impl) + .StartsWith("__cm_intrinsic_impl_srnd", CMBK_cm_srnd_impl) + .StartsWith("__cm_intrinsic_impl_prefetch_bti", CMBK_cm_prefetch_impl) + .StartsWith("__cm_intrinsic_impl_block_prefetch_bti", CMBK_cm_block_prefetch_impl) + .StartsWith("__cm_intrinsic_impl_prefetch_flat", CMBK_cm_prefetch_flat_impl) + .StartsWith("__cm_intrinsic_impl_block_prefetch_flat", CMBK_cm_block_prefetch_flat_impl) + .StartsWith("__cm_intrinsic_impl_load_bti", CMBK_cm_load_impl) + .StartsWith("__cm_intrinsic_impl_load4_bti", CMBK_cm_load4_impl) + .StartsWith("__cm_intrinsic_impl_block_load_bti", CMBK_cm_block_load_impl) + .StartsWith("__cm_intrinsic_impl_store_bti", CMBK_cm_store_impl) + .StartsWith("__cm_intrinsic_impl_store4_bti", CMBK_cm_store4_impl) + .StartsWith("__cm_intrinsic_impl_block_store_bti", CMBK_cm_block_store_impl) + .StartsWith("__cm_intrinsic_impl_load_flat", CMBK_cm_load_flat_impl) + .StartsWith("__cm_intrinsic_impl_load4_flat", CMBK_cm_load4_flat_impl) + .StartsWith("__cm_intrinsic_impl_block_load_flat", CMBK_cm_block_load_flat_impl) + .StartsWith("__cm_intrinsic_impl_load_bindless", CMBK_cm_load_bindless_impl) + .StartsWith("__cm_intrinsic_impl_store_flat", CMBK_cm_store_flat_impl) + .StartsWith("__cm_intrinsic_impl_store4_flat", CMBK_cm_store4_flat_impl) + .StartsWith("__cm_intrinsic_impl_block_store_flat", CMBK_cm_block_store_flat_impl) + .StartsWith("__cm_intrinsic_impl_store_bindless", CMBK_cm_store_bindless_impl) + .StartsWith("__cm_intrinsic_impl_load_slm", CMBK_cm_load_slm_impl) + .StartsWith("__cm_intrinsic_impl_load4_slm", CMBK_cm_load4_slm_impl) + .StartsWith("__cm_intrinsic_impl_block_load_slm", CMBK_cm_block_load_slm_impl) + .StartsWith("__cm_intrinsic_impl_store_slm", CMBK_cm_store_slm_impl) + .StartsWith("__cm_intrinsic_impl_store4_slm", CMBK_cm_store4_slm_impl) + .StartsWith("__cm_intrinsic_impl_block_store_slm", CMBK_cm_block_store_slm_impl) + .StartsWith("__cm_intrinsic_impl_block_prefetch2d_flat", CMBK_cm_block_prefetch2d_flat_impl) + .StartsWith("__cm_intrinsic_impl_block_load2d_flat", CMBK_cm_block_load2d_flat_impl) + .StartsWith("__cm_intrinsic_impl_block_store2d_flat", CMBK_cm_block_store2d_flat_impl) + .StartsWith("__cm_intrinsic_impl_lsc_atomic_bti", CMBK_cm_atomic_bti_impl) + .StartsWith("__cm_intrinsic_impl_lsc_atomic_slm", CMBK_cm_atomic_slm_impl) + .StartsWith("__cm_intrinsic_impl_lsc_atomic_flat", CMBK_cm_atomic_flat_impl) + .StartsWith("__cm_intrinsic_impl_lsc_fence", CMBK_cm_lsc_fence_impl) + .StartsWith("__cm_intrinsic_impl_lsc_atomic_bindless", CMBK_cm_atomic_bindless_impl) .StartsWith("__cm_intrinsic_impl_dp4a", CMBK_cm_dp4a_impl) .StartsWith("__cm_intrinsic_impl_oword_read_dwaligned", CMBK_oword_read_dwaligned_impl) @@ -872,6 +906,17 @@ RValue CGCMRuntime::EmitCMBuiltin(CodeGenFunction &CGF, unsigned ID, // Clear reserved bits and only use first 1 bit MaskVal &= 0x1; + // Update regular barrier count in kernel metadata. 
+ if (llvm::MDNode *Node = getSLMSizeMDNode(CGF.CurFn)) { + if (llvm::Value *OldSz = getVal(Node->getOperand(llvm::genx::KernelMDOp::BarrierCnt))) { + assert(isa(OldSz) && "integer constant expected"); + uint64_t OldVal = cast(OldSz)->getZExtValue(); + if (OldVal == 0) { + llvm::Value *NewSz = llvm::ConstantInt::get(OldSz->getType(), 1); + Node->replaceOperandWith(llvm::genx::KernelMDOp::BarrierCnt, getMD(NewSz)); + } + } + } return RValue::get(CGF.Builder.CreateCall( Fn, llvm::ConstantInt::get(Fn->getFunctionType()->getParamType(0), MaskVal), @@ -881,6 +926,26 @@ RValue CGCMRuntime::EmitCMBuiltin(CodeGenFunction &CGF, unsigned ID, Error(E->getExprLoc(), "One signal flag argument expected"); return RValue::get(0); } + case Builtin::BIcm_nbarrier_init: + EmitBuiltinNBarrierInit(CGF, E); + return RValue::get(0); + case Builtin::BIcm_nbarrier_wait: + { + Fn = CGF.CGM.getGenXIntrinsic(llvm::GenXIntrinsic::genx_nbarrier); + if (E->getNumArgs() == 1) { + SmallVector Args; + Args.push_back(llvm::ConstantInt::get(CGF.Int8Ty, 0)); + const Expr *ArgE = E->getArg(0); + llvm::Value *Id = CGF.EmitAnyExpr(ArgE).getScalarVal(); + Args.push_back(Id); + Args.push_back(llvm::ConstantInt::get(CGF.Int8Ty, 0)); + return RValue::get(CGF.Builder.CreateCall(Fn, Args, "")); + } + else { + Error(E->getExprLoc(), "One barrier id argument expected"); + return RValue::get(0); + } + } case Builtin::BIcm_slm_init: EmitBuiltinSLMInit(CGF, E); return RValue::get(0); @@ -892,6 +957,17 @@ RValue CGCMRuntime::EmitCMBuiltin(CodeGenFunction &CGF, unsigned ID, { Fn = CGF.CGM.getGenXIntrinsic(llvm::GenXIntrinsic::genx_barrier); + // Update regular barrier count in kernel metadata. + if (llvm::MDNode *Node = getSLMSizeMDNode(CGF.CurFn)) { + if (llvm::Value *OldSz = getVal(Node->getOperand(llvm::genx::KernelMDOp::BarrierCnt))) { + assert(isa(OldSz) && "integer constant expected"); + uint64_t OldVal = cast(OldSz)->getZExtValue(); + if (OldVal == 0) { + llvm::Value *NewSz = llvm::ConstantInt::get(OldSz->getType(), 1); + Node->replaceOperandWith(llvm::genx::KernelMDOp::BarrierCnt, getMD(NewSz)); + } + } + } CGF.Builder.CreateCall(Fn); return RValue::get(0); @@ -1034,6 +1110,40 @@ llvm::Value *CGCMRuntime::EmitBuiltinSLMFree(CodeGenFunction &CGF, return NextIndex; } +void CGCMRuntime::EmitBuiltinNBarrierInit(CodeGenFunction &CGF, const CallExpr *E) { + // We check whether this call is inside a kernel function. + if (!CGF.CurFuncDecl->hasAttr()) { + Error(E->getExprLoc(), "cm_nbarrier_init shall only be called in a kernel"); + return; + } + + const Expr *Arg = E->getArg(0); + llvm::APSInt Size(8); + Expr::EvalResult SizeResult; + if (!Arg->EvaluateAsInt(SizeResult, CGF.getContext())) { + Error(Arg->getExprLoc(), "integral constant expected for nbarrier count"); + return; + } + Size = SizeResult.Val.getInt(); + + // Number of named barriers being requested. + uint32_t NewVal = Size.getZExtValue(); + if (NewVal == 0) { + Error(Arg->getExprLoc(), "zero nbarrier count being requested"); + return; + } + + // Update named barrier count in kernel metadata.
+ if (llvm::MDNode *Node = getSLMSizeMDNode(CGF.CurFn)) { + if (llvm::Value *OldSz = getVal(Node->getOperand(llvm::genx::KernelMDOp::NBarrierCnt))) { + assert(isa(OldSz) && "integer constant expected"); + llvm::Value *NewSz = llvm::ConstantInt::get(OldSz->getType(), NewVal); + uint64_t OldVal = cast(OldSz)->getZExtValue(); + if (OldVal < NewVal) + Node->replaceOperandWith(llvm::genx::KernelMDOp::NBarrierCnt, getMD(NewSz)); + } + } +} RValue CGCMRuntime::EmitCMCallExpr(CodeGenFunction &CGF, const CallExpr *E, ReturnValueSlot ReturnValue) { @@ -1374,6 +1484,50 @@ RValue CGCMRuntime::EmitCMCallExpr(CodeGenFunction &CGF, const CallExpr *E, return RValue::get(HandleBuiltinDPAS2Impl(getCurCMCallInfo(), Kind)); case CMBK_cm_bf_cvt_impl: return RValue::get(HandleBuiltinBFCVTImpl(getCurCMCallInfo(), Kind)); + case CMBK_cm_tf32_cvt_impl: + return RValue::get(HandleBuiltinTF32CVTImpl(getCurCMCallInfo(), Kind)); + case CMBK_cm_srnd_impl: + return RValue::get(HandleBuiltinSRNDImpl(getCurCMCallInfo(), Kind)); + case CMBK_cm_load_impl: + case CMBK_cm_load4_impl: + case CMBK_cm_block_load_impl: + case CMBK_cm_load_flat_impl: + case CMBK_cm_load4_flat_impl: + case CMBK_cm_block_load_flat_impl: + case CMBK_cm_load_bindless_impl: + case CMBK_cm_load_slm_impl: + case CMBK_cm_load4_slm_impl: + case CMBK_cm_block_load_slm_impl: + case CMBK_cm_atomic_bti_impl: + case CMBK_cm_atomic_slm_impl: + case CMBK_cm_atomic_flat_impl: + case CMBK_cm_atomic_bindless_impl: + return RValue::get(HandleBuiltinLSCImpl(getCurCMCallInfo(), Kind)); + case CMBK_cm_store_impl: + case CMBK_cm_store4_impl: + case CMBK_cm_block_store_impl: + case CMBK_cm_store_flat_impl: + case CMBK_cm_store4_flat_impl: + case CMBK_cm_block_store_flat_impl: + case CMBK_cm_prefetch_impl: + case CMBK_cm_block_prefetch_impl: + case CMBK_cm_prefetch_flat_impl: + case CMBK_cm_block_prefetch_flat_impl: + case CMBK_cm_store_bindless_impl: + case CMBK_cm_store_slm_impl: + case CMBK_cm_store4_slm_impl: + case CMBK_cm_block_store_slm_impl: + HandleBuiltinLSCImpl(getCurCMCallInfo(), Kind); + return RValue::get(0); + case CMBK_cm_block_load2d_flat_impl: + return RValue::get(HandleBuiltinLSC2dImpl(getCurCMCallInfo(), Kind)); + case CMBK_cm_block_store2d_flat_impl: + case CMBK_cm_block_prefetch2d_flat_impl: + HandleBuiltinLSC2dImpl(getCurCMCallInfo(), Kind); + return RValue::get(0); + case CMBK_cm_lsc_fence_impl: + HandleBuiltinLscFenceImpl(getCurCMCallInfo(), Kind); + return RValue::get(0); } // Returns the normal call rvalue. 
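As a usage sketch (illustrative only; the kernel shape and counts below are assumptions, not taken from this patch), the new named-barrier builtins compose as follows in CM kernel code:

_GENX_MAIN_ void nbarrier_kernel(/* kernel args */) {
  // cm_nbarrier_init must appear in a kernel and take an integral
  // constant; codegen records the count in the NBarrierCnt operand of
  // the kernel metadata (see EmitBuiltinNBarrierInit above).
  cm_nbarrier_init(4);
  // ... work ...
  // cm_nbarrier_wait takes a runtime barrier id and lowers to the
  // genx.nbarrier intrinsic with operands (0, id, 0).
  cm_nbarrier_wait(0);
}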
@@ -2706,6 +2860,12 @@ unsigned CGCMRuntime::GetGenxIntrinsicID(CMCallInfo &CallInfo, case CMBK_cm_bf_cvt: ID = llvm::GenXIntrinsic::genx_bf_cvt; break; + case CMBK_cm_tf32_cvt: + ID = llvm::GenXIntrinsic::genx_tf32_cvt; + break; + case CMBK_cm_srnd: + ID = llvm::GenXIntrinsic::genx_srnd; + break; case CMBK_sample16_impl: ID = llvm::GenXIntrinsic::genx_sample; break; @@ -3947,6 +4107,8 @@ typedef enum _CmAtomicOpType_ { ATOMIC_FMAX = 0x10, ATOMIC_FMIN = 0x11, ATOMIC_FCMPWR = 0x12, + ATOMIC_FADD = 0x13, + ATOMIC_FSUB = 0x14 } CmAtomicOpType; unsigned getAtomicIntrinsicID(CmAtomicOpType Op) { @@ -3983,6 +4145,10 @@ unsigned getAtomicIntrinsicID(CmAtomicOpType Op) { return llvm::GenXIntrinsic::genx_dword_atomic2_fmin; case ATOMIC_FCMPWR: return llvm::GenXIntrinsic::genx_dword_atomic2_fcmpwr; + case ATOMIC_FADD: + return llvm::GenXIntrinsic::genx_dword_atomic2_fadd; + case ATOMIC_FSUB: + return llvm::GenXIntrinsic::genx_dword_atomic2_fsub; } llvm_unreachable("invalid atomic operation"); @@ -4022,6 +4188,10 @@ unsigned getAtomicTypedIntrinsicID(CmAtomicOpType Op) { return llvm::GenXIntrinsic::genx_typed_atomic_fmin; case ATOMIC_FCMPWR: return llvm::GenXIntrinsic::genx_typed_atomic_fcmpwr; + case ATOMIC_FADD: + return llvm::GenXIntrinsic::genx_typed_atomic_fadd; + case ATOMIC_FSUB: + return llvm::GenXIntrinsic::genx_typed_atomic_fsub; } llvm_unreachable("invalid atomic operation"); @@ -5157,6 +5327,8 @@ AtomicCheckResult checkSLMAtomicOp(CmAtomicOpType Op, unsigned NumSrc) { case ATOMIC_MAXSINT: case ATOMIC_FMAX: case ATOMIC_FMIN: + case ATOMIC_FADD: + case ATOMIC_FSUB: return (NumSrc == 1) ? AR_Valid : AR_NotOneSrc; } @@ -5183,6 +5355,8 @@ AtomicCheckResult checkSLMAtomicOperands(CmAtomicOpType Op, QualType Ty) { case ATOMIC_FMAX: case ATOMIC_FMIN: case ATOMIC_FCMPWR: + case ATOMIC_FADD: + case ATOMIC_FSUB: return Ty->isFloatingType() ? 
AR_Valid : AR_NotFloat; } @@ -5295,6 +5469,10 @@ void CGCMRuntime::HandleBuiltinSLMAtomic(CMCallInfo &CallInfo) { ID == llvm::GenXIntrinsic::genx_dword_atomic_fmax || ID == llvm::GenXIntrinsic::genx_dword_atomic2_fmin || ID == llvm::GenXIntrinsic::genx_dword_atomic2_fmax || + ID == llvm::GenXIntrinsic::genx_dword_atomic_fadd || + ID == llvm::GenXIntrinsic::genx_dword_atomic_fsub || + ID == llvm::GenXIntrinsic::genx_dword_atomic2_fadd || + ID == llvm::GenXIntrinsic::genx_dword_atomic2_fsub || ID == llvm::GenXIntrinsic::genx_dword_atomic_fcmpwr || ID == llvm::GenXIntrinsic::genx_dword_atomic2_fcmpwr) Tys.push_back(llvm::VectorType::get(CGF.FloatTy, N)); @@ -5722,6 +5900,9 @@ unsigned getAtomicSVMIntrinsicID(CmAtomicOpType Op) { return llvm::GenXIntrinsic::genx_svm_atomic_fmin; case ATOMIC_FCMPWR: return llvm::GenXIntrinsic::genx_svm_atomic_fcmpwr; + case ATOMIC_FADD: + case ATOMIC_FSUB: + return llvm::GenXIntrinsic::not_genx_intrinsic; } llvm_unreachable("invalid atomic operation"); @@ -5748,6 +5929,8 @@ AtomicCheckResult checkSVMAtomicOp(CmAtomicOpType Op, unsigned NumSrc) { case ATOMIC_FMAX: case ATOMIC_FMIN: case ATOMIC_FCMPWR: + case ATOMIC_FADD: + case ATOMIC_FSUB: return AR_Invalid; } @@ -5781,6 +5964,10 @@ QualType checkSVMAtomicOperands(CmAtomicOpType Op, ASTContext &CTX, QualType Ori case ATOMIC_FMIN: case ATOMIC_FCMPWR: return OrigElType; + case ATOMIC_FADD: + case ATOMIC_FSUB: + assert(false && "floating point operands not supported for SVM atomic operation."); + return OrigElType; } QualType NewElType; @@ -7432,6 +7619,407 @@ llvm::Value *CGCMRuntime::HandleBuiltinBFCVTImpl(CMCallInfo &CallInfo, return Result; } +/// \brief Postprocess builtin cm_tf32_cvt. +/// +/// template +/// vector +/// __cm_intrinsic_impl_tf32_cvt(vector src0) +/// +llvm::Value *CGCMRuntime::HandleBuiltinTF32CVTImpl(CMCallInfo &CallInfo, + CMBuiltinKind Kind) { + assert(Kind == CMBK_cm_tf32_cvt_impl); + + const CallExpr *CE = CallInfo.CE; + assert(CE->getType()->isCMVectorMatrixType()); + assert(CE->getNumArgs() == 1); + + llvm::CallInst *CI = CallInfo.CI; + CGBuilderTy Builder(*CallInfo.CGF, CI); + + llvm::Type *Tys[] = {CI->getType(), CI->getOperand(0)->getType()}; + llvm::Function *F = getGenXIntrinsic(llvm::GenXIntrinsic::genx_tf32_cvt, Tys); + llvm::CallInst *Result = + Builder.CreateCall(F, CI->getOperand(0), CI->getName()); + Result->setDebugLoc(CI->getDebugLoc()); + + CI->eraseFromParent(); + return Result; +} + +/// \brief Postprocess builtin cm_srnd.
+/// +/// template +/// vector +/// __cm_intrinsic_impl_srnd(vector src0, vector src1) +/// +llvm::Value *CGCMRuntime::HandleBuiltinSRNDImpl(CMCallInfo &CallInfo, + CMBuiltinKind Kind) { + assert(Kind == CMBK_cm_srnd_impl); + + const CallExpr *CE = CallInfo.CE; + + assert(CE->getType()->isCMVectorMatrixType()); + assert(CE->getNumArgs() == 2); + + llvm::CallInst *CI = CallInfo.CI; + llvm::Value *Arg0 = CI->getArgOperand(0); + llvm::Value *Arg1 = CI->getArgOperand(1); + + SmallVector Tys; + Tys.push_back(CI->getType()); + Tys.push_back(Arg0->getType()); + Tys.push_back(Arg1->getType()); + + CGBuilderTy Builder(*CallInfo.CGF, CI); + + SmallVector Args; + Args.push_back(Arg0); + Args.push_back(Arg1); + + llvm::Function *F = getGenXIntrinsic(llvm::GenXIntrinsic::genx_srnd, Tys); + llvm::CallInst *Result = Builder.CreateCall(F, Args, CI->getName()); + Result->setDebugLoc(CI->getDebugLoc()); + CallInfo.CI->eraseFromParent(); + return Result; +} + + +#define LSCINC +#include "CMLSCDef.h" +#undef LSCINC + +/// \brief Postprocess block 2d builtins load/store/prefetch. +/// +/// template +/// vector +/// __cm_intrinsic_impl_load2d(uint64_t Base, unsigned SurfaceWidth, +/// unsigned SurfaceHeight, unsigned SurfacePitch, +/// int X, int Y); +/// +/// template +/// void __cm_intrinsic_impl_store2d(uint64_t Base, unsigned SurfaceWidth, +/// unsigned SurfaceHeight, unsigned SurfacePitch, +/// int X, int Y, vector Data); +/// +/// template +/// void __cm_intrinsic_impl_prefetch2d(uint64_t Base, unsigned SurfaceWidth, +/// unsigned SurfaceHeight, unsigned SurfacePitch, +/// int X, int Y); +/// +llvm::Value *CGCMRuntime::HandleBuiltinLSC2dImpl(CMCallInfo &CallInfo, + CMBuiltinKind Kind) { + assert(Kind == CMBK_cm_block_load2d_flat_impl || + Kind == CMBK_cm_block_prefetch2d_flat_impl || + Kind == CMBK_cm_block_store2d_flat_impl); + + auto &CGF = *CallInfo.CGF; + const FunctionDecl *FD = CallInfo.CE->getDirectCallee(); + assert(FD && FD->isTemplateInstantiation()); + unsigned NBlocks = getIntegralValue(FD, 1); + unsigned Width = getIntegralValue(FD, 2); + unsigned Height = getIntegralValue(FD, 3); + auto L1H = CacheHint::Default; + auto L3H = CacheHint::Default; + auto Transposed = LSC_DATA_ORDER::LSC_DATA_ORDER_NONTRANSPOSE; + uint8_t Transformed = 0; + + switch (Kind) { + default: + break; + case CMBK_cm_block_load2d_flat_impl: + Transposed = getIntegralValue(FD, 4) + ?
LSC_DATA_ORDER::LSC_DATA_ORDER_TRANSPOSE + : LSC_DATA_ORDER::LSC_DATA_ORDER_NONTRANSPOSE; + Transformed = getIntegralValue(FD, 5); + L1H = static_cast(getIntegralValue(FD, 6)); + L3H = static_cast(getIntegralValue(FD, 7)); + break; + case CMBK_cm_block_prefetch2d_flat_impl: + case CMBK_cm_block_store2d_flat_impl: + L1H = static_cast(getIntegralValue(FD, 4)); + L3H = static_cast(getIntegralValue(FD, 5)); + break; + } + + DataSize DS = DataSize::U8; + { + const TemplateArgumentList *TempArgs = FD->getTemplateSpecializationArgs(); + QualType T = TempArgs->get(0).getAsType(); + size_t SizeInBits = CGM.getContext().getTypeSize(T); + switch (SizeInBits) { + default: + CGM.ErrorUnsupported(FD, "unsupported template type"); + break; + case 8: + DS = DataSize::U8; + break; + case 16: + DS = DataSize::U16; + break; + case 32: + DS = DataSize::U32; + break; + case 64: + DS = DataSize::U64; + break; + } + } + + llvm::CallInst *CI = CallInfo.CI; + CGBuilderTy Builder(*CallInfo.CGF, CI); + + llvm::Value *Pred = Builder.getTrue(); + llvm::Value *Addr = CI->getArgOperand(0); + + std::vector Args = { + Pred, + Builder.getInt8(static_cast(L1H)), + Builder.getInt8(static_cast(L3H)), + Builder.getInt8(static_cast(DS)), + Builder.getInt8(static_cast(Transposed)), + Builder.getInt8(static_cast(NBlocks)), + Builder.getInt16(static_cast(Width)), + Builder.getInt16(static_cast(Height)), + Builder.getInt8(static_cast(Transformed)), // VNNI + Addr, // Surface base + CI->getArgOperand(1), // Surface width + CI->getArgOperand(2), // Surface height + CI->getArgOperand(3), // Surface pitch + CI->getArgOperand(4), // X + CI->getArgOperand(5) // Y + }; + + assert((Addr->getType() == CGF.Int32Ty) || (Addr->getType() == CGF.Int64Ty)); + + auto ID = llvm::GenXIntrinsic::genx_lsc_load2d_stateless; + std::vector Tys; + if (Kind == CMBK_cm_block_load2d_flat_impl) { + Tys.push_back(CI->getType()); // return type + Tys.push_back(Pred->getType()); // predicate + Tys.push_back(Addr->getType()); // address type (i32/i64) + } else if (Kind == CMBK_cm_block_prefetch2d_flat_impl) { + Tys.push_back(Pred->getType()); // predicate + Tys.push_back(Addr->getType()); // address type (i32/i64) + ID = llvm::GenXIntrinsic::genx_lsc_prefetch2d_stateless; + } else { + Tys.push_back(Pred->getType()); // predicate + Tys.push_back(Addr->getType()); // address type (i32/i64) + Tys.push_back(CI->getArgOperand(6)->getType()); // data type + ID = llvm::GenXIntrinsic::genx_lsc_store2d_stateless; + Args.push_back(CI->getArgOperand(6)); // Data to write + } + + llvm::Function *F = getGenXIntrinsic(ID, Tys); + auto NewCI = Builder.CreateCall(F, Args); + NewCI->setDebugLoc(CI->getDebugLoc()); + if (!CI->getType()->isVoidTy()) + CI->replaceAllUsesWith(NewCI); + CI->eraseFromParent(); + return NewCI; +} + +/// \brief Postprocess basic LSC messages. +llvm::Value *CGCMRuntime::HandleBuiltinLSCImpl(CMCallInfo &CallInfo, + CMBuiltinKind Kind) { + // main config from builtin kind + const int *Config = getConfig(Kind); + LDTYPE Ldt = getOpType(Kind); + SFTYPE Sft = getSFType(Kind); + bool IsBlock = getBlock(Kind); + + // params from function decl + LSCParams Params{CallInfo, Config, Kind}; + CGBuilderTy Builder(*CallInfo.CGF, Params.CI_); + + // special logic for subopcode: atomics get it from template params + if (Ldt != ATOMIC) { + LSC_SubOpcode Op; + Op = getSubOp(Kind); + Params.setOp(Op); + } + + // special logic for SLM Idx + if (Sft == SLM) + Params.Idx_ = Builder.getInt32(0); + + // special logic for pred + if (IsBlock) + Params.Pred_ = Builder.getTrue(); + else { + 
assert(Params.Pred_ && Params.Pred_->getType()->isVectorTy() && + "We expect vector Pred if it exists"); + auto VTy = llvm::VectorType::get( + Builder.getInt1Ty(), Params.Pred_->getType()->getVectorNumElements()); + Params.Pred_ = Builder.CreateTruncOrBitCast(Params.Pred_, VTy); + } + + // special logic for transpose + // LSC_DATA_ORDER_TRANSPOSE requires ExecSize of 1 + if (!IsBlock) + Params.setNonTranspose(); + + // LSC_DATA_ELEMS_1 illegal on transposed operation + if (Params.VS_ == static_cast(VectorSize::N1)) + Params.setNonTranspose(); + + // Special logic for flat addresses + if ((Sft == FLAT) || (Sft == BINDLESS)) { + llvm::Value *BaseAddr = Params.Idx_; + llvm::Value *Offsets = Params.Offset_; + assert(BaseAddr && Offsets); + // Base Addr shall be int64 in all cases + auto Int64Ty = CallInfo.CGF->Int64Ty; + if (BaseAddr->getType() != Int64Ty) { + if (BaseAddr->getType()->isPointerTy()) + BaseAddr = Builder.CreatePtrToInt(BaseAddr, Int64Ty); + else + BaseAddr = Builder.CreateZExt(BaseAddr, Int64Ty); + } + + // Create off = off + baseaddr, with a possible extension to vectors + if (IsBlock) { + if (Offsets->getType() != Int64Ty) + Offsets = Builder.CreateZExt(Offsets, Int64Ty); + Offsets = Builder.CreateAdd(Offsets, Params.Idx_); + } else { + assert(Offsets->getType()->isVectorTy()); + auto N = Offsets->getType()->getVectorNumElements(); + + // Convert the offset element type to i64 if it is not. + if (Offsets->getType()->getVectorElementType() != Int64Ty) { + llvm::Type *Ty = llvm::VectorType::get(Int64Ty, N); + Offsets = Builder.CreateZExt(Offsets, Ty); + } + // workaround: HW does not really have a global address, so add + // the base address and the offset + llvm::Value *Splat = Builder.CreateVectorSplat(N, BaseAddr); + Offsets = Builder.CreateAdd(Offsets, Splat); + } + + // in both flat and bindless cases the base addr is ignored after use + Params.Idx_ = Builder.getInt32(0); + Params.Offset_ = Offsets; + } + + // undef value for atomic oldval + llvm::Value *OldVal = llvm::UndefValue::get(Params.CI_->getType()); + + // undef sources if null + if (Ldt == ATOMIC) { + if (Params.Data_ == nullptr) + Params.Data_ = OldVal; + if (Params.Data1_ == nullptr) + Params.Data1_ = OldVal; + } + + SmallVector Tys; + + if (Ldt == LOAD || Ldt == ATOMIC) + Tys.push_back(Params.CI_->getType()); + + assert(Params.Pred_ && Params.Offset_); + Tys.push_back(Params.Pred_->getType()); + Tys.push_back(Params.Offset_->getType()); + + if (Ldt == STORE) { + assert(Params.Data_); + Tys.push_back(Params.Data_->getType()); + } + + // if channel mask not specified, it is constant zero + if (Params.ChM_ == nullptr) + Params.ChM_ = Builder.getInt8(0); + else { + // making char from enum + auto Int8Ty = CallInfo.CGF->Int8Ty; + if (Params.ChM_->getType() != Int8Ty) + Params.ChM_ = Builder.CreateTruncOrBitCast(Params.ChM_, Int8Ty); + } + + llvm::Function *F = getGenXIntrinsic(getLSCIntrinsic(Kind), Tys); + + SmallVector Args; + Args.push_back(Params.Pred_); // Pred + Args.push_back(Builder.getInt8(Params.Op_)); // Subop + Args.push_back(Builder.getInt8(Params.L1H_)); // L1H + Args.push_back(Builder.getInt8(Params.L3H_)); // L3H + Args.push_back(Builder.getInt16(1)); // Addr scale + Args.push_back(Builder.getInt32(Params.ImmOffset_)); // imm offset + Args.push_back(Builder.getInt8(Params.DS_)); // Datum size + Args.push_back(Builder.getInt8(Params.VS_)); // Vector size + Args.push_back(Builder.getInt8(Params.Transposed_)); // transposed + Args.push_back(Params.ChM_); // channel mask + Args.push_back(Params.Offset_); // offsets + if (Ldt == 
STORE) + Args.push_back(Params.Data_); // data + if (Ldt == ATOMIC) { + Args.push_back(Params.Data_); // src0 or undef + Args.push_back(Params.Data1_); // src1 or undef + } + Args.push_back(Params.Idx_); // surface + if (Ldt == ATOMIC) + Args.push_back(OldVal); // oldval + + auto NewCI = Builder.CreateCall(F, Args); + NewCI->setDebugLoc(Params.CI_->getDebugLoc()); + Params.CI_->replaceAllUsesWith(NewCI); + Params.CI_->eraseFromParent(); + return NewCI; +} + + +/// \brief Postprocess cm_lsc_fence implementation builtins. +// +// template +// RetTy __cm_intrinsic_impl_lsc_fence(vector Pred); +// + +llvm::Value * +CGCMRuntime::HandleBuiltinLscFenceImpl(CMCallInfo &CallInfo, + CMBuiltinKind Kind) { + assert(Kind == CMBK_cm_lsc_fence_impl); + const FunctionDecl *FD = CallInfo.CE->getDirectCallee(); + auto Sfid = static_cast(getIntegralValue(FD, 0)); + auto FenceOp = static_cast(getIntegralValue(FD, 1)); + auto Scope = static_cast(getIntegralValue(FD, 2)); + llvm::CallInst *CI = CallInfo.CI; + CGBuilderTy Builder(*CallInfo.CGF, CI); + // Convert predicate from N x i16 to N x i1 + llvm::Value *Pred = CI->getArgOperand(0); + { + assert(Pred->getType()->isVectorTy()); + auto VTy = llvm::VectorType::get(Builder.getInt1Ty(), + Pred->getType()->getVectorNumElements()); + Pred = Builder.CreateTruncOrBitCast(Pred, VTy); + } + llvm::Type *Tys[] = { + Pred->getType() // predicate + }; + + llvm::Function *F = + getGenXIntrinsic(llvm::GenXIntrinsic::genx_lsc_fence, Tys); + SmallVector Args; + Args.push_back(Pred); // predicate + Args.push_back(Builder.getInt8(static_cast(Sfid))); // SFID + Args.push_back(Builder.getInt8(static_cast(FenceOp))); // FenceOp + Args.push_back(Builder.getInt8(static_cast(Scope))); // Scope + + auto NewCI = Builder.CreateCall(F, Args); + NewCI->setDebugLoc(CI->getDebugLoc()); + llvm::Value *Result = nullptr; + if (!CI->getType()->isVoidTy()) { + Result = NewCI; + CI->replaceAllUsesWith(NewCI); + } + CI->eraseFromParent(); + return Result; +} + /// \brief Emit one of scatter_scaled, scatter4_scaled. /// /// \param Selector log2 number of blocks for scatter_scaled diff --git a/lib/CodeGen/CGCMBuiltin.h b/lib/CodeGen/CGCMBuiltin.h index 099df12d2c6f..4562950982c1 100644 --- a/lib/CodeGen/CGCMBuiltin.h +++ b/lib/CodeGen/CGCMBuiltin.h @@ -88,6 +88,42 @@ enum CMBuiltinKind { CMBK_cm_dpasw, CMBK_cm_dpasw_impl, CMBK_cm_dpasw_nosrc0, CMBK_cm_dpasw_nosrc0_impl, CMBK_cm_bf_cvt, CMBK_cm_bf_cvt_impl, + CMBK_cm_tf32_cvt, CMBK_cm_tf32_cvt_impl, + CMBK_cm_srnd, CMBK_cm_srnd_impl, + // LSC interface + CMBK_cm_prefetch_impl, + CMBK_cm_block_prefetch_impl, + CMBK_cm_prefetch_flat_impl, + CMBK_cm_block_prefetch_flat_impl, + CMBK_cm_prefetch_bindless_impl, + CMBK_cm_load_impl, + CMBK_cm_load4_impl, + CMBK_cm_block_load_impl, + CMBK_cm_load_flat_impl, + CMBK_cm_load4_flat_impl, + CMBK_cm_block_load_flat_impl, + CMBK_cm_store_impl, + CMBK_cm_store4_impl, + CMBK_cm_block_store_impl, + CMBK_cm_store_flat_impl, + CMBK_cm_store4_flat_impl, + CMBK_cm_block_store_flat_impl, + CMBK_cm_block_prefetch2d_flat_impl, + CMBK_cm_block_load2d_flat_impl, + CMBK_cm_block_store2d_flat_impl, + CMBK_cm_block_load_slm_impl, + CMBK_cm_load_slm_impl, + CMBK_cm_load4_slm_impl, + CMBK_cm_block_store_slm_impl, + CMBK_cm_store_slm_impl, + CMBK_cm_store4_slm_impl, + CMBK_cm_load_bindless_impl, + CMBK_cm_store_bindless_impl, + CMBK_cm_atomic_bti_impl, + CMBK_cm_atomic_slm_impl, + CMBK_cm_atomic_flat_impl, + CMBK_cm_atomic_bindless_impl, + CMBK_cm_lsc_fence_impl, // Data port interface.
CMBK_oword_read_impl, CMBK_oword_read_dwaligned_impl, diff --git a/lib/CodeGen/CMLSCDef.h b/lib/CodeGen/CMLSCDef.h new file mode 100644 index 000000000000..10644983ba3b --- /dev/null +++ b/lib/CodeGen/CMLSCDef.h @@ -0,0 +1,632 @@ +/*========================== begin_copyright_notice ============================ + +Copyright (C) 2020-2021 Intel Corporation + +SPDX-License-Identifier: MIT + +============================= end_copyright_notice ===========================*/ + +#ifdef LSCINC + +// !!! --- Keep in sync with implementation header begin --- !!! +// TODO: we should generate at least this stuff + +enum class LSC_SubOpcode : uint8_t { + LSC_LOAD = 0x00, + LSC_LOAD_STRIDED = 0x01, // aka "load_block" + LSC_LOAD_QUAD = 0x02, // aka "load_cmask" + LSC_LOAD_BLOCK2D = 0x03, + LSC_STORE = 0x04, + LSC_STORE_STRIDED = 0x05, // aka "store_block" + LSC_STORE_QUAD = 0x06, // aka "store_cmask" + LSC_STORE_BLOCK2D = 0x07, + // + LSC_ATOMIC_IINC = 0x08, + LSC_ATOMIC_IDEC = 0x09, + LSC_ATOMIC_LOAD = 0x0A, + LSC_ATOMIC_STORE = 0x0B, + LSC_ATOMIC_IADD = 0x0C, + LSC_ATOMIC_ISUB = 0x0D, + LSC_ATOMIC_SMIN = 0x0E, + LSC_ATOMIC_SMAX = 0x0F, + LSC_ATOMIC_UMIN = 0x10, + LSC_ATOMIC_UMAX = 0x11, + LSC_ATOMIC_ICAS = 0x12, + LSC_ATOMIC_FADD = 0x13, + LSC_ATOMIC_FSUB = 0x14, + LSC_ATOMIC_FMIN = 0x15, + LSC_ATOMIC_FMAX = 0x16, + LSC_ATOMIC_FCAS = 0x17, + LSC_ATOMIC_AND = 0x18, + LSC_ATOMIC_OR = 0x19, + LSC_ATOMIC_XOR = 0x1A, + // + LSC_LOAD_STATUS = 0x1B, + LSC_STORE_UNCOMPRESSED = 0x1C, + LSC_CCS_UPDATE = 0x1D, + LSC_READ_STATE_INFO = 0x1E, + LSC_FENCE = 0x1F, +}; + +// L1 or L3 cache hint kinds. +enum class CacheHint : uint8_t { + Default = 0, + Uncached = 1, + WriteBack = 2, + WriteThrough = 3, + Streaming = 4, + ReadInvalidate = 5 +}; + +// Data size or format to read or store. +enum class DataSize : uint8_t { + U8 = 1, + U16 = 2, + U32 = 3, + U64 = 4, + U8U32 = 5, // load 8b, zero extend to 32b; store the opposite + U16U32 = 6, // load 16b, zero extend to 32b; store the opposite + U16U32H = 7 // load 16b into high 16 of each 32b; store the high 16 +}; + +// The number of elements to load per address (vector size) +enum class VectorSize : uint8_t { + N1 = 1, // 1 element + N2 = 2, // 2 element + N3 = 3, // 3 element + N4 = 4, // 4 element + N8 = 5, // 8 element + N16 = 6, // 16 element + N32 = 7, // 32 element + N64 = 8 // 64 element +}; + +enum class LSC_DATA_ORDER : uint8_t { + LSC_DATA_ORDER_INVALID, + LSC_DATA_ORDER_NONTRANSPOSE, + LSC_DATA_ORDER_TRANSPOSE, +}; + +enum class LSC_SCOPE : uint8_t { + LSC_SCOPE_GROUP, + LSC_SCOPE_LOCAL, + LSC_SCOPE_TILE, + LSC_SCOPE_GPU, + LSC_SCOPE_GPUS, + LSC_SCOPE_SYSTEM, + LSC_SCOPE_SYSACQ +}; + +enum class LSC_FENCE_OP : uint8_t { + LSC_FENCE_OP_NONE, + LSC_FENCE_OP_EVICT, + LSC_FENCE_OP_INVALIDATE, + LSC_FENCE_OP_DISCARD, + LSC_FENCE_OP_CLEAN, + LSC_FENCE_OP_FLUSHL3 +}; +enum class LSC_SFID : uint8_t { + LSC_UGM, + LSC_UGML, + LSC_TGM, + LSC_SLM +}; +// !!! --- Keep in sync with implementation header end --- !!!
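// Illustrative aside (not part of this header): these enums are encoded as
// i8 immediates on the GenX LSC intrinsics by HandleBuiltinLSCImpl in
// CGCMBuiltin.cpp. For example, a non-transposed 4-element 32-bit gather
// with L1 uncached / L3 write-back hints is emitted roughly as:
//   i8 0x00  LSC_SubOpcode::LSC_LOAD
//   i8 1     CacheHint::Uncached (L1)
//   i8 2     CacheHint::WriteBack (L3)
//   i8 3     DataSize::U32
//   i8 4     VectorSize::N4
//   i8 1     LSC_DATA_ORDER_NONTRANSPOSE
// followed by the per-lane predicate, the offset vector and the surface
// operand.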
+ +// forward decl for static functions + +static bool getBlock(CMBuiltinKind Kind); + +// configuration for LSC templates +// +// Basically we have one format of intrinsics: +// template parameters: +// * optional RetTy param somewhere +// * optional N param somewhere +// * optional Transposed param somewhere +// * common block of (DS, VS, ImmOffset, L1H, L3H) at some offset +// * probably atomic Op value +// non-template parameters +// * return value +// * Idx (or Addr) +// * Offsets +// * Data (stores only) +// * Pred +// The idea is to abstract this so that any LSC message is handled in a +// uniform way and no code is duplicated + +// config[XXX_IDX] is the place in template params to look for XXX +constexpr int TY_IDX = 0; +constexpr int DS_IDX = 1; +constexpr int VS_IDX = 2; +constexpr int IMOFF_IDX = 3; +constexpr int L1_IDX = 4; +constexpr int L3_IDX = 5; +constexpr int TRANSPOSED_IDX = 6; +constexpr int N_IDX = 7; +constexpr int OP_IDX = 8; +constexpr int ILLEGAL_TEMPLATE = 9; +// config[XXX_AIDX] is the place in non-template params to look for XXX +constexpr int IDX_AIDX = 9; +constexpr int OFFSET_AIDX = 10; +constexpr int DATA_AIDX = 11; +constexpr int DATA1_AIDX = 12; +constexpr int PRED_AIDX = 13; +constexpr int CHMASK_AIDX = 14; +constexpr int ILLEGAL_NONTEMPLATE = 15; + +// data structure for basic LSC parameters +struct LSCParams { + llvm::Value *Ty_; + const FunctionDecl *FD_; + llvm::CallInst *CI_; + const int *Config_; + unsigned char DS_; // data size + unsigned char VS_; // vector size + int ImmOffset_ = 0; // immediate offset + unsigned char L1H_; // L1 cache hint + unsigned char L3H_; // L3 cache hint + unsigned char Transposed_; // is transposed access + int N_; + unsigned char Op_; + + llvm::Value *Idx_ = nullptr; + llvm::Value *Offset_ = nullptr; // offset(s) + llvm::Value *Pred_ = nullptr; // predicate(s) + llvm::Value *Data_ = nullptr; // src0 + llvm::Value *Data1_ = nullptr; // src1 if any + llvm::Value *ChM_ = nullptr; // channel mask + + template void getTemplateIntParam(T &Data, unsigned N) { + assert(N < ILLEGAL_TEMPLATE); + if (Config_[N] == -1) + return; + Data = getIntegralValue(FD_, Config_[N]); + } + + llvm::Value *getNonTemplateValue(unsigned N) { + assert(N < ILLEGAL_NONTEMPLATE); + int OpNum = Config_[N]; + // any field may exist or not + if (OpNum < 0) + return nullptr; + // sources may exist or not in atomic + if (CI_->getNumArgOperands() <= static_cast(OpNum)) + return nullptr; + return CI_->getArgOperand(OpNum); + } + + LSCParams(CGCMRuntime::CMCallInfo &CallInfo, const int *Config, CMBuiltinKind Kind) + : FD_(CallInfo.CE->getDirectCallee()), CI_(CallInfo.CI), Config_(Config) { + assert(CI_ && "Callinst required"); + assert(FD_ && "Func decl required"); + + // start template params + getTemplateIntParam(DS_, DS_IDX); + getTemplateIntParam(VS_, VS_IDX); + getTemplateIntParam(ImmOffset_, IMOFF_IDX); + getTemplateIntParam(L1H_, L1_IDX); + getTemplateIntParam(L3H_, L3_IDX); + getTemplateIntParam(N_, N_IDX); + getTemplateIntParam(Op_, OP_IDX); + + // defaults (mostly for SLM) + if (Config_[L1_IDX] == -1) + L1H_ = static_cast(CacheHint::Default); + if (Config_[L3_IDX] == -1) + L3H_ = static_cast(CacheHint::Default); + + // special logic for transpose + int TVal = getBlock(Kind); + getTemplateIntParam(TVal, TRANSPOSED_IDX); + LSC_DATA_ORDER Transposed = TVal ?
+ LSC_DATA_ORDER::LSC_DATA_ORDER_TRANSPOSE : + LSC_DATA_ORDER::LSC_DATA_ORDER_NONTRANSPOSE; + Transposed_ = static_cast(Transposed); + + // start non-template params + Idx_ = getNonTemplateValue(IDX_AIDX); + Offset_ = getNonTemplateValue(OFFSET_AIDX); + Data_ = getNonTemplateValue(DATA_AIDX); + Data1_ = getNonTemplateValue(DATA1_AIDX); + Pred_ = getNonTemplateValue(PRED_AIDX); + ChM_ = getNonTemplateValue(CHMASK_AIDX); + } + + void setNonTranspose() { + auto Transposed = LSC_DATA_ORDER::LSC_DATA_ORDER_NONTRANSPOSE; + Transposed_ = static_cast(Transposed); + } + + void setOp(LSC_SubOpcode Op) { + Op_ = static_cast(Op); + } +}; + +// type of operation +enum LDTYPE { LOAD, STORE, PREFETCH, ATOMIC }; + +// type of surface +enum SFTYPE { BTI, FLAT, BINDLESS, SLM }; + +/// \brief Emit LSC load, store and prefetch (BTI-based) +/// +/// template +/// RetTy __cm_intrinsic_impl_load_bti(SurfaceIndex Idx, +/// vector Offset, +/// vector Pred); + +/* TY, DS, VS, IMOFF, L1, L3, TRANS, N, OP, IDX, OFF, DATA, DATA1, PRED, CHMASK */ +constexpr int Lsc_load_bti[] = {0, 1, 2, 3, 4, 5, 6, 7, -1, 0, 1, -1, -1, 2, -1}; +constexpr int Lsc_load_flat[] = {0, 1, 2, 3, 4, 5, 6, 7, -1, 0, 1, -1, -1, 2, -1}; +constexpr int Lsc_load_bindless[] = {0, 1, 2, 3, 4, 5, 6, 7, -1, 0, 1, -1, -1, 2, -1}; +constexpr int Lsc_load_slm[] = {0, 1, 2, 3, -1, -1, 4, 5, -1, -1, 0, -1, -1, 1, -1}; + +constexpr int Lsc_load4_bti[] = {0, 1, 2, 3, 4, 5, 6, 7, -1, 0, 1, -1, -1, 2, 3}; +constexpr int Lsc_load4_flat[] = {0, 1, 2, 3, 4, 5, 6, 7, -1, 0, 1, -1, -1, 2, 3}; +constexpr int Lsc_load4_bindless[] = {0, 1, 2, 3, 4, 5, 6, 7, -1, 0, 1, -1, -1, 2, 3}; +constexpr int Lsc_load4_slm[] = {0, 1, 2, 3, -1, -1, 4, 5, -1, -1, 0, -1, -1, 1, 2}; + +/// template +/// RetTy __cm_intrinsic_impl_block_load_bti(SurfaceIndex Idx, +/// uint Offset); + +/* TY, DS, VS, IMOFF, L1, L3, TRANS, N, OP, IDX, OFF, DATA, DATA1, PRED, CHMASK */ +constexpr int Lsc_block_load_bti[] = {0, 1, 2, 3, 4, 5, 6, -1, -1, 0, 1, -1, -1, -1, -1}; +constexpr int Lsc_block_load_flat[] = {0, 1, 2, 3, 4, 5, 6, -1, -1, 0, 1, -1, -1, -1, -1}; +constexpr int Lsc_block_load_slm[] = {0, 1, 2, 3, -1, -1, 4, -1, -1, -1, 0, -1, -1, -1, -1}; + +/// template +/// void __cm_intrinsic_impl_prefetch_bti(SurfaceIndex Idx, +/// vector Offset, +/// vector Pred); + +/* TY, DS, VS, IMOFF, L1, L3, TRANS, N, OP, IDX, OFF, DATA, DATA1, PRED, CHMASK */ +constexpr int Lsc_prefetch_bti[] = {-1, 0, 1, 2, 3, 4, -1, 5, -1, 0, 1, -1, -1, 2, -1}; +constexpr int Lsc_prefetch_flat[] = {-1, 0, 1, 2, 3, 4, -1, 5, -1, 0, 1, -1, -1, 2, -1}; +constexpr int Lsc_prefetch_bindless[] = {-1, 0, 1, 2, 3, 4, -1, 5, -1, 0, 1, -1, -1, 2, -1}; + +/// +/// template +/// void __cm_intrinsic_impl_block_prefetch_bti(SurfaceIndex Idx, +/// unsigned Offset); + +/* TY, DS, VS, IMOFF, L1, L3, TRANS, N, OP, IDX, OFF, DATA, DATA1, PRED, CHMASK */ +constexpr int Lsc_block_prefetch_bti[] = {-1, 0, 1, 2, 3, 4, -1, -1, -1, 0, 1, -1, -1, -1, -1}; +constexpr int Lsc_block_prefetch_flat[] = {-1, 0, 1, 2, 3, 4, -1, -1, -1, 0, 1, -1, -1, -1, -1}; + +/// template +/// void __cm_intrinsic_impl_store_bti(SurfaceIndex Idx, +/// vector Offset, +/// vector Data, +/// vector Pred); + +/* TY, DS, VS, IMOFF, L1, L3, TRANS, N, OP, IDX, OFF, DATA, DATA1, PRED, CHMASK */ +constexpr int Lsc_store_bti[] = {0, 1, 2, 3, 4, 5, 6, 7, -1, 0, 1, 2, -1, 3, -1}; +constexpr int Lsc_store_flat[] = {0, 1, 2, 3, 4, 5, 6, 7, -1, 0, 1, 2, -1, 3, -1}; +constexpr int Lsc_store_bindless[] = {0, 1, 2, 3, 4, 5, 6, 7, -1, 0, 1, 2, -1, 3, -1}; +constexpr int Lsc_store_slm[] = {0, 
1, 2, 3, -1, -1, 4, 5, -1, -1, 0, 1, -1, 2, -1}; + +constexpr int Lsc_store4_bti[] = {0, 1, 2, 3, 4, 5, 6, 7, -1, 0, 1, 2, -1, 3, 4}; +constexpr int Lsc_store4_flat[] = {0, 1, 2, 3, 4, 5, 6, 7, -1, 0, 1, 2, -1, 3, 4}; +constexpr int Lsc_store4_bindless[] = {0, 1, 2, 3, 4, 5, 6, 7, -1, 0, 1, 2, -1, 3, 4}; +constexpr int Lsc_store4_slm[] = {0, 1, 2, 3, -1, -1, 4, 5, -1, -1, 0, 1, -1, 2, 3}; + +/// template Data) + +/* TY, DS, VS, IMOFF, L1, L3, TRANS, N, OP, IDX, OFF, DATA, DATA1, PRED, CHMASK */ +constexpr int Lsc_block_store_bti[] = {0, 1, 2, 3, 4, 5, 6, -1, -1, 0, 1, 2, -1, -1, -1}; +constexpr int Lsc_block_store_flat[] = {0, 1, 2, 3, 4, 5, 6, -1, -1, 0, 1, 2, -1, -1, -1}; +constexpr int Lsc_block_store_slm[] = {0, 1, 2, 3, -1, -1, 4, -1, -1, -1, 0, 1, -1, -1, -1}; + +// template +// RetTy _cm_intrinsic_impl_lsc_atomic_bti(vector Pred, +// SurfaceIndex Idx, +// vector Offset, +// Args... args); + +/* TY, DS, VS, IMOFF, L1, L3, TRANS, N, OP, IDX, OFF, DATA, DATA1, PRED, CHMASK */ +constexpr int Lsc_atomic_bti[] = {6, 1, 2, -1, 4, 5, 3, -1, 0, 1, 2, 3, 4, 0, -1}; +constexpr int Lsc_atomic_flat[] = {6, 1, 2, -1, 4, 5, 3, -1, 0, 1, 2, 3, 4, 0, -1}; +constexpr int Lsc_atomic_bindless[] = {6, 1, 2, -1, 4, 5, 3, -1, 0, 1, 2, 3, 4, 0, -1}; +constexpr int Lsc_atomic_slm[] = {6, 1, 2, -1, 4, 5, 3, -1, 0, -1, 1, 2, 3, 0, -1}; + +// intrinsics from kind +static int getLSCIntrinsic(CMBuiltinKind Kind) { + switch (Kind) { + case CMBK_cm_load_impl: + case CMBK_cm_block_load_impl: + return llvm::GenXIntrinsic::genx_lsc_load_bti; + case CMBK_cm_load4_impl: + return llvm::GenXIntrinsic::genx_lsc_load_quad_bti; + case CMBK_cm_load_flat_impl: + case CMBK_cm_block_load_flat_impl: + return llvm::GenXIntrinsic::genx_lsc_load_stateless; + case CMBK_cm_load4_flat_impl: + return llvm::GenXIntrinsic::genx_lsc_load_quad_stateless; + case CMBK_cm_load_bindless_impl: + return llvm::GenXIntrinsic::genx_lsc_load_bindless; + case CMBK_cm_load_slm_impl: + case CMBK_cm_block_load_slm_impl: + return llvm::GenXIntrinsic::genx_lsc_load_slm; + case CMBK_cm_load4_slm_impl: + return llvm::GenXIntrinsic::genx_lsc_load_quad_slm; + case CMBK_cm_store_impl: + case CMBK_cm_block_store_impl: + return llvm::GenXIntrinsic::genx_lsc_store_bti; + case CMBK_cm_store4_impl: + return llvm::GenXIntrinsic::genx_lsc_store_quad_bti; + case CMBK_cm_store_flat_impl: + case CMBK_cm_block_store_flat_impl: + return llvm::GenXIntrinsic::genx_lsc_store_stateless; + case CMBK_cm_store4_flat_impl: + return llvm::GenXIntrinsic::genx_lsc_store_quad_stateless; + case CMBK_cm_store_bindless_impl: + return llvm::GenXIntrinsic::genx_lsc_store_bindless; + case CMBK_cm_store_slm_impl: + case CMBK_cm_block_store_slm_impl: + return llvm::GenXIntrinsic::genx_lsc_store_slm; + case CMBK_cm_store4_slm_impl: + return llvm::GenXIntrinsic::genx_lsc_store_quad_slm; + case CMBK_cm_prefetch_impl: + case CMBK_cm_block_prefetch_impl: + return llvm::GenXIntrinsic::genx_lsc_prefetch_bti; + case CMBK_cm_prefetch_flat_impl: + case CMBK_cm_block_prefetch_flat_impl: + return llvm::GenXIntrinsic::genx_lsc_prefetch_stateless; + case CMBK_cm_prefetch_bindless_impl: + return llvm::GenXIntrinsic::genx_lsc_prefetch_bindless; + case CMBK_cm_atomic_bti_impl: + return llvm::GenXIntrinsic::genx_lsc_xatomic_bti; + case CMBK_cm_atomic_flat_impl: + return llvm::GenXIntrinsic::genx_lsc_xatomic_stateless; + case CMBK_cm_atomic_bindless_impl: + return llvm::GenXIntrinsic::genx_lsc_xatomic_bindless; + case CMBK_cm_atomic_slm_impl: + return llvm::GenXIntrinsic::genx_lsc_xatomic_slm; + default: + 
assert(0 && "Kind not supported"); + } +} + +// config from kind +static const int *getConfig(CMBuiltinKind Kind) { + switch (Kind) { + case CMBK_cm_prefetch_impl: return Lsc_prefetch_bti; + case CMBK_cm_block_prefetch_impl: return Lsc_block_prefetch_bti; + case CMBK_cm_prefetch_flat_impl: return Lsc_prefetch_flat; + case CMBK_cm_block_prefetch_flat_impl: return Lsc_block_prefetch_flat; + case CMBK_cm_prefetch_bindless_impl: return Lsc_prefetch_bindless; + case CMBK_cm_load_impl: return Lsc_load_bti; + case CMBK_cm_load4_impl: return Lsc_load4_bti; + case CMBK_cm_block_load_impl: return Lsc_block_load_bti; + case CMBK_cm_load_flat_impl: return Lsc_load_flat; + case CMBK_cm_load4_flat_impl: return Lsc_load4_flat; + case CMBK_cm_block_load_flat_impl: return Lsc_block_load_flat; + case CMBK_cm_load_bindless_impl: return Lsc_load_bindless; + case CMBK_cm_load_slm_impl: return Lsc_load_slm; + case CMBK_cm_load4_slm_impl: return Lsc_load4_slm; + case CMBK_cm_block_load_slm_impl: return Lsc_block_load_slm; + case CMBK_cm_store_impl: return Lsc_store_bti; + case CMBK_cm_store4_impl: return Lsc_store4_bti; + case CMBK_cm_block_store_impl: return Lsc_block_store_bti; + case CMBK_cm_store_flat_impl: return Lsc_store_flat; + case CMBK_cm_store4_flat_impl: return Lsc_store4_flat; + case CMBK_cm_block_store_flat_impl: return Lsc_block_store_flat; + case CMBK_cm_store_bindless_impl: return Lsc_store_bindless; + case CMBK_cm_store_slm_impl: return Lsc_store_slm; + case CMBK_cm_store4_slm_impl: return Lsc_store4_slm; + case CMBK_cm_block_store_slm_impl: return Lsc_block_store_slm; + case CMBK_cm_atomic_bti_impl: return Lsc_atomic_bti; + case CMBK_cm_atomic_flat_impl: return Lsc_atomic_flat; + case CMBK_cm_atomic_bindless_impl: return Lsc_atomic_bindless; + case CMBK_cm_atomic_slm_impl: return Lsc_atomic_slm; + default: + assert(0 && "Not a valid builtin"); + } +} + +// subop from kind (except atomics) +static LSC_SubOpcode getSubOp(CMBuiltinKind Kind) { + switch (Kind) { + case CMBK_cm_prefetch_impl: + case CMBK_cm_block_prefetch_impl: + case CMBK_cm_prefetch_flat_impl: + case CMBK_cm_block_prefetch_flat_impl: + case CMBK_cm_prefetch_bindless_impl: + case CMBK_cm_load_impl: + case CMBK_cm_block_load_impl: + case CMBK_cm_load_flat_impl: + case CMBK_cm_block_load_flat_impl: + case CMBK_cm_load_bindless_impl: + case CMBK_cm_load_slm_impl: + case CMBK_cm_block_load_slm_impl: + return LSC_SubOpcode::LSC_LOAD; + case CMBK_cm_load4_impl: + case CMBK_cm_load4_flat_impl: + case CMBK_cm_load4_slm_impl: + return LSC_SubOpcode::LSC_LOAD_QUAD; + case CMBK_cm_store_impl: + case CMBK_cm_block_store_impl: + case CMBK_cm_store_flat_impl: + case CMBK_cm_block_store_flat_impl: + case CMBK_cm_store_bindless_impl: + case CMBK_cm_store_slm_impl: + case CMBK_cm_block_store_slm_impl: + return LSC_SubOpcode::LSC_STORE; + case CMBK_cm_store4_impl: + case CMBK_cm_store4_flat_impl: + case CMBK_cm_store4_slm_impl: + return LSC_SubOpcode::LSC_STORE_QUAD; + default: + assert(0 && "Not a valid builtin"); + } +} + +// optype from kind +static LDTYPE getOpType(CMBuiltinKind Kind) { + switch (Kind) { + case CMBK_cm_prefetch_impl: + case CMBK_cm_block_prefetch_impl: + case CMBK_cm_prefetch_flat_impl: + case CMBK_cm_block_prefetch_flat_impl: + case CMBK_cm_prefetch_bindless_impl: + return PREFETCH; + case CMBK_cm_load_impl: + case CMBK_cm_block_load_impl: + case CMBK_cm_load_flat_impl: + case CMBK_cm_block_load_flat_impl: + case CMBK_cm_load_bindless_impl: + case CMBK_cm_load_slm_impl: + case CMBK_cm_block_load_slm_impl: + case 
CMBK_cm_load4_impl: + case CMBK_cm_load4_flat_impl: + case CMBK_cm_load4_slm_impl: + return LOAD; + case CMBK_cm_store_impl: + case CMBK_cm_block_store_impl: + case CMBK_cm_store_flat_impl: + case CMBK_cm_block_store_flat_impl: + case CMBK_cm_store_bindless_impl: + case CMBK_cm_store_slm_impl: + case CMBK_cm_block_store_slm_impl: + case CMBK_cm_store4_impl: + case CMBK_cm_store4_flat_impl: + case CMBK_cm_store4_slm_impl: + return STORE; + case CMBK_cm_atomic_bti_impl: + case CMBK_cm_atomic_flat_impl: + case CMBK_cm_atomic_bindless_impl: + case CMBK_cm_atomic_slm_impl: + return ATOMIC; + default: + assert(0 && "Not a valid builtin"); + } +} + +// surface from kind +static SFTYPE getSFType(CMBuiltinKind Kind) { + switch (Kind) { + case CMBK_cm_prefetch_impl: + case CMBK_cm_block_prefetch_impl: + case CMBK_cm_load_impl: + case CMBK_cm_block_load_impl: + case CMBK_cm_store_impl: + case CMBK_cm_block_store_impl: + case CMBK_cm_atomic_bti_impl: + case CMBK_cm_load4_impl: + case CMBK_cm_store4_impl: + return BTI; + case CMBK_cm_prefetch_flat_impl: + case CMBK_cm_block_prefetch_flat_impl: + case CMBK_cm_load_flat_impl: + case CMBK_cm_block_load_flat_impl: + case CMBK_cm_store_flat_impl: + case CMBK_cm_block_store_flat_impl: + case CMBK_cm_atomic_flat_impl: + case CMBK_cm_load4_flat_impl: + case CMBK_cm_store4_flat_impl: + return FLAT; + case CMBK_cm_prefetch_bindless_impl: + case CMBK_cm_load_bindless_impl: + case CMBK_cm_store_bindless_impl: + case CMBK_cm_atomic_bindless_impl: + return BINDLESS; + case CMBK_cm_load_slm_impl: + case CMBK_cm_block_load_slm_impl: + case CMBK_cm_store_slm_impl: + case CMBK_cm_block_store_slm_impl: + case CMBK_cm_atomic_slm_impl: + case CMBK_cm_store4_slm_impl: + case CMBK_cm_load4_slm_impl: + return SLM; + default: + assert(0 && "Not a valid builtin"); + } +} + +// is block or not +static bool getBlock(CMBuiltinKind Kind) { + switch (Kind) { + case CMBK_cm_prefetch_impl: + case CMBK_cm_prefetch_flat_impl: + case CMBK_cm_prefetch_bindless_impl: + case CMBK_cm_load_impl: + case CMBK_cm_load_flat_impl: + case CMBK_cm_load_bindless_impl: + case CMBK_cm_store_impl: + case CMBK_cm_store_flat_impl: + case CMBK_cm_store_bindless_impl: + case CMBK_cm_load_slm_impl: + case CMBK_cm_store_slm_impl: + case CMBK_cm_atomic_bti_impl: + case CMBK_cm_atomic_flat_impl: + case CMBK_cm_atomic_bindless_impl: + case CMBK_cm_atomic_slm_impl: + case CMBK_cm_load4_impl: + case CMBK_cm_load4_flat_impl: + case CMBK_cm_load4_slm_impl: + case CMBK_cm_store4_impl: + case CMBK_cm_store4_flat_impl: + case CMBK_cm_store4_slm_impl: + return false; + case CMBK_cm_block_prefetch_impl: + case CMBK_cm_block_prefetch_flat_impl: + case CMBK_cm_block_load_impl: + case CMBK_cm_block_load_flat_impl: + case CMBK_cm_block_store_impl: + case CMBK_cm_block_store_flat_impl: + case CMBK_cm_block_load_slm_impl: + case CMBK_cm_block_store_slm_impl: + return true; + default: + assert(0 && "Not a valid builtin"); + } +} + +#endif diff --git a/lib/Driver/ToolChains/Arch/GenX.cpp b/lib/Driver/ToolChains/Arch/GenX.cpp index f499b362900b..d0818f554075 100644 --- a/lib/Driver/ToolChains/Arch/GenX.cpp +++ b/lib/Driver/ToolChains/Arch/GenX.cpp @@ -42,11 +42,42 @@ int GenX::getGenXRevId(const std::string &CPU, // if no option, try to deduce from CPU RevId = llvm::StringSwitch(CPU) + .Case("PVC", 0) + .Case("PVCXT", 5) .Default(0); return RevId; } + +static void reportUnsupportedStepping(const std::string &CPU, + const std::string &Stepping) { + std::string Err = std::string( + (Twine("stepping <") + Stepping + "> is not 
supported for <" + CPU + ">") + .str()); + llvm::report_fatal_error(Err); +} + +static std::string deriveFinalCpuNameFromStepping(const std::string &CPU, + const std::string &Stepping) { + if (Stepping.empty()) { + return CPU; + } + + if (Stepping == "A" && CPU == "PVC") + return "PVC"; + + if (Stepping == "B" && CPU == "PVC") + return "PVCXT"; + + + if (CPU == "DG2") + return CPU; + + reportUnsupportedStepping(CPU, Stepping); + return CPU; +} + static std::string getCanonicalGenXTargetCPU(const std::string &CPU, const ArgList &Args, const Driver *Drv) { @@ -74,8 +105,16 @@ static std::string getCanonicalGenXTargetCPU(const std::string &CPU, .Case("RKL", "RKL") .Case("DG1", "DG1") .Case("XEHP_SDV", "XEHP_SDV") + .Case("DG2", "DG2") + .Case("ADLP", "ADLP") + .Case("ADLS", "ADLS") + .Case("PVC", "PVC") + .Case("PVCXT", "PVCXT") .Default(""); + int RevId = GenX::getGenXRevId(CPU, Args, Drv); + if (CPUName == "PVC" && RevId >= 3) + return "PVCXT"; return CanonicalCPU; } diff --git a/lib/Driver/ToolChains/Arch/GenX.h b/lib/Driver/ToolChains/Arch/GenX.h index 9410c0caea46..f3baf328829d 100644 --- a/lib/Driver/ToolChains/Arch/GenX.h +++ b/lib/Driver/ToolChains/Arch/GenX.h @@ -38,6 +38,10 @@ int getGenXRevId(const std::string &CPU, const llvm::opt::ArgList &Args, const Driver *Drv = nullptr); +// get stepping from args +std::string getGenXTargetStepping(const std::string &CPU, + const llvm::opt::ArgList &Args, + const Driver *Drv = nullptr); // get features from args and triple void getGenXTargetFeatures(const Driver &D, const llvm::Triple &Triple, diff --git a/lib/Driver/ToolChains/GenX.cpp b/lib/Driver/ToolChains/GenX.cpp index 6d2923c79778..561f749e4f6a 100644 --- a/lib/Driver/ToolChains/GenX.cpp +++ b/lib/Driver/ToolChains/GenX.cpp @@ -86,6 +86,10 @@ ArgStringList constructCompatibilityFinalizerOptions(const ArgList &Args, CompatibilityArgs.push_back(LabelName.data()); } } + if (Args.hasArg(options::OPT_Qxcm_doubleGRF)) { + CompatibilityArgs.push_back("-TotalGRFNum"); + CompatibilityArgs.push_back("256"); + } // Add any finalizer options specified using -mCM_jit_option. // Options may be single options or multiple options within quotes. 
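To summarize the new stepping handling (a restatement of the logic in lib/Driver/ToolChains/Arch/GenX.cpp above, not new behavior):

// Stepping -> final CPU name, per deriveFinalCpuNameFromStepping:
//   CPU "PVC", Stepping ""   -> "PVC"     (no stepping requested)
//   CPU "PVC", Stepping "A"  -> "PVC"
//   CPU "PVC", Stepping "B"  -> "PVCXT"
//   CPU "DG2", any stepping  -> "DG2"     (stepping accepted but ignored)
//   anything else            -> report_fatal_error("stepping <...> is not supported for <...>")
// Independently, getCanonicalGenXTargetCPU canonicalizes "PVC" to "PVCXT"
// when the revision id (-Qxcm_revid=N, or the per-CPU default from
// getGenXRevId) is 3 or higher, and -Qxcm_doubleGRF is forwarded to the
// finalizer as "-TotalGRFNum 256".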
diff --git a/lib/Frontend/CompilerInvocation.cpp b/lib/Frontend/CompilerInvocation.cpp index 93c0aff9bae8..141a53233e22 100644 --- a/lib/Frontend/CompilerInvocation.cpp +++ b/lib/Frontend/CompilerInvocation.cpp @@ -791,6 +791,9 @@ static bool ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args, InputKind IK, else { Opts.MaxSLMSize = llvm::StringSwitch(TargetOpts.CPU) .Case("XEHP_SDV", 128) + .Case("DG2", 128) + .Case("PVC", 128) + .Case("PVCXT", 128) .Default(64); } @@ -799,6 +802,9 @@ static bool ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args, InputKind IK, else { Opts.MaxOBRWSize = llvm::StringSwitch(TargetOpts.CPU) .Case("XEHP_SDV", 16) + .Case("DG2", 16) + .Case("PVC", 16) + .Case("PVCXT", 16) .Default(8); } @@ -810,6 +816,11 @@ static bool ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args, InputKind IK, .Case("RKL", false) .Case("DG1", false) .Case("XEHP_SDV", false) + .Case("ADLP", false) + .Case("ADLS", false) + .Case("DG2", false) + .Case("PVC", false) + .Case("PVCXT", false) .Default(true); } @@ -3304,6 +3315,7 @@ static void ParseTargetArgs(TargetOptions &Opts, ArgList &Args, Opts.EABIVersion = EABIVersion; } Opts.CPU = Args.getLastArgValue(OPT_target_cpu); + Opts.Stepping = Args.getLastArgValue(OPT_Qxcm_stepping); Args.getLastArgValue(OPT_Qxcm_revid, "0").getAsInteger(0, Opts.RevId); Opts.FPMath = Args.getLastArgValue(OPT_mfpmath); diff --git a/lib/Frontend/InitPreprocessor.cpp b/lib/Frontend/InitPreprocessor.cpp index 5c81d27f397d..49e3f0c5d3f5 100644 --- a/lib/Frontend/InitPreprocessor.cpp +++ b/lib/Frontend/InitPreprocessor.cpp @@ -638,7 +638,12 @@ static void InitializePredefinedMacros(const TargetInfo &TI, .Case("TGLLP", "CM_GEN12") .Case("RKL", "CM_GEN12") .Case("DG1", "CM_GEN12") + .Case("ADLP", "CM_GEN12") + .Case("ADLS", "CM_GEN12") .Case("XEHP_SDV", "CM_XEHP") + .Case("DG2", "CM_XEHPG") + .Case("PVC", "CM_XEHPC") + .Case("PVCXT", "CM_XEHPC") .Default(""); Builder.defineMacro(CmTarget); const char *GenXValue = llvm::StringSwitch(TOpts.CPU) @@ -655,6 +660,11 @@ static void InitializePredefinedMacros(const TargetInfo &TI, .Case("RKL", "1201") .Case("DG1", "1210") .Case("XEHP_SDV", "1270") + .Case("ADLP", "1220") + .Case("ADLS", "1230") + .Case("DG2", "1271") + .Case("PVC", "1280") + .Case("PVCXT", "1280") .Default(""); Builder.defineMacro("CM_GENX", GenXValue); } diff --git a/lib/Headers/cm/include/CMakeLists.txt b/lib/Headers/cm/include/CMakeLists.txt index 85dee8fff47a..3db4f37f65fa 100755 --- a/lib/Headers/cm/include/CMakeLists.txt +++ b/lib/Headers/cm/include/CMakeLists.txt @@ -12,18 +12,21 @@ set(CM_HEADERS cm/cm_has_instr.h cm/cm_bfn.h cm/cm_common.h + cm/cm_cvt.h cm/cm_dataport.h cm/cm_gateway.h cm/cm.h cm/cm_internal.h cm/cm_library.h cm/cm_linear.h + cm/cm_lsc.h cm/cm_mask.h cm/cm_printf.h cm/cm_printfocl.h cm/cm_sampler.h cm/cm_send.h cm/cm_spec_constant.h + cm/cm_srnd.h cm/cm_svm.h cm/cm_target.h cm/cmtl/global.h diff --git a/lib/Headers/cm/include/cm/cm.h b/lib/Headers/cm/include/cm/cm.h index a5ba5dcc09d3..0b991bd919b0 100644 --- a/lib/Headers/cm/include/cm/cm.h +++ b/lib/Headers/cm/include/cm/cm.h @@ -17,6 +17,9 @@ SPDX-License-Identifier: MIT #include "cm_dataport.h" #include "cm_has_instr.h" #include "cm_bfn.h" +#include "cm_cvt.h" +#include "cm_lsc.h" +#include "cm_srnd.h" #include "cm_internal.h" #include "cm_sampler.h" #include "cm_traits.h" @@ -442,6 +445,8 @@ CM_NODEBUG CM_INLINE typename std::enable_if< vector >::type cm_rol(vector src0, vector src1) { CM_HAS_BIT_ROTATE_CONTROL; + if constexpr (sizeof(T0) == sizeof(long long) || sizeof(T1) 
== sizeof(long long)) + CM_HAS_BIT_ROTATE_64BIT_CONTROL; return details::__cm_intrinsic_impl_rol(src0, src1); } @@ -456,6 +461,8 @@ cm_rol(vector src0, U src1) { typename details::vector_type::type _Src0 = src0; typename details::vector_type::type _Src1 = src1; CM_HAS_BIT_ROTATE_CONTROL; + if constexpr (sizeof(T0) == sizeof(long long) || sizeof(T1) == sizeof(long long)) + CM_HAS_BIT_ROTATE_64BIT_CONTROL; return details::__cm_intrinsic_impl_rol(_Src0, _Src1); } @@ -490,6 +497,8 @@ CM_NODEBUG CM_INLINE typename std::enable_if< vector >::type cm_ror(vector src0, vector src1) { CM_HAS_BIT_ROTATE_CONTROL; + if constexpr (sizeof(T0) == sizeof(long long) || sizeof(T1) == sizeof(long long)) + CM_HAS_BIT_ROTATE_64BIT_CONTROL; return details::__cm_intrinsic_impl_ror(src0, src1); } @@ -504,6 +513,8 @@ cm_ror(vector src0, U src1) { typename details::vector_type::type _Src0 = src0; typename details::vector_type::type _Src1 = src1; CM_HAS_BIT_ROTATE_CONTROL; + if constexpr (sizeof(T0) == sizeof(long long) || sizeof(T1) == sizeof(long long)) + CM_HAS_BIT_ROTATE_64BIT_CONTROL; return details::__cm_intrinsic_impl_ror(_Src0, _Src1); } @@ -2134,11 +2145,18 @@ CM_NODEBUG CM_INLINE void cm_dpas_check_common() { CM_STATIC_ERROR(details::is_dword_type::value, "Src2 must be DWORD type"); - CM_STATIC_ERROR((systolic_depth == 8), "systolic_depth must be 8"); + CM_STATIC_ERROR((systolic_depth == 8) || (systolic_depth == 4), + "systolic_depth must be 8 or 4"); CM_STATIC_ERROR((repeat_count >= 1) && (repeat_count <= 8), "repeat_count must be within 1 to 8"); +#if !defined(CM_GENX) + CM_STATIC_WARNING(0, "GEN not specified so cm_dpas() code may not be optimal"); constexpr int DPAS_EXECUTION_SIZE = 8; +#else // !defined(CM_GENX) + constexpr int DPAS_EXECUTION_SIZE = (CM_GENX >= 1280) ? 
16 : 8; +#endif // !defined(CM_GENX) + CM_STATIC_ERROR((N == DPAS_EXECUTION_SIZE * repeat_count), "Unsupported execution size in dpas"); @@ -2207,7 +2225,26 @@ CM_NODEBUG CM_INLINE void cm_dpas_check_types() { details::is_one_of_enum_v; - constexpr bool is_right = (check_int || check_hf || check_bf16); + ////////////////////////////////////////////////////////////////////////////// + // TF32 ////////////////////////////////////////////////////////////////////// +#ifdef CM_HAS_TF32 + // f | f | tf32 | tf32 + constexpr bool check_tf32 = + details::is_one_of_v && details::is_one_of_v && + details::is_one_of_enum_v && + details::is_one_of_enum_v; +#else // CM_HAS_TF32 + constexpr bool check_tf32 = false; +#endif // CM_HAS_TF32 + if constexpr (check_hf && (std::is_same::value || std::is_same::value)) + CM_HAS_DPAS_ACC_HALF_CONTROL; + if constexpr (check_bf16 && (std::is_same::value || std::is_same::value)) + CM_HAS_DPAS_ACC_BF16_CONTROL; + + constexpr bool is_right = (check_int || check_hf || check_bf16 || check_tf32); + if (!is_right) { CM_STATIC_WARNING(is_right, "types: dst | src0 | src1(from template " @@ -2215,6 +2252,9 @@ CM_NODEBUG CM_INLINE void cm_dpas_check_types() { CM_STATIC_WARNING(is_right, "ud, d | ud, d | ub, b, u4, s4, u2, s2 | ub, b, u4, s4, u2, s2"); CM_STATIC_WARNING(is_right, "f, bf | f, bf | bf | bf"); CM_STATIC_WARNING(is_right, "f, hf | f, hf | hf | hf"); +#if defined(CM_HAS_TF32) + CM_STATIC_WARNING(is_right, "f | f | tf32 | tf32"); +#endif // defined(CM_HAS_TF32) CM_STATIC_ERROR(is_right, "unsupported dpas type"); } @@ -2317,7 +2357,8 @@ cm_dpasw(vector src0, CM_STATIC_ERROR((N == 8 * repeat_count), "Execution size must be 8"); - CM_STATIC_ERROR((systolic_depth == 8), "systolic_depth must be 8"); + CM_STATIC_ERROR((systolic_depth == 8) || (systolic_depth == 4), + "systolic_depth must be 8 or 4"); CM_STATIC_ERROR((repeat_count >= 1) && (repeat_count <= 8), "repeat_count must be within 1 to 8"); @@ -2371,7 +2412,8 @@ template = 1) && (repeat_count <= 8), "repeat_count must be within 1 to 8"); diff --git a/lib/Headers/cm/include/cm/cm_atomic.h b/lib/Headers/cm/include/cm/cm_atomic.h index bd1936ccc9ed..0670bac345bb 100644 --- a/lib/Headers/cm/include/cm/cm_atomic.h +++ b/lib/Headers/cm/include/cm/cm_atomic.h @@ -60,6 +60,8 @@ _ATOMIC_CHECK_TYPES(ATOMIC_MAXSINT, uint, int); _ATOMIC_CHECK(ATOMIC_FMAX, float); _ATOMIC_CHECK(ATOMIC_FMIN, float); _ATOMIC_CHECK(ATOMIC_FCMPWR, float); +_ATOMIC_CHECK(ATOMIC_FADD, float); +_ATOMIC_CHECK(ATOMIC_FSUB, float); _ATOMIC_CHECK_TYPES(ATOMIC_PREDEC, uint, int); #undef _ATOMIC_CHECK @@ -228,6 +230,12 @@ write(SurfaceIndex index, CmAtomicOpType op, uint globalOffset, case ATOMIC_FCMPWR: _ATOMIC_WRITE(ATOMIC_FCMPWR, 8, float); break; + case ATOMIC_FADD: + _ATOMIC_WRITE(ATOMIC_FADD, 8, float); + break; + case ATOMIC_FSUB: + _ATOMIC_WRITE(ATOMIC_FSUB, 8, float); + break; } #undef _ATOMIC_WRITE diff --git a/lib/Headers/cm/include/cm/cm_common.h b/lib/Headers/cm/include/cm/cm_common.h index 74227ee13082..a089611fea7e 100644 --- a/lib/Headers/cm/include/cm/cm_common.h +++ b/lib/Headers/cm/include/cm/cm_common.h @@ -343,6 +343,8 @@ enum class CmAtomicOpType { _ATOMIC_FMAX = 0x10, _ATOMIC_FMIN = 0x11, _ATOMIC_FCMPWR = 0x12, + _ATOMIC_FADD = 0x13, + _ATOMIC_FSUB = 0x14, _ATOMIC_PREDEC = 0xff }; @@ -362,6 +364,8 @@ enum class CmAtomicOpType { #define ATOMIC_FMAX CmAtomicOpType::_ATOMIC_FMAX #define ATOMIC_FMIN CmAtomicOpType::_ATOMIC_FMIN #define ATOMIC_FCMPWR CmAtomicOpType::_ATOMIC_FCMPWR +#define ATOMIC_FADD CmAtomicOpType::_ATOMIC_FADD +#define 
ATOMIC_FSUB CmAtomicOpType::_ATOMIC_FSUB #define ATOMIC_PREDEC CmAtomicOpType::_ATOMIC_PREDEC enum class CM3DSampleOp : int { @@ -472,6 +476,7 @@ enum class CmPrecisionType { CM_Precision_S8 = 7, // signed 8 bits CM_Precision_BF16 = 8, // bfloat 16 CM_Precision_FP16 = 9, // half float + CM_Precision_TF32 = 11, // tensorfloat 32 }; #define CM_PRECISION_U1 CmPrecisionType::CM_Precision_U1 @@ -484,6 +489,7 @@ enum class CmPrecisionType { #define CM_PRECISION_S8 CmPrecisionType::CM_Precision_S8 #define CM_PRECISION_BF CmPrecisionType::CM_Precision_BF16 #define CM_PRECISION_HF CmPrecisionType::CM_Precision_FP16 +#define CM_PRECISION_TF32 CmPrecisionType::CM_Precision_TF32 constexpr unsigned get_ops_per_channel(CmPrecisionType src1_precision, CmPrecisionType src2_precision) { if ((src1_precision == CM_PRECISION_U8) || @@ -520,6 +526,10 @@ constexpr unsigned get_ops_per_channel(CmPrecisionType src1_precision, (src2_precision == CM_PRECISION_HF)) { return 2; } + else if ((src1_precision == CM_PRECISION_TF32) && + (src2_precision == CM_PRECISION_TF32)) { + return 1; + } return 0xFFFFFFFF; } @@ -543,6 +553,9 @@ constexpr unsigned get_precision_bits(CmPrecisionType src_precision) { ) { return 16; } + if (src_precision == CM_PRECISION_TF32) { + return 32; + } return 0; } @@ -597,5 +610,89 @@ constexpr unsigned get_precision_bits(CmPrecisionType src_precision) { // cm_label() is deprecated #define cm_label(...) CM_STATIC_WARNING(0, "cm_label() is deprecated") +// L1 or L3 cache hint kinds. +enum class CacheHint : uint8_t { + Default = 0, + Uncached = 1, + Cached = 2, + WriteBack = 3, + WriteThrough = 4, + Streaming = 5, + ReadInvalidate = 6 +}; + +// Data size or format to read or store. +enum class DataSize : uint8_t { + Default = 0, + U8 = 1, + U16 = 2, + U32 = 3, + U64 = 4, + U8U32 = 5, // load 8b, zero extend to 32b; store the opposite + U16U32 = 6, // load 16b, zero extend to 32b; store the opposite + U16U32H = 7 // load 16b into high 16 of each 32b; store the high 16 +}; + +// The number of elements to load per address (vector size) +enum class VectorSize : uint8_t { + N0 = 0, + N1 = 1, // 1 element + N2 = 2, // 2 element + N3 = 3, // 3 element + N4 = 4, // 4 element + N8 = 5, // 8 element + N16 = 6, // 16 element + N32 = 7, // 32 element + N64 = 8 // 64 element +}; + +// LSC atomic op kind and encoding. 
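For illustration, a minimal sketch of the float atomics enabled above on the legacy dataport path. The write() overload and its argument order are assumptions based on the _ATOMIC_WRITE(..., 8, float) expansions; the surface and offsets are placeholders.

    #include <cm/cm.h>

    CM_INLINE void fadd_demo(SurfaceIndex surf, vector<float, 8> addend) {
      cm_vector(off, uint, 8, 0, 1);  // hypothetical element offsets 0..7
      vector<float, 8> oldVals;       // receives the pre-atomic values
      write(surf, ATOMIC_FADD, 0 /*globalOffset*/, off, addend, oldVals);
    }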
+enum class AtomicOp : uint8_t { + IINC = 0x08, + IDEC = 0x09, + LOAD = 0x0A, + STORE = 0x0B, + IADD = 0x0C, + ISUB = 0x0D, + SMIN = 0x0E, + SMAX = 0x0F, + UMIN = 0x10, + UMAX = 0x11, + ICAS = 0x12, + FADD = 0x13, + FSUB = 0x14, + FMIN = 0x15, + FMAX = 0x16, + FCAS = 0x17, + AND = 0x18, + OR = 0x19, + XOR = 0x1A +}; + +enum class LSC_SCOPE : uint8_t { + LSC_SCOPE_GROUP, + LSC_SCOPE_LOCAL, + LSC_SCOPE_TILE, + LSC_SCOPE_GPU, + LSC_SCOPE_GPUS, + LSC_SCOPE_SYSTEM, + LSC_SCOPE_SYSACQ +}; + +enum class LSC_FENCE_OP : uint8_t { + LSC_FENCE_OP_NONE, + LSC_FENCE_OP_EVICT, + LSC_FENCE_OP_INVALIDATE, + LSC_FENCE_OP_DISCARD, + LSC_FENCE_OP_CLEAN, + LSC_FENCE_OP_FLUSHL3 +}; + +enum class LSC_SFID : uint8_t { + LSC_UGM, + LSC_UGML, + LSC_TGM, + LSC_SLM +}; #endif /* _CLANG_CM_COMMON_H_ */ diff --git a/lib/Headers/cm/include/cm/cm_cvt.h b/lib/Headers/cm/include/cm/cm_cvt.h new file mode 100644 index 000000000000..c1f50789d105 --- /dev/null +++ b/lib/Headers/cm/include/cm/cm_cvt.h @@ -0,0 +1,57 @@ +/*========================== begin_copyright_notice ============================ + +Copyright (C) 2020-2021 Intel Corporation + +SPDX-License-Identifier: MIT + +============================= end_copyright_notice ===========================*/ + +#if (__INCLUDE_LEVEL__ == 1) +static_assert(0, "CM:w:cm_cvt.h should not be included explicitly - only " + " is required"); +#endif + +#ifndef _CLANG_CM_CVT_H_ +#define _CLANG_CM_CVT_H_ + +#include "cm_common.h" +#include "cm_internal.h" +#include "cm_traits.h" +#include "cm_has_instr.h" + + +// float32 to tf32 convertion one direction +template +CM_NODEBUG CM_INLINE vector cm_tf32_cvt(vector src0) { + CM_STATIC_ERROR( + (std::is_same::type>::value && + std::is_same::type>::value), + "Invalid type for cm_tf32_fp32_cvt: src->dst must be float->int"); + CM_HAS_TF32_CONTROL; + + return details::__cm_intrinsic_impl_tf32_cvt(src0); +} + +template +CM_NODEBUG CM_INLINE vector +cm_tf32_cvt(matrix src) { + CM_HAS_TF32_CONTROL; + + vector _Src = src; + return cm_tf32_cvt(_Src); +} + +template +CM_NODEBUG CM_INLINE + typename std::enable_if::value && + details::is_cm_scalar::value, + typename std::remove_const::type>::type + cm_tf32_cvt(T0 src) { + CM_HAS_TF32_CONTROL; + + vector _Src = src; + vector _Result = cm_tf32_cvt(_Src); + return _Result(0); +} + +#endif // _CLANG_CM_CVT_H_ diff --git a/lib/Headers/cm/include/cm/cm_dataport.h b/lib/Headers/cm/include/cm/cm_dataport.h index 63b6d38db158..e7155a92555b 100644 --- a/lib/Headers/cm/include/cm/cm_dataport.h +++ b/lib/Headers/cm/include/cm/cm_dataport.h @@ -127,6 +127,135 @@ CM_NODEBUG CM_INLINE void write(SurfaceIndex index, int offset, details::__cm_intrinsic_impl_oword_write(index, offset, src); } +/// \brief HWord block read. +/// @param idx surface index, which must correspond to a buffer. +/// +/// @param offset zero based offset of the input buffer in bytes. Must be HWord aligned. +/// +/// @param dst the data to be read. The size of vector can be only 1, 2, 4 or +/// 8 HWords. 
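A minimal usage sketch for the cm_tf32_cvt entry points defined above. The explicit <int> destination type is an assumption implied by the float-to-int static_assert; the result lanes carry TF32 bit patterns and require PVC-XT or newer (CM_HAS_TF32_CONTROL).

    #include <cm/cm.h>

    CM_INLINE vector<int, 16> to_tf32(vector<float, 16> src) {
      return cm_tf32_cvt<int>(src);  // one-way f32 -> tf32 conversion
    }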
+template
+CM_NODEBUG CM_INLINE void cm_hword_read(SurfaceIndex index, uint offset,
+                                        vector_ref dst) {
+  vector header = 0;
+  const uint exDesc = 0xa;
+  uint desc = cm_get_value(index);
+
+  constexpr unsigned Sz = sizeof(uint) * N;
+  CM_STATIC_ERROR(details::isPowerOf2(N) && Sz >= details::GRF &&
+                  Sz <= 8 * details::GRF,
+                  "Data size must be 1/2/4/8 HWords");
+
+  switch (N)
+  {
+  case 8:
+    desc |= (0x0 << 8); // MESSAGE_SPECIFIC_CONTROL
+    break;
+  case 16:
+    desc |= (0x1 << 8);
+    break;
+  case 32:
+    desc |= (0x2 << 8);
+    break;
+  case 64:
+    desc |= (0x3 << 8);
+    break;
+  }
+
+  desc += 0x1 << 13;
+  desc += 0x1 << 14;
+  desc += 0x1 << 19;
+  desc += (N / 8) << 20; // Response length
+  desc += 0x1 << 25;     // Msg length
+  header(2) = offset;
+
+  cm_send(dst.format(), header.format(),
+          exDesc, desc, 0u);
+}
+
+/// \brief HWord block write.
+/// @param idx surface index, which must correspond to a buffer.
+///
+/// @param offset zero based offset of the input buffer in bytes. Must be HWord aligned.
+///
+/// @param src the data to be written. The size of the vector can only be 1, 2, 4 or
+/// 8 HWords.
+template
+CM_NODEBUG CM_INLINE void cm_hword_write(SurfaceIndex index, int offset,
+                                         vector src) {
+  vector header = 0;
+  const uint exDesc = ((N / 8) << 6) + 0xa;
+  uint desc = cm_get_value(index);
+
+  constexpr unsigned Sz = sizeof(uint) * N;
+  CM_STATIC_ERROR(details::isPowerOf2(Sz) && Sz >= details::GRF &&
+                  Sz <= 8 * details::GRF,
+                  "Data size must be 1/2/4/8 HWords");
+
+  switch (N)
+  {
+  case 8:
+    desc |= (0x0 << 8); // MESSAGE_SPECIFIC_CONTROL
+    break;
+  case 16:
+    desc |= (0x1 << 8);
+    break;
+  case 32:
+    desc |= (0x2 << 8);
+    break;
+  case 64:
+    desc |= (0x3 << 8);
+    break;
+  }
+
+  desc += 0x1 << 13;
+  desc += 0x9 << 14;
+  desc += 0x1 << 19;
+  desc += 0x1 << 25; // Msg length
+  header(2) = offset;
+
+  cm_sends(NULL, header.format(), src.format(),
+           exDesc, desc, 0u);
+}
+
+/// \brief QWord scattered read.
+/// @param idx surface index, which must correspond to a buffer.
+///
+/// @param offsets zero based offset for each element to be read.
+///
+/// @param data the data to be read. The vector size can only be 8 or 16 QWords.
+template
+CM_NODEBUG CM_INLINE
+typename std::enable_if<(sizeof(T) == 8) && ((C == 8) || (C == 16)),
+                        void>::type
+cm_qword_scatter_read(SurfaceIndex idx, vector offsets, vector_ref data)
+{
+  uint desc = cm_get_value(idx);
+  desc += (5 & 0x1F) << 14;
+  desc += (C / 8 - 1) << 8;
+  desc += (C / 4) << 20;
+  desc += (C / 8) << 25;
+  cm_send(data, offsets, 0xA, desc, 0u);
+}
+
+/// \brief QWord scattered write.
+/// @param idx surface index, which must correspond to a buffer.
+///
+/// @param offsets zero based offset for each element to be written.
+///
+/// @param data the data to be written. The vector size can only be 8 or 16 QWords.
+template
+CM_NODEBUG CM_INLINE
+typename std::enable_if<(sizeof(T) == 8) && ((C == 8) || (C == 16)),
+                        void>::type
+cm_qword_scatter_write(SurfaceIndex idx, vector offsets, vector_ref data)
+{
+  uint desc = cm_get_value(idx);
+  desc += (13 & 0x1F) << 14;
+  desc += (C / 8 - 1) << 8;
+  desc += (C / 8) << 25;
+  cm_sends(NULL, offsets, data, (0xA | ((C / 4) << 6)), desc, 0u);
+}

 /// \brief Media block read.
 ///
diff --git a/lib/Headers/cm/include/cm/cm_gateway.h b/lib/Headers/cm/include/cm/cm_gateway.h index 8721bc13af3c..1c006735cd34 100644 --- a/lib/Headers/cm/include/cm/cm_gateway.h +++ b/lib/Headers/cm/include/cm/cm_gateway.h @@ -145,6 +145,22 @@ CM_INLINE void monitor_no_event(void) {
/// timer_value passed in as the timeout.
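A sketch of the HWord/QWord dataport helpers defined above; the surface, offsets, and sizes are placeholders chosen to satisfy the static checks (2 HWords for the block accesses, 8 lanes for the QWord scatter).

    #include <cm/cm.h>

    CM_INLINE void hword_qword_demo(SurfaceIndex buf) {
      vector<uint, 16> data;            // 16 dwords = 2 HWords
      cm_hword_read(buf, 0, data);      // offset must be HWord aligned
      cm_hword_write(buf, 64, data);

      cm_vector(qoff, uint, 8, 0, 8);   // one byte offset per qword lane
      vector<unsigned long long, 8> qdata = 0;
      cm_qword_scatter_write(buf, qoff, qdata);
    }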
/// Only the bottom 10 bits are valid.
/// Only one event may be monitored/waited on at a time
///
+#define CM_WAIT_EVENT (defined(CM_XEHPG) || defined(CM_XEHPC))
+
+#if CM_WAIT_EVENT
+CM_INLINE void wait_event(unsigned short timer_value) {
+  matrix payload = 0;
+  unsigned msgLength = 1;
+  unsigned rspLength = 0;
+  uint msgDesc = (SUBID_GW_WAIT_EVENT & 0x7ffff) + (1 << 19) +
+                 ((rspLength & 0x1f) << 20) + ((msgLength & 0xf) << 25);
+
+  payload.format()[0] = timer_value;
+
+  cm_send(NULL, payload, SFID_GATEWAY, msgDesc, 0u /* sendc */);
+  cm_sbarrier(0);
+}
+#else
 CM_INLINE unsigned int wait_event(unsigned short timer_value) {
   matrix payload = 0;
   matrix response = 0;
@@ -162,6 +178,7 @@ CM_INLINE unsigned int wait_event(unsigned short timer_value) {
   return lock;
 }
+#endif

 /// \brief Wrapper function for cm_wait builtin
 ///
@@ -252,5 +269,36 @@ CM_INLINE void cm_signal() {
 }
 #endif

+template
+CM_NODEBUG CM_INLINE
+typename std::enable_if::value,
+                        void>::type
+cm_nbarrier_signal(
+    T barrierId,
+    T producerConsumerMode,
+    T numProducers,
+    T numConsumers)
+{
+  constexpr unsigned gateway = 3;
+  constexpr unsigned barrier = 4;
+
+  constexpr unsigned descriptor =
+      1 << 25 |  // Message length: 1 register
+      0 << 12 |  // Fence Data Ports: No fence
+      barrier;   // Barrier subfunction
+
+  vector payload = 0;
+#ifndef CMRT_EMU
+  payload = cm_get_r0();
+#endif // CMRT_EMU
+
+  payload(2) =
+      (numConsumers & 0xff) << 24 |
+      (numProducers & 0xff) << 16 |
+      producerConsumerMode << 14 |
+      (barrierId & 0b11111) << 0;
+
+  cm_send(NULL, payload, gateway, descriptor, /* sendc = */ 0);
+}
 #endif /* _CLANG_CM_GATEWAY_H */
diff --git a/lib/Headers/cm/include/cm/cm_has_instr.h b/lib/Headers/cm/include/cm/cm_has_instr.h index 36c8e6059f60..34631be8a22f 100644 --- a/lib/Headers/cm/include/cm/cm_has_instr.h +++ b/lib/Headers/cm/include/cm/cm_has_instr.h @@ -52,6 +52,13 @@ namespace CheckVersion {
 #define CM_HAS_BIT_ROTATE_CONTROL CM_HAS_CONTROL(false)
 #endif

+/// CM_HAS_<FEATURE>_CONTROL macros
+/// -------------------------------
+/// Emit a static_assert if the feature isn't supported on this platform;
+/// otherwise, do nothing.
+///
+/// The CM_GENX value for each platform is set in Frontend/InitPreprocessor.cpp.
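A hypothetical producer/consumer handshake built on cm_nbarrier_signal above; the companion cm_nbarrier_init/cm_nbarrier_wait builtins are assumed from elsewhere in this patch, and mode 0 standing for produce-and-consume is an assumption.

    #include <cm/cm.h>

    _GENX_MAIN_ void nbarrier_demo() {
      cm_nbarrier_init(2);             // named barriers used by this kernel
      cm_nbarrier_signal(1, 0, 4, 4);  // id=1, mode, producers, consumers
      cm_nbarrier_wait(1);             // return once barrier 1 is satisfied
    }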
+//===----------------------------------------------------------------------===// //BFN #if (CM_GENX >= 1270) //>= XEHP_SDV @@ -61,6 +68,18 @@ namespace CheckVersion { #define CM_HAS_BFN_CONTROL CM_HAS_CONTROL(false) #endif + +//ACC_BF16 and ACC_HALF +#if (CM_GENX >= 1280) //>= PVC + #define CM_HAS_DPAS_ACC_HALF 1 + #define CM_HAS_DPAS_ACC_BF16 1 + #define CM_HAS_DPAS_ACC_HALF_CONTROL CM_HAS_CONTROL(true) + #define CM_HAS_DPAS_ACC_BF16_CONTROL CM_HAS_CONTROL(true) +#else + #define CM_HAS_DPAS_ACC_HALF_CONTROL CM_HAS_CONTROL(false) + #define CM_HAS_DPAS_ACC_BF16_CONTROL CM_HAS_CONTROL(false) +#endif + //BF16 #if (CM_GENX >= 1200) //>= TGLLP #define CM_HAS_BF16 1 @@ -77,8 +96,9 @@ namespace CheckVersion { #define CM_HAS_DPAS_CONTROL CM_HAS_CONTROL(false) #endif + //DPAS_ODD -#if (CM_GENX >= 1270) //>= XEHP_SDV +#if (CM_GENX >= 1270 && CM_GENX <= 1280) //>= XEHP_SDV && <= PVC #define CM_HAS_DPAS_ODD 1 #define CM_HAS_DPAS_ODD_CONTROL CM_HAS_CONTROL(true) #else @@ -98,13 +118,54 @@ namespace CheckVersion { CM_GENX == 900 || /*SKL*/ \ CM_GENX == 950 || /*KBL*/ \ CM_GENX == 1150 || /*ICLLP*/ \ - CM_GENX == 1270 /*XeHP_SDV*/ ) + CM_GENX == 1270 || /*XeHP_SDV*/ \ + CM_GENX == 1280 /*PVC*/ ) #define CM_HAS_IEEE_DIV_SQRT 1 #define CM_HAS_IEEE_DIV_SQRT_CONTROL CM_HAS_CONTROL(true) #else //IEEE #define CM_HAS_IEEE_DIV_SQRT_CONTROL CM_HAS_CONTROL(false) #endif //IEEE +//LSC +#if (CM_GENX >= 1271) //>= DG2 + #define CM_HAS_LSC 1 + #define CM_HAS_LSC_CONTROL CM_HAS_CONTROL(true) +#else + #define CM_HAS_LSC_CONTROL CM_HAS_CONTROL(false) +#endif + + +//LSC_UNTYPED_2D +#if (CM_GENX >= 1280) //>= PVC + #define CM_HAS_LSC_UNTYPED_2D 1 + #define CM_HAS_LSC_UNTYPED_2D_CONTROL CM_HAS_CONTROL(true) +#else + #define CM_HAS_LSC_UNTYPED_2D_CONTROL CM_HAS_CONTROL(false) +#endif + + +//TF32 +#if ((CM_GENX == 1280 && CM_GENX_REVID >= 5) || CM_GENX > 1280) //>= PVCXT + #define CM_HAS_TF32 1 + #define CM_HAS_TF32_CONTROL CM_HAS_CONTROL(true) +#else + #define CM_HAS_TF32_CONTROL CM_HAS_CONTROL(false) +#endif + +//BitRotate64 +#if (CM_GENX >= 1280) //>= PVC + #define CM_HAS_BIT_ROTATE_64BIT 1 + #define CM_HAS_BIT_ROTATE_64BIT_CONTROL CM_HAS_CONTROL(true) +#else + #define CM_HAS_BIT_ROTATE_64BIT_CONTROL CM_HAS_CONTROL(false) +#endif + +#if (CM_GENX >= 1280) // >= PVC + #define CM_HAS_STOCHASTIC_ROUNDING 1 + #define CM_HAS_STOCHASTIC_ROUNDING_CONTROL CM_HAS_CONTROL(true) +#else + #define CM_HAS_STOCHASTIC_ROUNDING_CONTROL CM_HAS_CONTROL(false) +#endif #else // CM_HAS_CONTROL CM_STATIC_ERROR(0, "Redeclaration of CM_HAS_CONTROL! 
It's used for control version of features!"); diff --git a/lib/Headers/cm/include/cm/cm_internal.h b/lib/Headers/cm/include/cm/cm_internal.h index c71c7a0a99cb..1fdb112303d2 100644 --- a/lib/Headers/cm/include/cm/cm_internal.h +++ b/lib/Headers/cm/include/cm/cm_internal.h @@ -416,10 +416,223 @@ vector __cm_intrinsic_impl_dpasw_nosrc0(int dummy, vector src2); +template +vector __cm_intrinsic_impl_srnd(vector src1, vector src2); + +// ---------------------------------------------------------------------------- +// +// Conversions to special types (BF, etc) +// +// ---------------------------------------------------------------------------- template vector __cm_intrinsic_impl_bf_cvt(vector src0); +template +vector __cm_intrinsic_impl_tf32_cvt(vector src0); + +// ---------------------------------------------------------------------------- +// +// LSC intrinsics +// +// ---------------------------------------------------------------------------- + +template +void __cm_intrinsic_impl_prefetch_bti(SurfaceIndex Idx, vector Offset, + vector Pred); + +template +void __cm_intrinsic_impl_prefetch_flat(uint64_t Addr, vector Offset, + vector Pred); + +template +void __cm_intrinsic_impl_prefetch_bindless(uint64_t Addr, vector Offset, + vector Pred); + +template +void __cm_intrinsic_impl_block_prefetch_bti(SurfaceIndex Idx, unsigned Offset); + +template +void __cm_intrinsic_impl_block_prefetch_flat(uint64_t Addr, unsigned Offset); + +template +RetTy __cm_intrinsic_impl_load_bti(SurfaceIndex Idx, vector Offset, + vector Pred); + +template +RetTy __cm_intrinsic_impl_load_flat(uint64_t Addr, vector Offset, + vector Pred); + +template +RetTy __cm_intrinsic_impl_load_bindless(uint64_t Addr, vector Offset, + vector Pred); + +template +RetTy __cm_intrinsic_impl_load4_bti(SurfaceIndex Idx, vector Offset, + vector Pred, + ChannelMaskType mask); + +template +RetTy __cm_intrinsic_impl_load4_flat(uint64_t Addr, vector Offset, + vector Pred, + ChannelMaskType mask); + +template +RetTy __cm_intrinsic_impl_load4_bindless(uint64_t Addr, vector Offset, + vector Pred, + ChannelMaskType mask); + +template +RetTy __cm_intrinsic_impl_block_load_bti(SurfaceIndex Idx, unsigned Offset); + +template +RetTy __cm_intrinsic_impl_block_load_flat(uint64_t Addr, unsigned Offset); + +template +void __cm_intrinsic_impl_store_bti( + SurfaceIndex Idx, vector Offset, + vector()> Data, vector Pred); + +template +void __cm_intrinsic_impl_store_flat( + uint64_t Addr, vector Offset, + vector()> Data, vector Pred); + +template +void __cm_intrinsic_impl_store_bindless( + uint64_t Addr, vector Offset, + vector()> Data, vector Pred); + +template +void __cm_intrinsic_impl_store4_bti( + SurfaceIndex Idx, vector Offset, + vector()> Data, vector Pred, + ChannelMaskType mask); + +template +void __cm_intrinsic_impl_store4_flat( + uint64_t Addr, vector Offset, + vector()> Data, vector Pred, + ChannelMaskType mask); + +template +void __cm_intrinsic_impl_store4_bindless( + uint64_t Addr, vector Offset, + vector()> Data, vector Pred, + ChannelMaskType mask); + +template +void __cm_intrinsic_impl_block_store_bti( + SurfaceIndex Idx, unsigned Offset, + vector()> Data); + +template +void __cm_intrinsic_impl_block_store_flat( + uint64_t Addr, unsigned Offset, + vector()> Data); + +template +RetTy __cm_intrinsic_impl_load_slm(vector Offset, + vector Pred); + +template +RetTy __cm_intrinsic_impl_load4_slm(vector Offset, + vector Pred, + ChannelMaskType mask); + +template +RetTy __cm_intrinsic_impl_block_load_slm(unsigned Offset); + +template +void 
__cm_intrinsic_impl_store_slm( + vector Offset, + vector()> Data, + vector Pred); + +template +void __cm_intrinsic_impl_store4_slm( + vector Offset, + vector()> Data, vector Pred, + ChannelMaskType mask); + +template +void __cm_intrinsic_impl_block_store_slm( + unsigned Offset, vector()> Data); + +template +vector __cm_intrinsic_impl_block_load2d_flat(uintptr_t BaseAddr, + unsigned SurfaceWidth, unsigned SurfaceHeight, unsigned SurfacePitch, + int X, int Y); + +template +void __cm_intrinsic_impl_block_store2d_flat(uintptr_t BaseAddr, + unsigned SurfaceWidth, unsigned SurfaceHeight, unsigned SurfacePitch, + int X, int Y, vector Data); + +template +void __cm_intrinsic_impl_block_prefetch2d_flat(uintptr_t BaseAddr, + unsigned SurfaceWidth, unsigned SurfaceHeight, unsigned SurfacePitch, + int X, int Y); + + +template +RetTy __cm_intrinsic_impl_lsc_atomic_bti(vector Pred, + SurfaceIndex Idx, + vector Offset, + Args... args); + +template +RetTy __cm_intrinsic_impl_lsc_atomic_flat(vector Pred, + uint64_t Addr, + vector Offset, + Args... args); + +template +RetTy __cm_intrinsic_impl_lsc_atomic_bindless(vector Pred, + uint64_t Addr, + vector Offset, + Args... args); + +template +RetTy __cm_intrinsic_impl_lsc_atomic_slm(vector Pred, + vector Offset, + Args... args); + +template +void __cm_intrinsic_impl_lsc_fence(vector Pred); + template struct simd_type { static constexpr int length = n; using type = T __attribute__((ext_vector_type(n))); diff --git a/lib/Headers/cm/include/cm/cm_lsc.h b/lib/Headers/cm/include/cm/cm_lsc.h new file mode 100644 index 000000000000..f704a5b98483 --- /dev/null +++ b/lib/Headers/cm/include/cm/cm_lsc.h @@ -0,0 +1,1388 @@ +/*========================== begin_copyright_notice ============================ + +Copyright (C) 2019-2021 Intel Corporation + +SPDX-License-Identifier: MIT + +============================= end_copyright_notice ===========================*/ + +#if (__INCLUDE_LEVEL__ == 1) +static_assert(0, "CM:w:cm_lsc.h should not be included explicitly - only " + " is required"); +#endif + +#ifndef _CLANG_CM_LSC_H_ +#define _CLANG_CM_LSC_H_ + +#include "cm_common.h" +#include "cm_internal.h" +#include "cm_has_instr.h" + +#define CM_LSC_REPLICATE_MASK(VectorSize) \ + __attribute__((genx_replicate_mask(details::lsc_vector_size()))) + +/// \brief Data prefetch. +/// +/// @param VS Vector size +/// +/// @param DS Data size +/// +/// @param L1H L1 cache hint +/// +/// @param L3H L3 chache hint +/// +/// @param N The number of channels (platform dependent) +/// +/// @param Idx Surface index, which must correspond to a buffer. +/// +/// @param Offset zero based offset of the input buffer in bytes. 
+/// +/// @param Pred Predicate +/// +template +CM_NODEBUG CM_INLINE void cm_prefetch(SurfaceIndex Idx, + vector Offset, + vector Pred = 1) { + CM_HAS_LSC_CONTROL; + + using namespace details; + CM_STATIC_ERROR(lsc_check_simt(), "unexpected number of channels"); + CM_STATIC_ERROR((lsc_check_cache_hint()), + "unsupported cache hint"); + constexpr int ImmOffset = 0; + __cm_intrinsic_impl_prefetch_bti(Idx, Offset, + Pred); +} + +/// flat-address prefetch +template +CM_NODEBUG CM_INLINE void cm_ptr_prefetch(const unsigned *const Ptr, + vector Offset, + vector Pred = 1) { + CM_HAS_LSC_CONTROL; + + using namespace details; + CM_STATIC_ERROR(lsc_check_simt(), "unexpected number of channels"); + CM_STATIC_ERROR((lsc_check_cache_hint()), + "unsupported cache hint"); + constexpr int ImmOffset = 0; + uint64_t Addr = (uint64_t)Ptr; + __cm_intrinsic_impl_prefetch_flat( + Addr, Offset, Pred); +} + +/// bindless-address prefetch +template +CM_NODEBUG CM_INLINE void cm_offset_prefetch(unsigned SurfaceOffset, + vector Offset, + vector Pred = 1) { + CM_HAS_LSC_CONTROL; + + using namespace details; + CM_STATIC_ERROR(lsc_check_simt(), "unexpected number of channels"); + CM_STATIC_ERROR((lsc_check_cache_hint()), + "unsupported cache hint"); + constexpr int ImmOffset = 0; + uint64_t Addr = (uint64_t)SurfaceOffset; + __cm_intrinsic_impl_prefetch_bindless( + Addr, Offset, Pred); +} + +/// Surface-based Block prefetch. +template +CM_NODEBUG CM_INLINE void cm_prefetch(SurfaceIndex Idx, unsigned Offset) { + CM_HAS_LSC_CONTROL; + + CM_STATIC_WARNING(details::always_false(), + "Please use new interface with explicit NElts"); + CM_STATIC_ERROR(DS == DataSize::U32 || DS == DataSize::U64, + "Transposed prefetch can work only with U32 and U64 data sizes"); + CM_STATIC_ERROR( + (details::lsc_check_cache_hint()), + "unsupported cache hint"); + constexpr int _ImmOffset = 0; + details::__cm_intrinsic_impl_block_prefetch_bti( + Idx, Offset); +} + +// Surface-based block prefetch, new interface +template +CM_NODEBUG CM_INLINE void cm_prefetch(SurfaceIndex Idx, unsigned Offset) { + CM_HAS_LSC_CONTROL; + + CM_STATIC_ERROR(DS == DataSize::U32 || DS == DataSize::U64, + "Transposed prefetch can work only with U32 and U64 data sizes"); + CM_STATIC_ERROR( + (details::lsc_check_cache_hint()), + "unsupported cache hint"); + constexpr int _ImmOffset = 0; + constexpr VectorSize VS = details::lsc_vector_size(); + details::__cm_intrinsic_impl_block_prefetch_bti( + Idx, Offset); +} + +/// Flat-address Block prefetch. 
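A sketch of the prefetch entry points above. The template-parameter order (VS, DS, then the two cache hints, with the channel count deduced) is an assumption based on the doc comment, since the declarations' template lists were elided.

    #include <cm/cm.h>

    CM_INLINE void prefetch_demo(SurfaceIndex buf, const unsigned *ptr) {
      cm_vector(off, uint, 16, 0, 4);  // 16 scattered dword byte-offsets
      cm_prefetch<VectorSize::N1, DataSize::U32,
                  CacheHint::Cached, CacheHint::Cached>(buf, off);

      // New-interface block prefetch: NElts is explicit, VS is derived.
      cm_ptr_prefetch<16, DataSize::U32,
                      CacheHint::Cached, CacheHint::Cached>(ptr, 0);
    }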
+template +CM_NODEBUG CM_INLINE void cm_ptr_prefetch(const unsigned *const Ptr, + unsigned Offset) { + CM_HAS_LSC_CONTROL; + + CM_STATIC_WARNING(details::always_false(), + "Please use new interface with explicit NElts"); + CM_STATIC_ERROR(DS == DataSize::U32 || DS == DataSize::U64, + "Transposed prefetch can work only with U32 and U64 data sizes"); + CM_STATIC_ERROR( + (details::lsc_check_cache_hint()), + "unsupported cache hint"); + constexpr int _ImmOffset = 0; + uint64_t _Addr = (uint64_t)Ptr; + details::__cm_intrinsic_impl_block_prefetch_flat(_Addr, Offset); +} + +/// Flat-address Block prefetch, new interface +template +CM_NODEBUG CM_INLINE void cm_ptr_prefetch(const unsigned *const Ptr, + unsigned Offset) { + CM_HAS_LSC_CONTROL; + + CM_STATIC_ERROR(DS == DataSize::U32 || DS == DataSize::U64, + "Transposed prefetch can work only with U32 and U64 data sizes"); + CM_STATIC_ERROR( + (details::lsc_check_cache_hint()), + "unsupported cache hint"); + constexpr int ImmOffset = 0; + constexpr VectorSize VS = details::lsc_vector_size(); + uint64_t Addr = (uint64_t)Ptr; + details::__cm_intrinsic_impl_block_prefetch_flat(Addr, Offset); +} + +/// \brief Data Read. +/// +/// @param T The return element data type. +/// +/// @param N The number of channels (platform dependent) +/// +/// @param VS Vector size +/// +/// @param DS Data size +/// +/// @param L1H L1 cache hint +/// +/// @param L3H L3 chache hint +/// +/// @param Pred Predicate +/// +/// @param Idx Surface index, which must correspond to a buffer. +/// +/// @param Offset zero based offset of the input buffer in bytes. +/// +/// BTI non-transposed load +template +CM_NODEBUG CM_INLINE CM_LSC_REPLICATE_MASK(VS) auto cm_load( + SurfaceIndex Idx, vector Offset, vector Pred = 1) { + CM_HAS_LSC_CONTROL; + + using namespace details; + CM_STATIC_ERROR(lsc_check_simt(), "unexpected number of channels"); + CM_STATIC_ERROR((lsc_check_cache_hint()), + "unsupported cache hint"); + CM_HAS_LSC_NON_TRANSPOSE_MESSAGES_WITH_NON_DEFAULT_SIMT_CONTROL(N, VS); + using _MessTy = decltype(lsc_data_type_ext()); + using _RetTy = decltype(lsc_data_type()); + constexpr DataSize _DS = lsc_expand_ds(lsc_data_size()); + constexpr int _ImmOffset = 0; + constexpr bool _Transposed = false; + auto _TmpRes = + __cm_intrinsic_impl_load_bti<_MessTy, _DS, VS, _ImmOffset, L1H, L3H, + _Transposed, N>(Idx, Offset, Pred); + return lsc_format_ret(_TmpRes); +} + +/// Flat-address non-transposed load +template +CM_NODEBUG CM_INLINE auto cm_ptr_load(const T *const Ptr, + vector Offset, + vector Pred = 1) { + CM_HAS_LSC_CONTROL; + + using namespace details; + CM_STATIC_ERROR(lsc_check_simt(), "unexpected number of channels"); + CM_STATIC_ERROR((lsc_check_cache_hint()), + "unsupported cache hint"); + CM_HAS_LSC_NON_TRANSPOSE_MESSAGES_WITH_NON_DEFAULT_SIMT_CONTROL(N, VS); + using _MessTy = decltype(lsc_data_type_ext()); + using _RetTy = decltype(lsc_data_type()); + constexpr DataSize _DS = lsc_expand_ds(lsc_data_size()); + constexpr int _ImmOffset = 0; + constexpr bool _Transposed = false; + uint64_t _Addr = (uint64_t)Ptr; + auto _TmpRes = + __cm_intrinsic_impl_load_flat<_MessTy, _DS, VS, _ImmOffset, L1H, L3H, + _Transposed, N>(_Addr, Offset, Pred); + return lsc_format_ret(_TmpRes); +} + +/// Flat-address non-transposed load (NElts interface) +template +CM_NODEBUG CM_INLINE auto cm_ptr_load(const T *const Ptr, + vector Offset, + vector Pred = 1) { + CM_HAS_LSC_CONTROL; + using namespace details; + CM_STATIC_ERROR(lsc_check_simt(), "unexpected number of channels"); + 
CM_STATIC_ERROR((lsc_check_cache_hint()), + "unsupported cache hint"); + constexpr VectorSize VS = details::lsc_vector_size(); + CM_HAS_LSC_NON_TRANSPOSE_MESSAGES_WITH_NON_DEFAULT_SIMT_CONTROL(N, VS); + CM_STATIC_WARNING(details::always_false(), + " DEPRECATION WARNING!" + " Please for non transposed use interface with explicit VS." + " NElts are for transposed only." + " This one is deprecated and to be removed soon."); + using _MessTy = decltype(lsc_data_type_ext()); + using _RetTy = decltype(lsc_data_type()); + constexpr DataSize _DS = lsc_expand_ds(lsc_data_size()); + constexpr int _ImmOffset = 0; + constexpr bool _Transposed = false; + uint64_t _Addr = (uint64_t)Ptr; + auto _TmpRes = + __cm_intrinsic_impl_load_flat<_MessTy, _DS, VS, _ImmOffset, L1H, L3H, + _Transposed, N>(_Addr, Offset, Pred); + return lsc_format_ret(_TmpRes); +} + +// bindless-address version using a base-pointer to a buffer +template +CM_NODEBUG CM_INLINE auto cm_offset_load(unsigned SurfaceOffset, + vector Offset, + vector Pred = 1) { + CM_HAS_LSC_CONTROL; + + using namespace details; + CM_STATIC_ERROR(lsc_check_simt(), "unexpected number of channels"); + CM_STATIC_ERROR((lsc_check_cache_hint()), + "unsupported cache hint"); + CM_HAS_LSC_NON_TRANSPOSE_MESSAGES_WITH_NON_DEFAULT_SIMT_CONTROL(N, VS); + using _RetTy = decltype(lsc_data_type_ext()); + constexpr DataSize _DS = lsc_expand_ds(lsc_data_size()); + constexpr int _ImmOffset = 0; + constexpr bool _Transposed = false; + uint64_t _Addr = (uint64_t)SurfaceOffset; + return __cm_intrinsic_impl_load_bindless<_RetTy, _DS, VS, _ImmOffset, L1H, + L3H, _Transposed, N>(_Addr, Offset, + Pred); +} + +// Block-load with a SurfaceIndex +template +CM_NODEBUG CM_INLINE auto cm_load(SurfaceIndex Idx, unsigned Offset) { + CM_HAS_LSC_CONTROL; + + CM_STATIC_WARNING(details::always_false(), + "Please use new interface with explicit NElts"); + CM_STATIC_ERROR( + (details::lsc_check_cache_hint()), + "unsupported cache hint"); + using _RetTy = decltype(details::lsc_data_type()); + static_assert(VS != VectorSize::N0, "invalid vector size"); + constexpr DataSize _DS = details::lsc_data_size(); + CM_STATIC_ERROR(_DS == DataSize::U32 || _DS == DataSize::U64, + "Transposed load can work only with U32 and U64 data sizes"); + constexpr int _ImmOffset = 0; + constexpr bool _Transposed = true; + return details::__cm_intrinsic_impl_block_load_bti< + _RetTy, _DS, VS, _ImmOffset, L1H, L3H, _Transposed>(Idx, Offset); +} + +// Block-load with a SurfaceIndex, new interface +template +CM_NODEBUG CM_INLINE auto cm_load(SurfaceIndex Idx, unsigned Offset) { + CM_HAS_LSC_CONTROL; + + CM_STATIC_ERROR( + (details::lsc_check_cache_hint()), + "unsupported cache hint"); + constexpr VectorSize VS = details::lsc_vector_size(); + using _RetTy = decltype(details::lsc_data_type()); + static_assert(VS != VectorSize::N0, "invalid vector size"); + constexpr DataSize _DS = details::lsc_data_size(); + CM_STATIC_ERROR(_DS == DataSize::U32 || _DS == DataSize::U64, + "Transposed load can work only with U32 and U64 data sizes"); + constexpr int _ImmOffset = 0; + constexpr bool _Transposed = true; + return details::__cm_intrinsic_impl_block_load_bti< + _RetTy, _DS, VS, _ImmOffset, L1H, L3H, _Transposed>(Idx, Offset); +} + +// Block-load with a base-pointer to the buffer +template +CM_NODEBUG CM_INLINE auto cm_ptr_load(const T *const Ptr, unsigned Offset) { + CM_HAS_LSC_CONTROL; + + CM_STATIC_ERROR( + (details::lsc_check_cache_hint()), + "unsupported cache hint"); + CM_STATIC_WARNING(details::always_false(), + "Please use new 
interface with explicit NElts"); + using _RetTy = decltype(details::lsc_data_type()); + static_assert(VS != VectorSize::N0, "invalid vector size"); + constexpr DataSize _DS = details::lsc_data_size(); + CM_STATIC_ERROR(_DS == DataSize::U32 || _DS == DataSize::U64, + "Transposed load can work only with U32 and U64 data sizes"); + constexpr int _ImmOffset = 0; + constexpr bool _Transposed = true; + uint64_t _Addr = (uint64_t)Ptr; + return details::__cm_intrinsic_impl_block_load_flat< + _RetTy, _DS, VS, _ImmOffset, L1H, L3H, _Transposed>(_Addr, Offset); +} +// Block-load with a base-pointer to the buffer, new interface +template +CM_NODEBUG CM_INLINE auto cm_ptr_load(const T *const Ptr, unsigned Offset) { + CM_HAS_LSC_CONTROL; + + CM_STATIC_ERROR( + (details::lsc_check_cache_hint()), + "unsupported cache hint"); + constexpr VectorSize VS = details::lsc_vector_size(); + using _RetTy = decltype(details::lsc_data_type()); + static_assert(VS != VectorSize::N0, "invalid vector size"); + constexpr DataSize _DS = details::lsc_data_size(); + CM_STATIC_ERROR(_DS == DataSize::U32 || _DS == DataSize::U64, + "Transposed load can work only with U32 and U64 data sizes"); + constexpr int _ImmOffset = 0; + constexpr bool _Transposed = true; + uint64_t _Addr = (uint64_t)Ptr; + return details::__cm_intrinsic_impl_block_load_flat< + _RetTy, _DS, VS, _ImmOffset, L1H, L3H, _Transposed>(_Addr, Offset); +} + +/// BTI non-transposed quad load +/// * vector size is always 4 for quad so it is not specified +/// * store is always transposed, so no block version +template +CM_NODEBUG CM_INLINE auto cm_load4(SurfaceIndex Idx, vector Offset, + vector Pred = 1) { + CM_HAS_LSC_CONTROL; + + using namespace details; + CM_STATIC_ERROR(lsc_check_simt(), "unexpected number of channels"); + CM_STATIC_ERROR((lsc_check_cache_hint()), + "unsupported cache hint"); + constexpr VectorSize _VS = + details::lsc_get_vector_size_from_channel_mask(); + CM_HAS_LSC_NON_TRANSPOSE_MESSAGES_WITH_NON_DEFAULT_SIMT_CONTROL(N, _VS); + using _MessTy = decltype(lsc_data_type_ext()); + using _RetTy = decltype(lsc_data_type()); + constexpr DataSize _DS = lsc_expand_ds(lsc_data_size()); + constexpr int _ImmOffset = 0; + constexpr bool _Transposed = false; + auto _TmpRes = + __cm_intrinsic_impl_load4_bti<_MessTy, _DS, _VS, _ImmOffset, + L1H, L3H, _Transposed, N>(Idx, Offset, Pred, + Mask); + return lsc_format_ret(_TmpRes); +} + +/// Flat-address non-transposed quad load +template +CM_NODEBUG CM_INLINE auto cm_ptr_load4(const T *const Ptr, + vector Offset, + vector Pred = 1) { + CM_HAS_LSC_CONTROL; + + using namespace details; + CM_STATIC_ERROR(lsc_check_simt(), "unexpected number of channels"); + CM_STATIC_ERROR((lsc_check_cache_hint()), + "unsupported cache hint"); + constexpr VectorSize _VS = + details::lsc_get_vector_size_from_channel_mask(); + CM_HAS_LSC_NON_TRANSPOSE_MESSAGES_WITH_NON_DEFAULT_SIMT_CONTROL(N, _VS); + using _MessTy = decltype(lsc_data_type_ext()); + using _RetTy = decltype(lsc_data_type_ext()); + constexpr DataSize _DS = lsc_expand_ds(lsc_data_size()); + constexpr int _ImmOffset = 0; + constexpr bool _Transposed = false; + uint64_t _Addr = (uint64_t)Ptr; + auto _TmpRes = + __cm_intrinsic_impl_load4_flat<_MessTy, _DS, _VS, _ImmOffset, L1H, L3H, + _Transposed, N>(_Addr, Offset, Pred, Mask); + return lsc_format_ret(_TmpRes); +} + +/// \brief Data Write. +/// +/// @param T The element data type. 
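The gather-style loads above in use, as a sketch; the template order (T first, then VS/DS/cache hints, with N deduced from the offset vector) is assumed from the parameter documentation.

    #include <cm/cm.h>

    CM_INLINE vector<float, 32> load_demo(SurfaceIndex buf) {
      cm_vector(off, uint, 32, 0, 4);  // one dword-strided offset per lane
      return cm_load<float, VectorSize::N1, DataSize::Default,
                     CacheHint::Cached, CacheHint::Cached>(buf, off);
    }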
+/// +/// @param N The number of channels (platform dependent) +/// +/// @param NElts The number of element to store (for block store only) +/// +/// @param VS Vector size +/// +/// @param DS Data size +/// +/// @param L1H L1 cache hint +/// +/// @param L3H L3 chache hint +/// +/// @param Pred Predicate +/// +/// @param Idx Surface index, which must correspond to a buffer. +/// +/// @param Offset zero based offset of the output buffer in bytes. +/// +/// @param Data data to write. +/// +template +CM_NODEBUG CM_INLINE void +cm_store(SurfaceIndex Idx, vector Offset, + vector()> Data, + vector Pred = 1) { + CM_HAS_LSC_CONTROL; + + using namespace details; + CM_STATIC_ERROR(lsc_check_simt(), "unexpected number of channels"); + CM_STATIC_ERROR((lsc_check_cache_hint()), + "unsupported cache hint"); + CM_HAS_LSC_NON_TRANSPOSE_MESSAGES_WITH_NON_DEFAULT_SIMT_CONTROL(N, VS); + constexpr DataSize _DS = lsc_expand_ds(details::lsc_data_size()); + constexpr int _ImmOffset = 0; + constexpr bool _Transposed = false; + using _StTy = decltype(lsc_data_type_ext()); + using _CastTy = typename lsc_bitcast_type::type; + _StTy _TmpData = Data.format<_CastTy>(); + details::__cm_intrinsic_impl_store_bti::type, _DS, + VS, _ImmOffset, L1H, L3H, _Transposed, + N>(Idx, Offset, _TmpData, Pred); +} +/// Flat-address store using a base-address to a buffer +template +CM_NODEBUG CM_INLINE void +cm_ptr_store(T *Ptr, vector Offset, + vector()> Data, + vector Pred = 1) { + CM_HAS_LSC_CONTROL; + + using namespace details; + CM_STATIC_ERROR(lsc_check_simt(), "unexpected number of channels"); + CM_STATIC_ERROR((lsc_check_cache_hint()), + "unsupported cache hint"); + CM_HAS_LSC_NON_TRANSPOSE_MESSAGES_WITH_NON_DEFAULT_SIMT_CONTROL(N, VS); + constexpr DataSize _DS = lsc_expand_ds(details::lsc_data_size()); + constexpr int _ImmOffset = 0; + constexpr bool _Transposed = false; + uint64_t _Addr = (uint64_t)Ptr; + using _StTy = decltype(lsc_data_type_ext()); + using _CastTy = typename lsc_bitcast_type::type; + _StTy _TmpData = Data.format<_CastTy>(); + details::__cm_intrinsic_impl_store_flat::type, + _DS, VS, _ImmOffset, L1H, L3H, + _Transposed, N>(_Addr, Offset, + _TmpData, Pred); +} + +/// bindless-address store using a base-address to a buffer +template +CM_NODEBUG CM_INLINE void +cm_offset_store(unsigned SurfaceOffset, vector Offset, + vector()> Data, + vector Pred = 1) { + CM_HAS_LSC_CONTROL; + + using namespace details; + CM_STATIC_ERROR(lsc_check_simt(), "unexpected number of channels"); + CM_STATIC_ERROR((lsc_check_cache_hint()), + "unsupported cache hint"); + CM_HAS_LSC_NON_TRANSPOSE_MESSAGES_WITH_NON_DEFAULT_SIMT_CONTROL(N, VS); + constexpr DataSize _DS = details::lsc_data_size(); + constexpr int _ImmOffset = 0; + constexpr bool _Transposed = false; + uint64_t _Addr = (uint64_t)SurfaceOffset; + details::__cm_intrinsic_impl_store_bindless(_Addr, Offset, + Data, Pred); +} + +/// Quad version of BTI store: +/// * vector size is always 4 for quad so it is not specified +/// * store is always transposed, so no block version +template +CM_NODEBUG CM_INLINE void cm_store4( + SurfaceIndex Idx, vector Offset, + vector()> Data, + vector Pred = 1) { + CM_HAS_LSC_CONTROL; + + using namespace details; + CM_STATIC_ERROR(lsc_check_simt(), "unexpected number of channels"); + CM_STATIC_ERROR((lsc_check_cache_hint()), + "unsupported cache hint"); + constexpr VectorSize _VS = + details::lsc_get_vector_size_from_channel_mask(); + CM_HAS_LSC_NON_TRANSPOSE_MESSAGES_WITH_NON_DEFAULT_SIMT_CONTROL(N, _VS); + constexpr DataSize _DS = 
lsc_expand_ds(details::lsc_data_size()); + constexpr int _ImmOffset = 0; + constexpr bool _Transposed = false; + using _StTy = decltype(lsc_data_type_ext()); + using _CastTy = typename lsc_bitcast_type::type; + _StTy _TmpData = Data.format<_CastTy>(); + details::__cm_intrinsic_impl_store4_bti::type, + _DS, _VS, _ImmOffset, L1H, + L3H, _Transposed, N>( + Idx, Offset, _TmpData, Pred, Mask); +} + +/// Quad version of flat store +template +CM_NODEBUG CM_INLINE void cm_ptr_store4( + T *Ptr, vector Offset, + vector()> Data, + vector Pred = 1) { + CM_HAS_LSC_CONTROL; + + using namespace details; + CM_STATIC_ERROR(lsc_check_simt(), "unexpected number of channels"); + CM_STATIC_ERROR((lsc_check_cache_hint()), + "unsupported cache hint"); + constexpr VectorSize _VS = + details::lsc_get_vector_size_from_channel_mask(); + CM_HAS_LSC_NON_TRANSPOSE_MESSAGES_WITH_NON_DEFAULT_SIMT_CONTROL(N, _VS); + constexpr DataSize _DS = lsc_expand_ds(details::lsc_data_size()); + constexpr int _ImmOffset = 0; + constexpr bool _Transposed = false; + uint64_t _Addr = (uint64_t)Ptr; + using _StTy = decltype(lsc_data_type_ext()); + using _CastTy = typename lsc_bitcast_type::type; + _StTy _TmpData = Data.format<_CastTy>(); + details::__cm_intrinsic_impl_store4_flat::type, + _DS, _VS, _ImmOffset, L1H, L3H, + _Transposed, N>( + _Addr, Offset, _TmpData, Pred, Mask); +} + +/// Block store with a SurfaceIndex. +template +CM_NODEBUG CM_INLINE void cm_store(SurfaceIndex Idx, unsigned Offset, + vector Data) { + CM_HAS_LSC_CONTROL; + + using namespace details; + CM_STATIC_ERROR((lsc_check_cache_hint()), + "unsupported cache hint"); + constexpr DataSize _DS = lsc_data_size(); + CM_STATIC_ERROR(_DS == DataSize::U32 || _DS == DataSize::U64, + "Transposed store can work only with U32 and U64 data sizes"); + constexpr VectorSize _VS = lsc_vector_size(); + static_assert(_VS != VectorSize::N0, "invalid vector size"); + constexpr int _ImmOffset = 0; + constexpr bool _Transposed = true; + __cm_intrinsic_impl_block_store_bti(Idx, Offset, Data); +} + +/// Block store with a base pointer. +template +CM_NODEBUG CM_INLINE void cm_ptr_store(T *ptr, unsigned Offset, + vector Data) { + CM_HAS_LSC_CONTROL; + + using namespace details; + CM_STATIC_ERROR((lsc_check_cache_hint()), + "unsupported cache hint"); + constexpr DataSize _DS = lsc_data_size(); + CM_STATIC_ERROR(_DS == DataSize::U32 || _DS == DataSize::U64, + "Transposed store can work only with U32 and U64 data sizes"); + constexpr VectorSize _VS = lsc_vector_size(); + static_assert(_VS != VectorSize::N0, "invalid vector size"); + constexpr int _ImmOffset = 0; + constexpr bool _Transposed = true; + uint64_t _Addr = (uint64_t)ptr; + __cm_intrinsic_impl_block_store_flat(_Addr, Offset, Data); +} + +/// \brief SLM Data Read. +/// +/// @param T The return element data type. +/// +/// @param N The number of channels (platform dependent) +/// +/// @param VS Vector size +/// +/// @param DS Data size +/// +/// @param Pred Predicate +/// +/// @param Offset zero based offset of the input SLM buffer in bytes. 
+/// + +// Non-transposed SLM load +template +CM_NODEBUG CM_INLINE auto cm_load_slm(vector Offset, + vector Pred = 1) { + CM_HAS_LSC_CONTROL; + + using namespace details; + CM_STATIC_ERROR(lsc_check_simt(), "unexpected number of channels"); + CM_HAS_LSC_NON_TRANSPOSE_MESSAGES_WITH_NON_DEFAULT_SIMT_CONTROL(N, VS); + using _MessTy = decltype(details::lsc_data_type_ext()); + using _RetTy = decltype(lsc_data_type()); + constexpr DataSize _DS = lsc_expand_ds(details::lsc_data_size()); + constexpr int _ImmOffset = 0; + constexpr bool _Transposed = false; + auto _TmpRes = + details::__cm_intrinsic_impl_load_slm<_MessTy, _DS, VS, _ImmOffset, + _Transposed>(Offset, Pred); + return lsc_format_ret(_TmpRes); +} + +// Block-load with a base-pointer to the SLM +template +CM_NODEBUG CM_INLINE auto cm_load_slm(unsigned Offset) { + CM_HAS_LSC_CONTROL; + + CM_STATIC_WARNING(details::always_false(), + "Please use new interface with explicit NElts"); + using namespace details; + using _RetTy = decltype(lsc_data_type()); + static_assert(VS != VectorSize::N0, "invalid vector size"); + constexpr DataSize _DS = lsc_data_size(); + CM_STATIC_ERROR(_DS == DataSize::U32 || _DS == DataSize::U64, + "Transposed load can work only with U32 and U64 data sizes"); + constexpr int _ImmOffset = 0; + constexpr bool _Transposed = true; + return __cm_intrinsic_impl_block_load_slm<_RetTy, _DS, VS, _ImmOffset, + _Transposed>(Offset); +} + +// Block-load with a base-pointer to the SLM, new interface +template +CM_NODEBUG CM_INLINE auto cm_load_slm(unsigned Offset) { + CM_HAS_LSC_CONTROL; + + using namespace details; + constexpr VectorSize VS = details::lsc_vector_size(); + using _RetTy = decltype(lsc_data_type()); + static_assert(VS != VectorSize::N0, "invalid vector size"); + constexpr DataSize _DS = lsc_data_size(); + CM_STATIC_ERROR(_DS == DataSize::U32 || _DS == DataSize::U64, + "Transposed load can work only with U32 and U64 data sizes"); + constexpr int _ImmOffset = 0; + constexpr bool _Transposed = true; + return __cm_intrinsic_impl_block_load_slm<_RetTy, _DS, VS, _ImmOffset, + _Transposed>(Offset); +} + +// Non-transposed SLM quad load +template +CM_NODEBUG CM_INLINE auto cm_load4_slm(vector Offset, + vector Pred = 1) { + CM_HAS_LSC_CONTROL; + + using namespace details; + CM_STATIC_ERROR(lsc_check_simt(), "unexpected number of channels"); + constexpr VectorSize _VS = + details::lsc_get_vector_size_from_channel_mask(); + CM_HAS_LSC_NON_TRANSPOSE_MESSAGES_WITH_NON_DEFAULT_SIMT_CONTROL(N, _VS); + using _MessTy = decltype(details::lsc_data_type_ext()); + using _RetTy = decltype(lsc_data_type()); + constexpr DataSize DS_ = lsc_expand_ds(details::lsc_data_size()); + constexpr int ImmOffset = 0; + constexpr bool Transposed = false; + auto _TmpRes = + details::__cm_intrinsic_impl_load4_slm<_MessTy, DS_, _VS, + ImmOffset, Transposed>(Offset, + Pred, Mask); + auto _Formatted = _TmpRes.format(); + constexpr int stride = _Formatted.n_elems() / _TmpRes.n_elems(); + _RetTy _Res = _Formatted.select<_TmpRes.n_elems(), stride>(0); + return _Res; +} + +/// \brief SLM Data Write. +/// +/// @param T The element data type. +/// +/// @param N The number of channels (platform dependent) +/// +/// @param NElts The number of element to store (for block store only) +/// +/// @param VS Vector size +/// +/// @param DS Data size +/// +/// @param Pred Predicate +/// +/// @param Offset zero based offset of the output SLM buffer in bytes. +/// +/// @param Data data to write. 
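A minimal sketch of the SLM loads above, using the new explicit-NElts block interface; the <uint, 64> instantiation is an assumption matching the transposed U32 restriction.

    #include <cm/cm.h>

    CM_INLINE vector<uint, 64> slm_load_demo(unsigned slmBase) {
      return cm_load_slm<uint, 64>(slmBase);  // transposed block load
    }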
+/// +template +CM_NODEBUG CM_INLINE void +cm_store_slm(vector Offset, + vector()> Data, + vector Pred = 1) { + CM_HAS_LSC_CONTROL; + + CM_STATIC_WARNING(details::always_false(), + "Please use new interface with explicit NElts"); + using namespace details; + CM_STATIC_ERROR(lsc_check_simt(), "unexpected number of channels"); + CM_HAS_LSC_NON_TRANSPOSE_MESSAGES_WITH_NON_DEFAULT_SIMT_CONTROL(N, VS); + constexpr DataSize _DS = lsc_expand_ds(details::lsc_data_size()); + constexpr int _ImmOffset = 0; + constexpr bool _Transposed = false; + using _StTy = decltype(lsc_data_type_ext()); + using _CastTy = typename lsc_bitcast_type::type; + _StTy _TmpData = Data.format<_CastTy>(); + details::__cm_intrinsic_impl_store_slm::type, _DS, + VS, _ImmOffset, _Transposed, N>( + Offset, _TmpData, Pred); +} + +// explicit NElts version +template +CM_NODEBUG CM_INLINE void +cm_store_slm(vector Offset, + vector()> Data, + vector Pred = 1) { + CM_HAS_LSC_CONTROL; + + using namespace details; + CM_STATIC_ERROR(lsc_check_simt(), "unexpected number of channels"); + CM_HAS_LSC_NON_TRANSPOSE_MESSAGES_WITH_NON_DEFAULT_SIMT_CONTROL( + N, details::lsc_vector_size()); + constexpr DataSize DS_ = lsc_expand_ds(details::lsc_data_size()); + constexpr int ImmOffset = 0; + constexpr bool Transposed = false; + constexpr VectorSize VS = details::lsc_vector_size(); + using _StTy = decltype(lsc_data_type_ext()); + using _CastTy = typename lsc_bitcast_type::type; + _StTy _TmpData = Data.format<_CastTy>(); + details::__cm_intrinsic_impl_store_slm::type, DS_, + VS, ImmOffset, Transposed, N>( + Offset, _TmpData, Pred); +} + +// Block version. +template +CM_NODEBUG CM_INLINE void cm_store_slm(unsigned Offset, vector Data) { + CM_HAS_LSC_CONTROL; + + constexpr DataSize _DS = details::lsc_data_size(); + CM_STATIC_ERROR(_DS == DataSize::U32 || _DS == DataSize::U64, + "Transposed store can work only with U32 and U64 data sizes"); + constexpr VectorSize _VS = details::lsc_vector_size(); + static_assert(_VS != VectorSize::N0, "invalid vector size"); + constexpr int _ImmOffset = 0; + constexpr bool _Transposed = true; + details::__cm_intrinsic_impl_block_store_slm(Offset, Data); +} + +// Quad version +template +CM_NODEBUG CM_INLINE void cm_store4_slm( + vector Offset, + vector()> Data, + vector Pred = 1) { + CM_HAS_LSC_CONTROL; + + using namespace details; + CM_STATIC_ERROR(lsc_check_simt(), "unexpected number of channels"); + constexpr DataSize _DS = lsc_expand_ds(details::lsc_data_size()); + constexpr VectorSize _VS = + details::lsc_get_vector_size_from_channel_mask(); + CM_HAS_LSC_NON_TRANSPOSE_MESSAGES_WITH_NON_DEFAULT_SIMT_CONTROL(N, _VS); + constexpr int _ImmOffset = 0; + constexpr bool _Transposed = false; + using _StTy = decltype(lsc_data_type_ext()); + using _CastTy = typename lsc_bitcast_type::type; + _StTy _TmpData = Data.format<_CastTy>(); + details::__cm_intrinsic_impl_store4_slm::type, + _DS, _VS, _ImmOffset, _Transposed, N>( + Offset, _TmpData, Pred, Mask); +} + +// HW restrictions force data which is read to contain padding filled with +// zeroes for 2d lsc loads. This function eliminates such padding. For details +// see documentation for LSC_UNTYPED (LOAD_BLOCK2D). 
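And the matching SLM block store, again a sketch with an assumed <uint, 64> instantiation:

    #include <cm/cm.h>

    CM_INLINE void slm_store_demo(unsigned slmBase, vector<uint, 64> data) {
      cm_store_slm<uint, 64>(slmBase, data);  // transposed block store
    }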
+template +CM_NODEBUG CM_INLINE vector +unpad_untyped_2d_lsc_load(vector _PaddedV) { + CM_STATIC_ERROR(!Transposed || !Transformed, + "Transposed and transformed is not supported"); + CM_STATIC_ERROR(!Transposed || (Transposed && NumBlocks == 1), + "Transposed expected to be 1 block only"); + vector _UnpaddedV = 0; + +#pragma unroll + for (unsigned i = 0; i < NumBlocks; i++) { + + if constexpr (!Transposed && !Transformed) { +#pragma unroll + for (unsigned j = 0; j < Height; j++) { + unsigned UnpaddedBlockNumElements = i * (Width * Height); + _UnpaddedV.select(UnpaddedBlockNumElements + j * Width) = + _PaddedV.select(i * GRFBlockPitch + j * GRFRowPitch); + } + } + + else if constexpr (Transposed && !Transformed) { +#pragma unroll + for (unsigned j = 0; j < Width; j++) + _UnpaddedV.select(j * Height) = + _PaddedV.select(j * GRFRowPitch); + } + + else if constexpr (!Transposed && Transformed) { +#pragma unroll + for (unsigned j = 0; j < Height; j += ElemsPerDword) { + unsigned UnpaddedBlockNumElements = i * (Width * Height); + _UnpaddedV.select(UnpaddedBlockNumElements + + j * Width) = + _PaddedV.select(i * GRFBlockPitch + + j * GRFRowPitch); + } + } + } + + return _UnpaddedV; +} + +/// \brief 2D Block Read (flat) +/// +/// @param T The element data type. +/// +/// @param N The data size +/// +/// @param Width The block width in number of elements +/// +/// @param Height The block height +/// +/// @param NBlks, The number of blocks +/// +/// @param Transposed Is Transposed or not +/// +/// @param Transformed apply VNNI transform or not +/// +/// @param L1H L1 cache hint +/// +/// @param L3H L3 chache hint +/// +/// @param Ptr Surface base address +/// +/// @param SurfaceWidth the surface width minus 1 in bytes +/// +/// @param SurfaceHeight the surface height minus 1 in rows +/// +/// @param SurfacePitch the surface pitch minus 1 in bytes +/// +/// @param X zero based X-coordinate of the left upper rectangle corner in +/// number of elements. +/// +/// @param Y zero based Y-coordinate of the left upper rectangle corner in rows. +/// +/// @param Data Data to store. +/// +/// @return vector of type T and size N. Size is specified with padding. +/// see details::getBlock2dDataSize for details +template ()> +CM_NODEBUG CM_INLINE vector +cm_ptr_load(T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight, + unsigned SurfacePitch, int X, int Y) { + CM_HAS_LSC_UNTYPED_2D_CONTROL; + + CM_STATIC_ERROR(!Transposed || !Transformed, + "Transposed and transformed is not supported"); + CM_STATIC_ERROR(!Transposed || (Transposed && NBlks == 1), + "Transposed expected to be 1 block only"); + using namespace details; + CM_STATIC_ERROR((lsc_check_cache_hint()), + "unsupported cache hint"); + uintptr_t Base = reinterpret_cast(Ptr); + // Calculate number of elements with padding + constexpr int ElemsPerDword = 4 / sizeof(T); + constexpr int GRFRowSize = Transposed ? Height : Width; + constexpr int GRFRowPitch = details::getNextPowerOf2(GRFRowSize); + constexpr int GRFBlockSize = GRFRowPitch * (Transposed ? 
Width : Height); + constexpr int GRFBlockPitch = + details::roundUpNextMultiple(64 / sizeof(T), GRFBlockSize); + constexpr int ActualN = NBlks * GRFBlockPitch; + + vector _ActualRet = + __cm_intrinsic_impl_block_load2d_flat( + Base, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y); + + // If no padding is observed, then return as read + if constexpr (ActualN == N) + return _ActualRet; + + return unpad_untyped_2d_lsc_load(_ActualRet); +} + +// convenient overload to not break legacy +template ()> +CM_NODEBUG CM_INLINE vector cm_load(T *Ptr, unsigned SurfaceWidth, + unsigned SurfaceHeight, + unsigned SurfacePitch, int X, int Y) { + return cm_ptr_load(Ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y); +} + +/// \brief 2D Block Prefetch (flat) +template +CM_NODEBUG CM_INLINE void cm_ptr_prefetch(T *Ptr, unsigned SurfaceWidth, + unsigned SurfaceHeight, + unsigned SurfacePitch, int X, int Y) { + CM_HAS_LSC_UNTYPED_2D_CONTROL; + + using namespace details; + CM_STATIC_ERROR((lsc_check_cache_hint()), + "unsupported cache hint"); + uintptr_t Base = reinterpret_cast(Ptr); + __cm_intrinsic_impl_block_prefetch2d_flat( + Base, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y); +} + +/// convenient overload to not break legacy +template +CM_NODEBUG CM_INLINE void cm_prefetch(T *Ptr, unsigned SurfaceWidth, + unsigned SurfaceHeight, + unsigned SurfacePitch, int X, int Y) { + return cm_ptr_prefetch( + Ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y); +} + +/// \brief 2D Block Store (flat) +template ()> +CM_NODEBUG CM_INLINE void +cm_ptr_store(T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight, + unsigned SurfacePitch, int X, int Y, vector Data) { + CM_HAS_LSC_UNTYPED_2D_CONTROL; + + using namespace details; + CM_STATIC_ERROR((lsc_check_cache_hint()), + "unsupported cache hint"); + constexpr int NBlks = 1; + uintptr_t Base = reinterpret_cast(Ptr); + __cm_intrinsic_impl_block_store2d_flat( + Base, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y, Data); +} + +/// convenient overload to not break legacy +template ()> +CM_NODEBUG CM_INLINE void +cm_store(T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight, + unsigned SurfacePitch, int X, int Y, vector Data) { + cm_ptr_store( + Ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y, Data); +} + + +/// \brief LSC Atomic. +/// +/// @param T The element data type. +/// +/// @param VS Vector size +/// +/// @param DS Data size +/// +/// @param L1H L1 cache hint +/// +/// @param L3H L3 chache hint +/// +/// @param Pred Predicate +/// +/// @param Idx Surface index, which must correspond to a buffer. 
+
+/// \brief LSC Atomic.
+///
+/// @param T The element data type.
+///
+/// @param VS Vector size
+///
+/// @param DS Data size
+///
+/// @param L1H L1 cache hint
+///
+/// @param L3H L3 cache hint
+///
+/// @param Pred Predicate
+///
+/// @param Idx Surface index, which must correspond to a buffer.
+///
+/// @param Offset zero based byte offset of the input buffer or SLM byte offset
+///
+template <AtomicOp Op, typename T, VectorSize VS = VectorSize::N1,
+          DataSize DS = DataSize::Default,
+          CacheHint L1H = CacheHint::Default,
+          CacheHint L3H = CacheHint::Default, int N>
+CM_NODEBUG CM_INLINE auto cm_atomic(SurfaceIndex Idx,
+                                    vector<unsigned, N> Offset,
+                                    vector<ushort, N> Pred = 1) ->
+    typename std::enable_if<
+        details::lsc_atomic_nsrcs<Op>() == 0,
+        vector<T, N * details::lsc_vector_size<VS>()> >::type {
+  CM_HAS_LSC_CONTROL;
+
+  using namespace details;
+  CM_STATIC_ERROR(lsc_check_simt<N>(), "unexpected number of channels");
+  CM_STATIC_ERROR((lsc_check_cache_hint<LSCAction::Atomic, L1H, L3H>()),
+                  "unsupported cache hint");
+  constexpr DataSize _DS = lsc_expand_ds(lsc_data_size<T, DS>());
+  constexpr bool _Transposed = false;
+  using _MessTy = decltype(lsc_data_type_ext<T, VS, N>());
+  using _RetTy = decltype(lsc_data_type<T, VS, N>());
+  auto _TmpRes =
+      __cm_intrinsic_impl_lsc_atomic_bti<Op, _MessTy, _DS, VS, _Transposed,
+                                         L1H, L3H>(Pred, Idx, Offset);
+  return lsc_format_ret<_RetTy>(_TmpRes);
+}
+
+template <AtomicOp Op, typename T, VectorSize VS = VectorSize::N1,
+          DataSize DS = DataSize::Default,
+          CacheHint L1H = CacheHint::Default,
+          CacheHint L3H = CacheHint::Default, int N>
+CM_NODEBUG CM_INLINE auto
+cm_atomic(SurfaceIndex Idx, vector<unsigned, N> Offset,
+          vector<T, N * details::lsc_vector_size<VS>()> Src0,
+          vector<ushort, N> Pred = 1) ->
+    typename std::enable_if<details::lsc_atomic_nsrcs<Op>() == 1,
+                            decltype(Src0)>::type {
+  CM_HAS_LSC_CONTROL;
+
+  using namespace details;
+  CM_STATIC_ERROR(lsc_check_simt<N>(), "unexpected number of channels");
+  CM_STATIC_ERROR((lsc_check_cache_hint<LSCAction::Atomic, L1H, L3H>()),
+                  "unsupported cache hint");
+  constexpr DataSize _DS = lsc_expand_ds(lsc_data_size<T, DS>());
+  constexpr bool _Transposed = false;
+  using _MessTy = decltype(lsc_data_type_ext<T, VS, N>());
+  using _RetTy = decltype(lsc_data_type<T, VS, N>());
+  auto _TmpRes =
+      __cm_intrinsic_impl_lsc_atomic_bti<Op, _MessTy, _DS, VS, _Transposed,
+                                         L1H, L3H>(Pred, Idx, Offset, Src0);
+  return lsc_format_ret<_RetTy>(_TmpRes);
+}
+
+template <AtomicOp Op, typename T, VectorSize VS = VectorSize::N1,
+          DataSize DS = DataSize::Default,
+          CacheHint L1H = CacheHint::Default,
+          CacheHint L3H = CacheHint::Default, int N>
+CM_NODEBUG CM_INLINE auto
+cm_atomic(SurfaceIndex Idx, vector<unsigned, N> Offset,
+          vector<T, N * details::lsc_vector_size<VS>()> Src0,
+          vector<T, N * details::lsc_vector_size<VS>()> Src1,
+          vector<ushort, N> Pred = 1) ->
+    typename std::enable_if<details::lsc_atomic_nsrcs<Op>() == 2,
+                            decltype(Src0)>::type {
+  CM_HAS_LSC_CONTROL;
+
+  using namespace details;
+  CM_STATIC_ERROR(lsc_check_simt<N>(), "unexpected number of channels");
+  CM_STATIC_ERROR((lsc_check_cache_hint<LSCAction::Atomic, L1H, L3H>()),
+                  "unsupported cache hint");
+  constexpr DataSize _DS = lsc_expand_ds(lsc_data_size<T, DS>());
+  using _MessTy = decltype(lsc_data_type_ext<T, VS, N>());
+  using _RetTy = decltype(lsc_data_type<T, VS, N>());
+  constexpr bool _Transposed = false;
+  auto _TmpRes = __cm_intrinsic_impl_lsc_atomic_bti<Op, _MessTy, _DS, VS,
+                                                    _Transposed, L1H, L3H>(
+      Pred, Idx, Offset, Src0, Src1);
+  return lsc_format_ret<_RetTy>(_TmpRes);
+}
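+
+// Illustrative usage (assumed, not part of this change): SIMT16 atomic
+// increment and compare-and-swap on a buffer; "buf" is a hypothetical
+// SurfaceIndex kernel argument and the offsets are byte offsets.
+//   vector<unsigned, 16> off = 0; // per-lane byte offsets, filled elsewhere
+//   vector<unsigned, 16> old = cm_atomic<AtomicOp::IINC, unsigned>(buf, off);
+//   vector<unsigned, 16> cmp = 1, swp = 2;
+//   cm_atomic<AtomicOp::ICAS, unsigned>(buf, off, cmp, swp);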
+
+// flat-address atomic
+template <AtomicOp Op, typename T, VectorSize VS = VectorSize::N1,
+          DataSize DS = DataSize::Default,
+          CacheHint L1H = CacheHint::Default,
+          CacheHint L3H = CacheHint::Default, int N>
+CM_NODEBUG CM_INLINE auto cm_ptr_atomic(T *Ptr, vector<unsigned, N> Offset,
+                                        vector<ushort, N> Pred = 1) ->
+    typename std::enable_if<
+        details::lsc_atomic_nsrcs<Op>() == 0,
+        vector<T, N * details::lsc_vector_size<VS>()> >::type {
+  CM_HAS_LSC_CONTROL;
+
+  using namespace details;
+  CM_STATIC_ERROR(lsc_check_simt<N>(), "unexpected number of channels");
+  CM_STATIC_ERROR((lsc_check_cache_hint<LSCAction::Atomic, L1H, L3H>()),
+                  "unsupported cache hint");
+  constexpr DataSize _DS = lsc_expand_ds(lsc_data_size<T, DS>());
+  constexpr bool _Transposed = false;
+  uint64_t _Addr = (uint64_t)Ptr;
+  using _MessTy = decltype(lsc_data_type_ext<T, VS, N>());
+  using _RetTy = decltype(lsc_data_type<T, VS, N>());
+  auto _TmpRes =
+      __cm_intrinsic_impl_lsc_atomic_flat<Op, _MessTy, _DS, VS, _Transposed,
+                                          L1H, L3H>(Pred, _Addr, Offset);
+  return lsc_format_ret<_RetTy>(_TmpRes);
+}
+
+template <AtomicOp Op, typename T, VectorSize VS = VectorSize::N1,
+          DataSize DS = DataSize::Default,
+          CacheHint L1H = CacheHint::Default,
+          CacheHint L3H = CacheHint::Default, int N>
+CM_NODEBUG CM_INLINE auto
+cm_ptr_atomic(T *Ptr, vector<unsigned, N> Offset,
+              vector<T, N * details::lsc_vector_size<VS>()> Src0,
+              vector<ushort, N> Pred = 1) ->
+    typename std::enable_if<details::lsc_atomic_nsrcs<Op>() == 1,
+                            decltype(Src0)>::type {
+  CM_HAS_LSC_CONTROL;
+
+  using namespace details;
+  CM_STATIC_ERROR(lsc_check_simt<N>(), "unexpected number of channels");
+  CM_STATIC_ERROR((lsc_check_cache_hint<LSCAction::Atomic, L1H, L3H>()),
+                  "unsupported cache hint");
+  constexpr DataSize _DS = lsc_expand_ds(lsc_data_size<T, DS>());
+  constexpr bool _Transposed = false;
+  uint64_t _Addr = (uint64_t)Ptr;
+  using _MessTy = decltype(lsc_data_type_ext<T, VS, N>());
+  using _RetTy = decltype(lsc_data_type<T, VS, N>());
+  auto _TmpRes =
+      __cm_intrinsic_impl_lsc_atomic_flat<Op, _MessTy, _DS, VS, _Transposed,
+                                          L1H, L3H>(Pred, _Addr, Offset, Src0);
+  return lsc_format_ret<_RetTy>(_TmpRes);
+}
+
+template <AtomicOp Op, typename T, VectorSize VS = VectorSize::N1,
+          DataSize DS = DataSize::Default,
+          CacheHint L1H = CacheHint::Default,
+          CacheHint L3H = CacheHint::Default, int N>
+CM_NODEBUG CM_INLINE auto
+cm_ptr_atomic(T *Ptr, vector<unsigned, N> Offset,
+              vector<T, N * details::lsc_vector_size<VS>()> Src0,
+              vector<T, N * details::lsc_vector_size<VS>()> Src1,
+              vector<ushort, N> Pred = 1) ->
+    typename std::enable_if<details::lsc_atomic_nsrcs<Op>() == 2,
+                            decltype(Src0)>::type {
+  CM_HAS_LSC_CONTROL;
+
+  using namespace details;
+  CM_STATIC_ERROR(lsc_check_simt<N>(), "unexpected number of channels");
+  CM_STATIC_ERROR((lsc_check_cache_hint<LSCAction::Atomic, L1H, L3H>()),
+                  "unsupported cache hint");
+  constexpr DataSize _DS = lsc_expand_ds(lsc_data_size<T, DS>());
+  using _MessTy = decltype(lsc_data_type_ext<T, VS, N>());
+  using _RetTy = decltype(lsc_data_type<T, VS, N>());
+  constexpr bool _Transposed = false;
+  uint64_t _Addr = (uint64_t)Ptr;
+  auto _TmpRes = __cm_intrinsic_impl_lsc_atomic_flat<Op, _MessTy, _DS, VS,
+                                                     _Transposed, L1H, L3H>(
+      Pred, _Addr, Offset, Src0, Src1);
+  return lsc_format_ret<_RetTy>(_TmpRes);
+}
+
+// bindless-address atomic
+template <AtomicOp Op, typename T, VectorSize VS = VectorSize::N1,
+          DataSize DS = DataSize::Default,
+          CacheHint L1H = CacheHint::Default,
+          CacheHint L3H = CacheHint::Default, int N>
+CM_NODEBUG CM_INLINE auto cm_offset_atomic(unsigned SurfaceOffset,
+                                           vector<unsigned, N> Offset,
+                                           vector<ushort, N> Pred = 1) ->
+    typename std::enable_if<
+        details::lsc_atomic_nsrcs<Op>() == 0,
+        vector<T, N * details::lsc_vector_size<VS>()> >::type {
+  CM_HAS_LSC_CONTROL;
+
+  using namespace details;
+  CM_STATIC_ERROR(lsc_check_simt<N>(), "unexpected number of channels");
+  CM_STATIC_ERROR((lsc_check_cache_hint<LSCAction::Atomic, L1H, L3H>()),
+                  "unsupported cache hint");
+  constexpr DataSize _DS = lsc_expand_ds(lsc_data_size<T, DS>());
+  constexpr bool _Transposed = false;
+  uint64_t _Addr = (uint64_t)SurfaceOffset;
+  using RetType = vector<T, N * details::lsc_vector_size<VS>()>;
+  return __cm_intrinsic_impl_lsc_atomic_bindless<Op, RetType, _DS, VS,
+                                                 _Transposed, L1H, L3H>(
+      Pred, _Addr, Offset);
+}
+
+template <AtomicOp Op, typename T, VectorSize VS = VectorSize::N1,
+          DataSize DS = DataSize::Default,
+          CacheHint L1H = CacheHint::Default,
+          CacheHint L3H = CacheHint::Default, int N>
+CM_NODEBUG CM_INLINE auto cm_atomic_slm(vector<unsigned, N> Offset,
+                                        vector<ushort, N> Pred = 1) ->
+    typename std::enable_if<
+        details::lsc_atomic_nsrcs<Op>() == 0,
+        vector<T, N * details::lsc_vector_size<VS>()> >::type {
+  CM_HAS_LSC_CONTROL;
+
+  using namespace details;
+  CM_STATIC_ERROR(lsc_check_simt<N>(), "unexpected number of channels");
+  CM_STATIC_ERROR((lsc_check_cache_hint<LSCAction::Atomic, L1H, L3H>()),
+                  "unsupported cache hint");
+  constexpr DataSize _DS = lsc_expand_ds(lsc_data_size<T, DS>());
+  constexpr bool _Transposed = false;
+  using _MessTy = decltype(lsc_data_type_ext<T, VS, N>());
+  using _RetTy = decltype(lsc_data_type<T, VS, N>());
+  auto _TmpRes =
+      __cm_intrinsic_impl_lsc_atomic_slm<Op, _MessTy, _DS, VS, _Transposed,
+                                         L1H, L3H>(Pred, Offset);
+  return lsc_format_ret<_RetTy>(_TmpRes);
+}
+
+template <AtomicOp Op, typename T, VectorSize VS = VectorSize::N1,
+          DataSize DS = DataSize::Default,
+          CacheHint L1H = CacheHint::Default,
+          CacheHint L3H = CacheHint::Default, int N>
+CM_NODEBUG CM_INLINE auto
+cm_atomic_slm(vector<unsigned, N> Offset,
+              vector<T, N * details::lsc_vector_size<VS>()> Src0,
+              vector<ushort, N> Pred = 1) ->
+    typename std::enable_if<details::lsc_atomic_nsrcs<Op>() == 1,
+                            decltype(Src0)>::type {
+  CM_HAS_LSC_CONTROL;
+
+  using namespace details;
+  CM_STATIC_ERROR(lsc_check_simt<N>(), "unexpected number of channels");
+  CM_STATIC_ERROR((lsc_check_cache_hint<LSCAction::Atomic, L1H, L3H>()),
+                  "unsupported cache hint");
+  constexpr DataSize _DS = lsc_expand_ds(lsc_data_size<T, DS>());
+  constexpr bool _Transposed = false;
+  using _MessTy = decltype(lsc_data_type_ext<T, VS, N>());
+  using _RetTy = decltype(lsc_data_type<T, VS, N>());
+  auto _TmpRes =
+      __cm_intrinsic_impl_lsc_atomic_slm<Op, _MessTy, _DS, VS, _Transposed,
+                                         L1H, L3H>(Pred, Offset, Src0);
+  return lsc_format_ret<_RetTy>(_TmpRes);
+}
+
+template <AtomicOp Op, typename T, VectorSize VS = VectorSize::N1,
+          DataSize DS = DataSize::Default,
+          CacheHint L1H = CacheHint::Default,
+          CacheHint L3H = CacheHint::Default, int N>
+CM_NODEBUG CM_INLINE auto
+cm_atomic_slm(vector<unsigned, N> Offset,
+              vector<T, N * details::lsc_vector_size<VS>()> Src0,
+              vector<T, N * details::lsc_vector_size<VS>()> Src1,
+              vector<ushort, N> Pred = 1) ->
+    typename std::enable_if<details::lsc_atomic_nsrcs<Op>() == 2,
+                            decltype(Src0)>::type {
+  CM_HAS_LSC_CONTROL;
+
+  using namespace details;
+  CM_STATIC_ERROR(lsc_check_simt<N>(), "unexpected number of channels");
+  CM_STATIC_ERROR((lsc_check_cache_hint<LSCAction::Atomic, L1H, L3H>()),
+                  "unsupported cache hint");
+  constexpr DataSize _DS = lsc_expand_ds(lsc_data_size<T, DS>());
+  using _MessTy = decltype(lsc_data_type_ext<T, VS, N>());
+  using _RetTy = decltype(lsc_data_type<T, VS, N>());
+  constexpr bool _Transposed = false;
+  auto _TmpRes =
+      __cm_intrinsic_impl_lsc_atomic_slm<Op, _MessTy, _DS, VS, _Transposed,
+                                         L1H, L3H>(Pred, Offset, Src0, Src1);
+  return lsc_format_ret<_RetTy>(_TmpRes);
+}
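+
+// Illustrative usage (assumed, not part of this change): atomic add on SLM,
+// 16 channels, assuming SLM was set up with cm_slm_init/cm_slm_alloc.
+//   vector<unsigned, 16> slmOff = 0; // per-lane SLM byte offsets
+//   vector<int, 16> one = 1;
+//   vector<int, 16> old = cm_atomic_slm<AtomicOp::IADD, int>(slmOff, one);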
+
+///
+/// LSC Fence built-in
+///
+/// \brief LSC Fence.
+///
+/// @param N The number of channels (platform dependent)
+///
+/// @param Sfid The shared function ID (the memory interface to fence)
+///
+/// @param FenceOp The fence operation
+///
+/// @param Scope The scope of the fence
+///
+template <LSC_SFID Sfid, LSC_FENCE_OP FenceOp, LSC_SCOPE Scope,
+          int N = details::lsc_default_simt()>
+CM_NODEBUG CM_INLINE void cm_fence(vector<ushort, N> Pred = 1) {
+  CM_HAS_LSC_CONTROL; // control platform version
+
+  using namespace details;
+  CM_STATIC_ERROR(lsc_check_simt<N>(), "unexpected number of channels");
+  __cm_intrinsic_impl_lsc_fence<Sfid, FenceOp, Scope>(Pred);
+}
+
+#endif // _CLANG_CM_LSC_H_
diff --git a/lib/Headers/cm/include/cm/cm_srnd.h b/lib/Headers/cm/include/cm/cm_srnd.h
new file mode 100644
index 000000000000..ab2d11b32dba
--- /dev/null
+++ b/lib/Headers/cm/include/cm/cm_srnd.h
@@ -0,0 +1,37 @@
+/*========================== begin_copyright_notice ============================
+
+Copyright (C) 2020-2021 Intel Corporation
+
+SPDX-License-Identifier: MIT
+
+============================= end_copyright_notice ===========================*/
+
+#if (__INCLUDE_LEVEL__ == 1)
+static_assert(0, "CM:w:cm_srnd.h should not be included explicitly - only "
+                 "<cm/cm.h> is required");
+#endif
+
+#ifndef _CLANG_CM_SRND_H_
+#define _CLANG_CM_SRND_H_
+
+#include "cm_common.h"
+#include "cm_internal.h"
+#include "cm_has_instr.h"
+
+template <typename T0, typename T1, int N>
+CM_NODEBUG CM_INLINE vector<T0, N> cm_srnd(vector<T1, N> src1,
+                                           vector<T1, N> src2) {
+  CM_HAS_STOCHASTIC_ROUNDING_CONTROL;
+
+  constexpr bool is_hf16_fp32 =
+      details::is_half_type<T0>::value && details::is_float_type<T1>::value;
+  CM_STATIC_ERROR(is_hf16_fp32, "unsupported srnd type");
+
+  {
+    vector<T1, N> _Src1 = src1;
+    vector<T1, N> _Src2 = src2;
+    return details::__cm_intrinsic_impl_srnd<T0, T1, N>(_Src1, _Src2);
+  }
+}
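+
+// Illustrative usage (assumed, not part of this change): stochastically
+// round fp32 data to half, with the random bits supplied as a second
+// fp32 vector.
+//   vector<float, 16> v = 0.5f, rnd = 0.0f; // filled elsewhere
+//   vector<half, 16> h = cm_srnd<half>(v, rnd);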
+
+#endif // _CLANG_CM_SRND_H_
diff --git a/lib/Headers/cm/include/cm/cm_svm.h b/lib/Headers/cm/include/cm/cm_svm.h
index 7234218d2a0e..1585203c9d96 100644
--- a/lib/Headers/cm/include/cm/cm_svm.h
+++ b/lib/Headers/cm/include/cm/cm_svm.h
@@ -406,7 +406,8 @@ constexpr bool checkSVMAtomic() {
   }
 
   // One source float operand.
-  if constexpr (Op == ATOMIC_FMAX || Op == ATOMIC_FMIN) {
+  if constexpr (Op == ATOMIC_FMAX || Op == ATOMIC_FMIN || Op == ATOMIC_FADD ||
+                Op == ATOMIC_FSUB) {
     if constexpr (NumSrc != 1) {
       CM_STATIC_ERROR(NumSrc == 1, "One source operand is expected");
       return false;
diff --git a/lib/Headers/cm/include/cm/cm_target.h b/lib/Headers/cm/include/cm/cm_target.h
index 991bab900957..237c1a9c8481 100644
--- a/lib/Headers/cm/include/cm/cm_target.h
+++ b/lib/Headers/cm/include/cm/cm_target.h
@@ -22,6 +22,21 @@ static_assert(0, "CM:w:cm_target.h should not be included explicitly - only "
 #define CM_HAS_VA_PLUS 1
 #endif //(CM_GENX >= 900 && CM_GENX <= 1150)
 
+#if !(CM_GENX == 1280 && CM_GENX_REVID <= 2) // PVC
+#define CM_HAS_LSC_NON_TRANSPOSE_MESSAGES_WITH_NON_DEFAULT_SIMT
+#endif // !(CM_GENX == 1280 && CM_GENX_REVID <= 2)
+
+// On PVC, non-transposed LSC messages have a SIMD32 layout, so 16-channel
+// non-transposed LSC messages with VectorSize != 1 are not supported there.
+#if !defined(CM_HAS_LSC_NON_TRANSPOSE_MESSAGES_WITH_NON_DEFAULT_SIMT)
+#define CM_HAS_LSC_NON_TRANSPOSE_MESSAGES_WITH_NON_DEFAULT_SIMT_CONTROL(N, VS) \
+  CM_STATIC_ERROR(                                                            \
+      N == details::lsc_default_simt() || VS == VectorSize::N1,               \
+      "unexpected number of channels for non-transpose lsc message");
+#else
+#define CM_HAS_LSC_NON_TRANSPOSE_MESSAGES_WITH_NON_DEFAULT_SIMT_CONTROL(N, VS)
+#endif //!defined(CM_HAS_LSC_NON_TRANSPOSE_MESSAGES_WITH_NON_DEFAULT_SIMT)
+
 // Make Gen target specific warnings into errors
 #pragma clang diagnostic error "-Wgen-target"
diff --git a/lib/Headers/cm/include/cm/cm_util.h b/lib/Headers/cm/include/cm/cm_util.h
index dd52c3952d5c..ae96a7222655 100644
--- a/lib/Headers/cm/include/cm/cm_util.h
+++ b/lib/Headers/cm/include/cm/cm_util.h
@@ -119,7 +119,8 @@ write_region(vector_ref<T, N> vec, vector<T, M> insertion, int offset) {
 }
 
 static inline constexpr unsigned getMaxNumOfOWordSLM() {
-#if defined(CM_GEN12)
+#if defined(CM_GEN12) || defined(CM_GEN12_2) || defined(CM_XEHP) || defined(CM_XEHPC) \
+  || defined(CM_XEHPG)
   return 16;
 #else
   return 8;
@@ -132,6 +133,251 @@ constexpr bool always_false() {
   return false;
 }
 
+template <VectorSize VS> constexpr unsigned lsc_vector_size() {
+  constexpr unsigned NElts[] = {0, 1, 2, 3, 4, 8, 16, 32, 64};
+  return NElts[static_cast<unsigned>(VS)];
+}
+
+template <int N> constexpr VectorSize lsc_vector_size() {
+  switch (N) {
+  case 1:
+    return VectorSize::N1;
+  case 2:
+    return VectorSize::N2;
+  case 3:
+    return VectorSize::N3;
+  case 4:
+    return VectorSize::N4;
+  case 8:
+    return VectorSize::N8;
+  case 16:
+    return VectorSize::N16;
+  case 32:
+    return VectorSize::N32;
+  case 64:
+    return VectorSize::N64;
+  default:
+    break;
+  }
+  return VectorSize::N0;
+}
+
+template <typename T, DataSize DS = DataSize::Default>
+constexpr DataSize lsc_data_size() {
+  if constexpr (DS != DataSize::Default)
+    return DS;
+  else if constexpr (sizeof(T) == 1)
+    return DataSize::U8;
+  else if constexpr (sizeof(T) == 2)
+    return DataSize::U16;
+  else if constexpr (sizeof(T) == 4)
+    return DataSize::U32;
+  else if constexpr (sizeof(T) == 8)
+    return DataSize::U64;
+  else if constexpr (DS == DataSize::Default)
+    static_assert(DS != DataSize::Default && "unsupported data type");
+  return DS;
+}
+
+template <typename T, VectorSize VS, int N = 1> auto lsc_data_type_ext() {
+  constexpr unsigned NumElts = lsc_vector_size<VS>() * N;
+  static_assert(NumElts > 0 && "unexpected number of elements");
+  if constexpr (sizeof(T) < 4)
+    return vector<uint32_t, NumElts>();
+  else
+    return vector<T, NumElts>();
+}
+
+template <typename T, VectorSize VS, int N = 1> auto lsc_data_type() {
+  constexpr unsigned NumElts = lsc_vector_size<VS>() * N;
+  static_assert(NumElts > 0 && "unexpected number of elements");
+  return vector<T, NumElts>();
+}
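+
+// Illustrative note (not part of this change): for T = char and
+// DS = DataSize::Default, lsc_data_size<char, DataSize::Default>() yields
+// DataSize::U8, which lsc_expand_ds() below widens to DataSize::U8U32. The
+// message type from lsc_data_type_ext<char, VectorSize::N4, 16>() is then
+// vector<uint32_t, 64>, while lsc_data_type<char, VectorSize::N4, 16>()
+// keeps the user-visible vector<char, 64>.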
+
+// U8 and U16 types are not supported;
+// use U8U32 and U16U32 instead.
+constexpr DataSize lsc_expand_ds(DataSize ds) {
+  if (ds == DataSize::U8)
+    return DataSize::U8U32;
+  if (ds == DataSize::U16)
+    return DataSize::U16U32;
+  return ds;
+}
+
+template <typename T> struct lsc_expand_type {
+  typedef typename std::conditional<sizeof(T) < 4, uint32_t, T>::type type;
+};
+
+// fp has to be bitcast to uint before zero-extension
+template <typename T> struct lsc_bitcast_type {
+private:
+  typedef
+      typename std::conditional<sizeof(T) == 2, uint16_t, uint8_t>::type _type1;
+  typedef typename std::conditional<std::is_floating_point<T>::value, _type1,
+                                    T>::type _type2;
+
+public:
+  typedef typename std::conditional<sizeof(T) < 4, _type2, T>::type type;
+};
+
+// format U8U32 and U16U32 back to U8 and U16
+template <typename To, typename From> To lsc_format_ret(From from) {
+  auto _Formatted = from.format<typename To::element_type>();
+  constexpr int stride = _Formatted.n_elems() / from.n_elems();
+  To _Res = _Formatted.select<from.n_elems(), stride>(0);
+  return _Res;
+}
+
+template <AtomicOp Op> constexpr int lsc_atomic_nsrcs() {
+  switch (Op) {
+  case AtomicOp::IINC:
+  case AtomicOp::IDEC:
+  case AtomicOp::LOAD:
+    return 0;
+  case AtomicOp::STORE:
+  case AtomicOp::IADD:
+  case AtomicOp::ISUB:
+  case AtomicOp::SMIN:
+  case AtomicOp::SMAX:
+  case AtomicOp::UMIN:
+  case AtomicOp::UMAX:
+  case AtomicOp::FSUB:
+  case AtomicOp::FMIN:
+  case AtomicOp::FMAX:
+  case AtomicOp::FADD:
+  case AtomicOp::AND:
+  case AtomicOp::OR:
+  case AtomicOp::XOR:
+    return 1;
+  case AtomicOp::ICAS:
+  case AtomicOp::FCAS:
+    return 2;
+  default:
+    break;
+  }
+  return 0;
+}
+
+// Compute the data size for 2d block load or store.
+template <typename T, int Height, int Width, int NBlocks, bool Transposed,
+          bool Transformed>
+constexpr int getBlock2dDataSize() {
+  if (Transformed)
+    return roundUpNextMultiple(Height, 4 / sizeof(T)) *
+           getNextPowerOf2(Width) * NBlocks;
+  return Width * Height * NBlocks;
+}
+
+constexpr int getRoundedWidthFor2dTypedLSC(int Width) {
+  return Width < 4 ? 4 : details::getNextPowerOf2(Width);
+}
+
+// Return the default SIMT width.
+template <typename T = void> constexpr int lsc_default_simt() {
+#if CM_GENX >= 1280
+  return 32; // SIMD32: PVC and later
+#else  // CM_GENX < 1280
+  return 16; // SIMD16: DG2
+#endif // CM_GENX >= 1280
+}
+
+// Check for valid SIMT width.
+template <int N> constexpr bool lsc_check_simt() {
+#if CM_GENX >= 1280
+  return ((N == 32) || (N == 16)); // SIMD32: PVC
+#else  // CM_GENX < 1280
+  return ((N == 16) || (N == 8)); // SIMD16: DG2
+#endif // CM_GENX >= 1280
+}
+
+template <CacheHint mHint> class CacheHintWrap {
+  template <CacheHint... Hints> class is_one_of_t;
+  template <CacheHint Last>
+  struct is_one_of_t<Last>
+      : std::conditional<Last == mHint, std::true_type,
+                         std::false_type>::type {
+  };
+  template <CacheHint Head, CacheHint... Tail>
+  struct is_one_of_t<Head, Tail...>
+      : std::conditional<Head == mHint, std::true_type,
+                         is_one_of_t<Tail...>>::type {};
+
+public:
+  constexpr operator CacheHint() const { return mHint; }
+  template <CacheHint... Hints> constexpr bool is_one_of() const {
+    return is_one_of_t<Hints...>::value;
+  }
+};
+
+constexpr bool are_both(CacheHint First, CacheHint Second, CacheHint Val) {
+  return First == Val && Second == Val;
+}
+
+enum class LSCAction {
+  Prefetch,
+  Load,
+  Store,
+  Atomic
+};
+
+template <LSCAction Act, CacheHint L1, CacheHint L3>
+constexpr bool lsc_check_cache_hint() {
+  constexpr auto L1H = CacheHintWrap<L1>{};
+  constexpr auto L3H = CacheHintWrap<L3>{};
+  switch (Act) {
+  case LSCAction::Prefetch:
+    return L1H.is_one_of<CacheHint::Cached, CacheHint::Uncached,
+                         CacheHint::Streaming>() &&
+           L3H.is_one_of<CacheHint::Cached, CacheHint::Uncached>() &&
+           !are_both(L1H, L3H, CacheHint::Uncached);
+  case LSCAction::Load:
+    return are_both(L1H, L3H, CacheHint::Default) ||
+           (L1H.is_one_of<CacheHint::Uncached, CacheHint::Cached,
+                          CacheHint::Streaming>() &&
+            L3H.is_one_of<CacheHint::Uncached, CacheHint::Cached>()) ||
+           (L1H == CacheHint::ReadInvalidate && L3H == CacheHint::Cached);
+  case LSCAction::Store:
+    return are_both(L1H, L3H, CacheHint::Default) ||
+           are_both(L1H, L3H, CacheHint::WriteBack) ||
+           (L1H.is_one_of<CacheHint::Uncached, CacheHint::WriteThrough,
+                          CacheHint::Streaming>() &&
+            L3H.is_one_of<CacheHint::Uncached, CacheHint::WriteBack>());
+
+  case LSCAction::Atomic:
+    return are_both(L1H, L3H, CacheHint::Default) ||
+           (L1H == CacheHint::Uncached &&
+            L3H.is_one_of<CacheHint::Uncached, CacheHint::WriteBack>());
+  }
+}
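+
+// Illustrative note (not part of this change): with these rules,
+//   lsc_check_cache_hint<LSCAction::Load,
+//                        CacheHint::Cached, CacheHint::Uncached>()  -> true
+//   lsc_check_cache_hint<LSCAction::Load,
+//                        CacheHint::ReadInvalidate,
+//                        CacheHint::Uncached>()                     -> false
+// since ReadInvalidate on L1 is only valid together with Cached on L3.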
+
+template <ChannelMaskType Mask>
+constexpr VectorSize lsc_get_vector_size_from_channel_mask() {
+  switch (Mask) {
+  case ChannelMaskType::_CM_R_ENABLE:
+  case ChannelMaskType::_CM_G_ENABLE:
+  case ChannelMaskType::_CM_B_ENABLE:
+  case ChannelMaskType::_CM_A_ENABLE:
+    return VectorSize::N1;
+  case ChannelMaskType::_CM_GR_ENABLE:
+  case ChannelMaskType::_CM_BR_ENABLE:
+  case ChannelMaskType::_CM_BG_ENABLE:
+  case ChannelMaskType::_CM_AR_ENABLE:
+  case ChannelMaskType::_CM_AG_ENABLE:
+  case ChannelMaskType::_CM_AB_ENABLE:
+    return VectorSize::N2;
+  case ChannelMaskType::_CM_BGR_ENABLE:
+  case ChannelMaskType::_CM_AGR_ENABLE:
+  case ChannelMaskType::_CM_ABR_ENABLE:
+  case ChannelMaskType::_CM_ABG_ENABLE:
+    return VectorSize::N3;
+  case ChannelMaskType::_CM_ABGR_ENABLE:
+    return VectorSize::N4;
+  default:
+    break;
+  }
+  return VectorSize::N0;
+}
+
+template <ChannelMaskType Mask>
+constexpr unsigned lsc_get_num_elements_from_channel_mask() {
+  constexpr auto _VS = lsc_get_vector_size_from_channel_mask<Mask>();
+  return lsc_vector_size<_VS>();
+}
 } // namespace details
diff --git a/lib/Headers/cm/include/cm/cm_vme.h b/lib/Headers/cm/include/cm/cm_vme.h
index 7d02c29e09eb..7ba511f9dfb2 100644
--- a/lib/Headers/cm/include/cm/cm_vme.h
+++ b/lib/Headers/cm/include/cm/cm_vme.h
@@ -28,8 +28,6 @@ static_assert(0, "CM:w:cm_vme.h should not be included explicitly - only "
 #elif defined(CM_GEN12)
 #include "gen12_vme.h"
 #else
-
-
 #endif
 
 #endif /* _CLANG_CM_VME_H_ */
diff --git a/tools/cmoc/Backend.cpp b/tools/cmoc/Backend.cpp
index 3537577d8309..0a9a77708c17 100644
--- a/tools/cmoc/Backend.cpp
+++ b/tools/cmoc/Backend.cpp
@@ -83,6 +83,11 @@ const std::unordered_map<std::string, std::string> CmToNeoCPU{
     {"RKL", "rkl"},
     {"DG1", "dg1"},
     {"XEHP_SDV", "xe_hp_sdv"},
+    {"DG2", "dg2"},
+    {"ADLS", "adls"},
+    {"ADLP", "adlp"},
+    {"PVC", "pvc"},
+    {"PVCXT", "pvc"},
 };
 // clang-format on