VapourSynth-llvmexpr
Loading...
Searching...
No Matches
IRGeneratorBase.cpp
Go to the documentation of this file.
1
19
20#include "IRGeneratorBase.hpp"
21#include "../Sorting.hpp"
22
23#include <algorithm>
24#include <array>
25#include <format>
26#include <map>
27#include <numbers>
28#include <numeric>
29#include <unordered_map>
30
31#include "llvm/IR/Constants.h"
32#include "llvm/IR/Instructions.h"
33#include "llvm/TargetParser/Host.h"
34
35constexpr unsigned ALIGNMENT = 32; // Vapoursynth should guarantee this
36
38 const std::vector<Token>& tokens_in, const VSVideoInfo* out_vi,
39 const std::vector<const VSVideoInfo*>& in_vi, int width_in, int height_in,
40 bool mirror, const std::map<std::pair<int, std::string>, int>& p_map,
41 const analysis::ExpressionAnalysisResults& analysis_results_in,
42 llvm::LLVMContext& context_ref, llvm::Module& module_ref,
43 llvm::IRBuilder<>& builder_ref, MathLibraryManager& math_mgr,
44 std::string func_name_in, int approx_math_in)
45 : tokens(tokens_in), vo(out_vi), vi(in_vi),
46 num_inputs(static_cast<int>(in_vi.size())), width(width_in),
47 height(height_in), mirror_boundary(mirror), prop_map(p_map),
48 analysis_results(analysis_results_in), func_name(std::move(func_name_in)),
49 approx_math(approx_math_in), context(context_ref), module(module_ref),
50 builder(builder_ref), math_manager(math_mgr), func(nullptr),
51 rwptrs_arg(nullptr), strides_arg(nullptr), props_arg(nullptr),
52 alias_scope_domain(nullptr) {}
53
58
59llvm::AllocaInst*
61 const std::string& name) {
62 llvm::IRBuilder<> entry_builder(&func->getEntryBlock(),
63 func->getEntryBlock().begin());
64 return entry_builder.CreateAlloca(type, nullptr, name);
65}
66
67void IRGeneratorBase::assumeAligned(llvm::Value* ptr_value,
68 unsigned alignment) {
69 llvm::Function* assume_fn = llvm::Intrinsic::getOrInsertDeclaration(
70 &module, llvm::Intrinsic::assume);
71 llvm::Value* cond = builder.getInt1(true);
72 llvm::SmallVector<llvm::Value*, 2> args;
73 args.push_back(ptr_value);
74 args.push_back(builder.getInt64(static_cast<uint64_t>(alignment)));
75 llvm::OperandBundleDefT<llvm::Value*> align_bundle("align", args);
76 builder.CreateCall(assume_fn, {cond}, {align_bundle});
77}
78
79llvm::Value* IRGeneratorBase::getFinalCoord(llvm::Value* coord,
80 llvm::Value* max_dim,
81 bool use_mirror) {
82 llvm::Value* zero = builder.getInt32(0);
83 llvm::Value* one = builder.getInt32(1);
84
85 llvm::Value* result = nullptr;
86 if (use_mirror) {
87 auto* period = builder.CreateMul(max_dim, builder.getInt32(2));
88
89 auto* modulo_coord = builder.CreateSRem(coord, period);
90
91 auto* is_negative = builder.CreateICmpSLT(modulo_coord, zero);
92 auto* adjusted_modulo = builder.CreateAdd(modulo_coord, period);
93 modulo_coord =
94 builder.CreateSelect(is_negative, adjusted_modulo, modulo_coord);
95
96 auto* in_first_half = builder.CreateICmpSLT(modulo_coord, max_dim);
97 auto* period_minus_1 = builder.CreateSub(period, one);
98 auto* mirrored_coord = builder.CreateSub(period_minus_1, modulo_coord);
99
100 result =
101 builder.CreateSelect(in_first_half, modulo_coord, mirrored_coord);
102 } else { // Clamping
103 // clamp(coord, 0, max_dim - 1)
104 auto* dim_minus_1 = builder.CreateSub(max_dim, one);
105
106 llvm::Function* smax_func = llvm::Intrinsic::getOrInsertDeclaration(
107 &module, llvm::Intrinsic::smax, {builder.getInt32Ty()});
108 llvm::Function* smin_func = llvm::Intrinsic::getOrInsertDeclaration(
109 &module, llvm::Intrinsic::smin, {builder.getInt32Ty()});
110
111 auto* clamped_at_zero = builder.CreateCall(smax_func, {coord, zero});
112 result = builder.CreateCall(smin_func, {clamped_at_zero, dim_minus_1});
113 }
114
115 return result;
116}
117
118llvm::Value* IRGeneratorBase::generateLoadFromRowPtr(llvm::Value* row_ptr,
119 int clip_idx,
120 llvm::Value* x, int rel_x,
121 bool use_mirror,
122 bool no_x_bounds_check) {
123 const VSVideoInfo* vinfo = vi[clip_idx];
124 llvm::Value* coord_x = builder.CreateAdd(x, builder.getInt32(rel_x));
125 llvm::Value* final_x = nullptr;
126 if (no_x_bounds_check) {
127 final_x = coord_x;
128 } else {
129 final_x = getFinalCoord(coord_x, builder.getInt32(width), use_mirror);
130 }
131
132 const VSVideoFormat& format = vinfo->format;
133 int bpp = format.bytesPerSample;
134 int vs_clip_idx = clip_idx + 1;
135
136 llvm::Value* x_offset = builder.CreateMul(final_x, builder.getInt32(bpp));
137 llvm::Value* pixel_addr =
138 builder.CreateGEP(builder.getInt8Ty(), row_ptr, x_offset);
139
140 unsigned pixel_align = std::gcd(ALIGNMENT, bpp);
141 assumeAligned(pixel_addr, pixel_align);
142
143 llvm::Value* loaded_val = nullptr;
144 if (format.sampleType == stInteger) {
145 llvm::Type* load_type = nullptr;
146 if (bpp == 1) {
147 load_type = builder.getInt8Ty();
148 } else if (bpp == 2) {
149 load_type = builder.getInt16Ty();
150 } else {
151 load_type = builder.getInt32Ty();
152 }
153 llvm::LoadInst* li = builder.CreateLoad(load_type, pixel_addr);
154 setMemoryInstAttrs(li, pixel_align, vs_clip_idx);
155 loaded_val = builder.CreateZExtOrBitCast(li, builder.getInt32Ty());
156 return builder.CreateUIToFP(loaded_val, builder.getFloatTy());
157 }
158 // stFloat
159 if (bpp == 4) {
160 llvm::LoadInst* li =
161 builder.CreateLoad(builder.getFloatTy(), pixel_addr);
162 setMemoryInstAttrs(li, pixel_align, vs_clip_idx);
163 return li;
164 }
165 if (bpp == 2) {
166 llvm::LoadInst* li =
167 builder.CreateLoad(builder.getHalfTy(), pixel_addr);
168 setMemoryInstAttrs(li, pixel_align, vs_clip_idx);
169 return builder.CreateFPExt(li, builder.getFloatTy());
170 }
171 throw std::runtime_error("Unsupported float sample size.");
172}
173
175 llvm::BranchInst* loop_br) { // NOLINT(readability-non-const-parameter)
176 llvm::StringMap<bool> host_features = llvm::sys::getHostCPUFeatures();
177 unsigned simd_width = 4;
178 if (!host_features.empty()) {
179 if (host_features["avx512f"]) {
180 simd_width = 16; // NOLINT(cppcoreguidelines-avoid-magic-numbers)
181 } else if (host_features["avx2"]) {
182 simd_width = 8; // NOLINT(cppcoreguidelines-avoid-magic-numbers)
183 }
184 }
185
186 auto create_md_node = [this](const char* name, llvm::Type* type,
187 uint64_t value) -> llvm::MDNode* {
188 std::array<llvm::Metadata*, 2> md = {
189 llvm::MDString::get(context, name),
190 llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(type, value))};
191 return llvm::MDNode::get(context, md);
192 };
193
194 llvm::MDNode* vec_width_node =
195 create_md_node("llvm.loop.vectorize.width",
196 llvm::Type::getInt32Ty(context), simd_width);
197
198 llvm::MDNode* enable_vec_node = create_md_node(
199 "llvm.loop.vectorize.enable", llvm::Type::getInt1Ty(context), 1);
200
201 llvm::MDNode* interleave_node = create_md_node(
202 "llvm.loop.interleave.count", llvm::Type::getInt32Ty(context), 4);
203
204 llvm::SmallVector<llvm::Metadata*,
205 5> // NOLINT(cppcoreguidelines-avoid-magic-numbers)
206 loop_md_elems;
207 loop_md_elems.push_back(nullptr); // to be replaced with self reference
208 loop_md_elems.push_back(enable_vec_node);
209 loop_md_elems.push_back(vec_width_node);
210 loop_md_elems.push_back(interleave_node);
211 llvm::MDNode* loop_id = llvm::MDNode::getDistinct(context, loop_md_elems);
212 loop_id->replaceOperandWith(0, loop_id);
213
214 loop_br->setMetadata(llvm::LLVMContext::MD_loop, loop_id);
215}
216
217llvm::Value* IRGeneratorBase::generatePixelLoad(int clip_idx, llvm::Value* x,
218 llvm::Value* y, bool mirror) {
219 llvm::Value* final_x = getFinalCoord(x, builder.getInt32(width), mirror);
220 llvm::Value* final_y = getFinalCoord(y, builder.getInt32(height), mirror);
221
222 int vs_clip_idx = clip_idx + 1;
223 llvm::Value* base_ptr = preloaded_base_ptrs[vs_clip_idx];
224 llvm::Value* stride = preloaded_strides[vs_clip_idx];
225
226 llvm::Value* y_offset = builder.CreateMul(final_y, stride);
227 llvm::Value* row_ptr =
228 builder.CreateGEP(builder.getInt8Ty(), base_ptr, y_offset);
229
230 return generateLoadFromRowPtr(row_ptr, clip_idx, final_x, 0, mirror, true);
231}
232
233void IRGeneratorBase::generatePixelStore(llvm::Value* value_to_store,
234 llvm::Value* x, llvm::Value* y) {
235 const VSVideoFormat& format = vo->format;
236 int bpp = format.bytesPerSample;
237 constexpr int DST_IDX = 0;
238
239 llvm::Value* base_ptr = preloaded_base_ptrs[DST_IDX];
240 llvm::Value* stride = preloaded_strides[DST_IDX];
241
242 llvm::Value* y_offset = builder.CreateMul(y, stride);
243 llvm::Value* x_offset = builder.CreateMul(x, builder.getInt32(bpp));
244 llvm::Value* total_offset = builder.CreateAdd(y_offset, x_offset);
245 llvm::Value* pixel_addr =
246 builder.CreateGEP(builder.getInt8Ty(), base_ptr, total_offset);
247
248 unsigned pixel_align = std::gcd(ALIGNMENT, bpp);
249 assumeAligned(pixel_addr, pixel_align);
250
251 llvm::Value* final_val = nullptr;
252 if (format.sampleType == stInteger) {
253 int max_val = (1 << format.bitsPerSample) - 1;
254 llvm::Value* zero_f = llvm::ConstantFP::get(builder.getFloatTy(), 0.0);
255 llvm::Value* max_f = llvm::ConstantFP::get(
256 builder.getFloatTy(), static_cast<double>(max_val));
257
258 llvm::Value* temp = createIntrinsicCall(llvm::Intrinsic::maxnum,
259 value_to_store, zero_f);
260 llvm::Value* clamped_f =
261 createIntrinsicCall(llvm::Intrinsic::minnum, temp, max_f);
262
263 llvm::Value* rounded_f =
264 createIntrinsicCall(llvm::Intrinsic::roundeven, clamped_f);
265
266 llvm::Type* store_type = nullptr;
267 if (bpp == 1) {
268 store_type = builder.getInt8Ty();
269 } else if (bpp == 2) {
270 store_type = builder.getInt16Ty();
271 } else {
272 store_type = builder.getInt32Ty();
273 }
274 final_val = builder.CreateFPToUI(rounded_f, store_type);
275 llvm::StoreInst* si = builder.CreateStore(final_val, pixel_addr);
276 setMemoryInstAttrs(si, pixel_align, DST_IDX);
277 } else {
278 if (bpp == 4) {
279 llvm::StoreInst* si =
280 builder.CreateStore(value_to_store, pixel_addr);
281 setMemoryInstAttrs(si, pixel_align, DST_IDX);
282 } else if (bpp == 2) {
283 llvm::Value* truncated_val =
284 builder.CreateFPTrunc(value_to_store, builder.getHalfTy());
285 llvm::StoreInst* si =
286 builder.CreateStore(truncated_val, pixel_addr);
287 setMemoryInstAttrs(si, pixel_align, DST_IDX);
288 } else {
289 throw std::runtime_error("Unsupported float sample size.");
290 }
291 }
292}
293
295 std::vector<llvm::Value*>& rpn_stack,
296 llvm::Type* float_ty,
297 llvm::Type* i32_ty,
298 bool use_approx_math) {
299 auto apply_stack_op = [&]<size_t ARITY>(auto&& op) {
300 std::array<llvm::Value*, ARITY> args{};
301 for (size_t i = ARITY; i > 0; --i) {
302 args.at(i - 1) = rpn_stack.back();
303 rpn_stack.pop_back();
304 }
305 rpn_stack.push_back(std::apply(op, args));
306 };
307
308 auto apply_intrinsic = [&]<size_t ARITY>(llvm::Intrinsic::ID id) {
309 apply_stack_op.operator()<ARITY>(
310 [&](auto... args) { return createIntrinsicCall(id, args...); });
311 };
312
313 auto apply_binary_op = [&](auto op_callable) {
314 apply_stack_op.operator()<2>(
315 [&](auto a, auto b) { return op_callable(a, b); });
316 };
317
318 auto apply_binary_cmp = [&](llvm::CmpInst::Predicate pred) {
319 apply_stack_op.operator()<2>([&](auto a, auto b) {
320 auto cmp = builder.CreateFCmp(pred, a, b);
321 return builder.CreateSelect(cmp,
322 llvm::ConstantFP::get(float_ty, 1.0),
323 llvm::ConstantFP::get(float_ty, 0.0));
324 });
325 };
326
327 auto apply_logical_op = [&](auto op) {
328 apply_stack_op.operator()<2>([&](auto a_val, auto b_val) {
329 auto a_bool = builder.CreateFCmpOGT(
330 a_val, llvm::ConstantFP::get(float_ty, 0.0));
331 auto b_bool = builder.CreateFCmpOGT(
332 b_val, llvm::ConstantFP::get(float_ty, 0.0));
333 auto logic_res = op(a_bool, b_bool);
334 return builder.CreateSelect(logic_res,
335 llvm::ConstantFP::get(float_ty, 1.0),
336 llvm::ConstantFP::get(float_ty, 0.0));
337 });
338 };
339
340 auto apply_bitwise_op = [&](auto op) {
341 apply_stack_op.operator()<2>([&](auto a, auto b) {
342 auto a_rounded = createIntrinsicCall(llvm::Intrinsic::nearbyint, a);
343 auto b_rounded = createIntrinsicCall(llvm::Intrinsic::nearbyint, b);
344 auto ai = builder.CreateFPToSI(a_rounded, i32_ty);
345 auto bi = builder.CreateFPToSI(b_rounded, i32_ty);
346 auto resi = op(ai, bi);
347 return builder.CreateSIToFP(resi, float_ty);
348 });
349 };
350
351 auto apply_approx_math_op =
352 [&]<size_t ARITY>(MathOp math_op, llvm::Intrinsic::ID intrinsic_id) {
353 static_assert(ARITY == 1 || ARITY == 2,
354 "Only unary or binary operations supported");
355
356 std::array<llvm::Value*, ARITY> args{};
357 for (size_t i = 0; i < ARITY; ++i) {
358 args.at(ARITY - 1 - i) = rpn_stack.back();
359 rpn_stack.pop_back();
360 }
361
362 if (use_approx_math) {
363 auto* callee = math_manager.getFunction(math_op);
364 llvm::SmallVector<llvm::Value*, 2> call_args(args.begin(),
365 args.end());
366 auto* call = builder.CreateCall(callee, call_args);
367 call->setFastMathFlags(builder.getFastMathFlags());
368 rpn_stack.push_back(call);
369 } else {
370 rpn_stack.push_back(std::apply(
371 [&](auto... args) {
372 return createIntrinsicCall(intrinsic_id, args...);
373 },
374 args));
375 }
376 };
377
378 switch (token.type) {
379 case TokenType::Number: {
380 const auto& payload = std::get<TokenPayloadNumber>(token.payload);
381 rpn_stack.push_back(llvm::ConstantFP::get(float_ty, payload.value));
382 return true;
383 }
385 rpn_stack.push_back(
386 builder.CreateSIToFP(builder.getInt32(width), float_ty));
387 return true;
389 rpn_stack.push_back(
390 builder.CreateSIToFP(builder.getInt32(height), float_ty));
391 return true;
393 rpn_stack.push_back(builder.CreateLoad(
394 float_ty,
395 builder.CreateGEP(float_ty, props_arg, builder.getInt32(0))));
396 return true;
398 rpn_stack.push_back(llvm::ConstantFP::get(float_ty, std::numbers::pi));
399 return true;
400
401 // Binary Operators
402 case TokenType::Add:
403 apply_binary_op([&](llvm::Value* a, llvm::Value* b) {
404 return builder.CreateFAdd(a, b);
405 });
406 return true;
407 case TokenType::Sub:
408 apply_binary_op([&](llvm::Value* a, llvm::Value* b) {
409 return builder.CreateFSub(a, b);
410 });
411 return true;
412 case TokenType::Mul:
413 apply_binary_op([&](llvm::Value* a, llvm::Value* b) {
414 return builder.CreateFMul(a, b);
415 });
416 return true;
417 case TokenType::Div:
418 apply_binary_op([&](llvm::Value* a, llvm::Value* b) {
419 return builder.CreateFDiv(a, b);
420 });
421 return true;
422 case TokenType::Mod:
423 apply_binary_op([&](llvm::Value* a, llvm::Value* b) {
424 return builder.CreateFRem(a, b);
425 });
426 return true;
427 case TokenType::Pow:
428 apply_intrinsic.operator()<2>(llvm::Intrinsic::pow);
429 return true;
430 case TokenType::Atan2:
431 apply_approx_math_op.operator()<2>(MathOp::Atan2,
432 llvm::Intrinsic::atan2);
433 return true;
435 apply_intrinsic.operator()<2>(llvm::Intrinsic::copysign);
436 return true;
437 case TokenType::Min:
438 apply_intrinsic.operator()<2>(llvm::Intrinsic::minnum);
439 return true;
440 case TokenType::Max:
441 apply_intrinsic.operator()<2>(llvm::Intrinsic::maxnum);
442 return true;
443
444 // Binary comparisons
445 case TokenType::Gt:
446 apply_binary_cmp(llvm::CmpInst::FCMP_OGT);
447 return true;
448 case TokenType::Lt:
449 apply_binary_cmp(llvm::CmpInst::FCMP_OLT);
450 return true;
451 case TokenType::Ge:
452 apply_binary_cmp(llvm::CmpInst::FCMP_OGE);
453 return true;
454 case TokenType::Le:
455 apply_binary_cmp(llvm::CmpInst::FCMP_OLE);
456 return true;
457 case TokenType::Eq:
458 apply_binary_cmp(llvm::CmpInst::FCMP_OEQ);
459 return true;
460
461 // Logical ops
462 case TokenType::And:
463 apply_logical_op(
464 [&](auto a, auto b) { return builder.CreateAnd(a, b); });
465 return true;
466 case TokenType::Or:
467 apply_logical_op(
468 [&](auto a, auto b) { return builder.CreateOr(a, b); });
469 return true;
470 case TokenType::Xor:
471 apply_logical_op(
472 [&](auto a, auto b) { return builder.CreateXor(a, b); });
473 return true;
474
475 // Bitwise ops
477 apply_bitwise_op(
478 [&](auto a, auto b) { return builder.CreateAnd(a, b); });
479 return true;
480 case TokenType::Bitor:
481 apply_bitwise_op(
482 [&](auto a, auto b) { return builder.CreateOr(a, b); });
483 return true;
485 apply_bitwise_op(
486 [&](auto a, auto b) { return builder.CreateXor(a, b); });
487 return true;
488
489 // Unary Operators
490 case TokenType::Sqrt: {
491 auto* a = rpn_stack.back();
492 rpn_stack.pop_back();
493 auto* zero = llvm::ConstantFP::get(float_ty, 0.0);
494 auto* max_val = createIntrinsicCall(llvm::Intrinsic::maxnum, a, zero);
495 rpn_stack.push_back(
496 createIntrinsicCall(llvm::Intrinsic::sqrt, max_val));
497 return true;
498 }
499 case TokenType::Exp:
500 apply_approx_math_op.operator()<1>(MathOp::Exp, llvm::Intrinsic::exp);
501 return true;
502 case TokenType::Log:
503 apply_approx_math_op.operator()<1>(MathOp::Log, llvm::Intrinsic::log);
504 return true;
505 case TokenType::Abs:
506 apply_intrinsic.operator()<1>(llvm::Intrinsic::fabs);
507 return true;
508 case TokenType::Floor:
509 apply_intrinsic.operator()<1>(llvm::Intrinsic::floor);
510 return true;
511 case TokenType::Ceil:
512 apply_intrinsic.operator()<1>(llvm::Intrinsic::ceil);
513 return true;
514 case TokenType::Trunc:
515 apply_intrinsic.operator()<1>(llvm::Intrinsic::trunc);
516 return true;
517 case TokenType::Round:
518 apply_intrinsic.operator()<1>(llvm::Intrinsic::round);
519 return true;
520 case TokenType::Sin:
521 apply_approx_math_op.operator()<1>(MathOp::Sin, llvm::Intrinsic::sin);
522 return true;
523 case TokenType::Cos:
524 apply_approx_math_op.operator()<1>(MathOp::Cos, llvm::Intrinsic::cos);
525 return true;
526 case TokenType::Tan:
527 apply_approx_math_op.operator()<1>(MathOp::Tan, llvm::Intrinsic::tan);
528 return true;
529 case TokenType::Asin:
530 apply_approx_math_op.operator()<1>(MathOp::Asin, llvm::Intrinsic::asin);
531 return true;
532 case TokenType::Acos:
533 apply_approx_math_op.operator()<1>(MathOp::Acos, llvm::Intrinsic::acos);
534 return true;
535 case TokenType::Atan:
536 apply_approx_math_op.operator()<1>(MathOp::Atan, llvm::Intrinsic::atan);
537 return true;
538 case TokenType::Exp2:
539 apply_intrinsic.operator()<1>(llvm::Intrinsic::exp2);
540 return true;
541 case TokenType::Log10:
542 apply_intrinsic.operator()<1>(llvm::Intrinsic::log10);
543 return true;
544 case TokenType::Log2:
545 apply_intrinsic.operator()<1>(llvm::Intrinsic::log2);
546 return true;
547 case TokenType::Sinh:
548 apply_intrinsic.operator()<1>(llvm::Intrinsic::sinh);
549 return true;
550 case TokenType::Cosh:
551 apply_intrinsic.operator()<1>(llvm::Intrinsic::cosh);
552 return true;
553 case TokenType::Tanh:
554 apply_intrinsic.operator()<1>(llvm::Intrinsic::tanh);
555 return true;
556 case TokenType::Sgn: {
557 auto* x = rpn_stack.back();
558 rpn_stack.pop_back();
559 auto* zero = llvm::ConstantFP::get(float_ty, 0.0);
560 auto* one = llvm::ConstantFP::get(float_ty, 1.0);
561 auto* nonzero = builder.CreateFCmpONE(x, zero);
562 auto* sign = builder.CreateCall(
563 llvm::Intrinsic::getOrInsertDeclaration(
564 &module, llvm::Intrinsic::copysign, {float_ty}),
565 {one, x});
566 rpn_stack.push_back(builder.CreateSelect(nonzero, sign, zero));
567 return true;
568 }
569 case TokenType::Neg: {
570 auto* a = rpn_stack.back();
571 rpn_stack.pop_back();
572 rpn_stack.push_back(builder.CreateFNeg(a));
573 return true;
574 }
575 case TokenType::Not: {
576 auto* a = rpn_stack.back();
577 rpn_stack.pop_back();
578 rpn_stack.push_back(builder.CreateSelect(
579 builder.CreateFCmpOLE(a, llvm::ConstantFP::get(float_ty, 0.0)),
580 llvm::ConstantFP::get(float_ty, 1.0),
581 llvm::ConstantFP::get(float_ty, 0.0)));
582 return true;
583 }
584 case TokenType::Bitnot: {
585 auto* a = rpn_stack.back();
586 rpn_stack.pop_back();
587 auto* a_rounded = createIntrinsicCall(llvm::Intrinsic::nearbyint, a);
588 rpn_stack.push_back(builder.CreateSIToFP(
589 builder.CreateNot(builder.CreateFPToSI(a_rounded, i32_ty)),
590 float_ty));
591 return true;
592 }
593
594 // Ternary and other multi-arg
595 case TokenType::Ternary: {
596 auto* c = rpn_stack.back();
597 rpn_stack.pop_back();
598 auto* b = rpn_stack.back();
599 rpn_stack.pop_back();
600 auto* a = rpn_stack.back();
601 rpn_stack.pop_back();
602 rpn_stack.push_back(builder.CreateSelect(
603 builder.CreateFCmpOGT(a, llvm::ConstantFP::get(float_ty, 0.0)), b,
604 c));
605 return true;
606 }
607 case TokenType::Clip:
608 case TokenType::Clamp: {
609 auto* max_val = rpn_stack.back();
610 rpn_stack.pop_back();
611 auto* min_val = rpn_stack.back();
612 rpn_stack.pop_back();
613 auto* val = rpn_stack.back();
614 rpn_stack.pop_back();
615 auto* temp = createIntrinsicCall(llvm::Intrinsic::maxnum, val, min_val);
616 auto* clamped =
617 createIntrinsicCall(llvm::Intrinsic::minnum, temp, max_val);
618 rpn_stack.push_back(clamped);
619 return true;
620 }
621 case TokenType::Fma: {
622 auto* c = rpn_stack.back();
623 rpn_stack.pop_back();
624 auto* b = rpn_stack.back();
625 rpn_stack.pop_back();
626 auto* a = rpn_stack.back();
627 rpn_stack.pop_back();
628 rpn_stack.push_back(builder.CreateCall(
629 llvm::Intrinsic::getOrInsertDeclaration(
630 &module, llvm::Intrinsic::fma, {builder.getFloatTy()}),
631 {a, b, c}));
632 return true;
633 }
634
635 // Stack manipulation
636 case TokenType::Dup: {
637 const auto& payload = std::get<TokenPayloadStackOp>(token.payload);
638 rpn_stack.push_back(rpn_stack[rpn_stack.size() - 1 - payload.n]);
639 return true;
640 }
641 case TokenType::Drop: {
642 const auto& payload = std::get<TokenPayloadStackOp>(token.payload);
643 if (payload.n > 0) {
644 rpn_stack.resize(rpn_stack.size() - payload.n);
645 }
646 return true;
647 }
648 case TokenType::Swap: {
649 const auto& payload = std::get<TokenPayloadStackOp>(token.payload);
650 std::swap(rpn_stack.back(),
651 rpn_stack[rpn_stack.size() - 1 - payload.n]);
652 return true;
653 }
654 case TokenType::SortN: {
655 const auto& payload = std::get<TokenPayloadStackOp>(token.payload);
656 int n = payload.n;
657 if (n < 2) {
658 return true;
659 }
660
661 std::vector<llvm::Value*> values;
662 values.reserve(n);
663 for (int k = 0; k < n; ++k) {
664 values.push_back(rpn_stack.back());
665 rpn_stack.pop_back();
666 }
667
668 auto compare_swap = [&](int i_idx, int j_idx) {
669 llvm::Value* val_i = values[i_idx];
670 llvm::Value* val_j = values[j_idx];
671 llvm::Value* cond = builder.CreateFCmpOGT(val_i, val_j);
672 values[i_idx] = builder.CreateSelect(cond, val_j, val_i); // min
673 values[j_idx] = builder.CreateSelect(cond, val_i, val_j); // max
674 };
675
676 auto network = get_sorting_network(n);
677 for (const auto& pair : network) {
678 compare_swap(pair.first, pair.second);
679 }
680
681 for (int k = n - 1; k >= 0; --k) {
682 rpn_stack.push_back(values[k]);
683 }
684 return true;
685 }
687 case TokenType::ArgmaxN: {
688 const auto& payload = std::get<TokenPayloadStackOp>(token.payload);
689 int n = payload.n;
690 if (n < 1) {
691 rpn_stack.push_back(
692 llvm::ConstantFP::get(builder.getFloatTy(), 0.0));
693 return true;
694 }
695
696 std::vector<llvm::Value*> values(n);
697 for (int i = 0; i < n; ++i) {
698 values[i] = rpn_stack.back();
699 rpn_stack.pop_back();
700 }
701
702 struct Node {
703 llvm::Value* val;
704 llvm::Value* idx;
705 };
706 std::vector<Node> current_level;
707 current_level.reserve(n);
708 for (int i = 0; i < n; ++i) {
709 current_level.push_back(
710 {values[i],
711 llvm::ConstantFP::get(builder.getFloatTy(),
712 static_cast<double>(n - 1 - i))});
713 }
714
715 bool is_max = (token.type == TokenType::ArgmaxN);
716
717 while (current_level.size() > 1) {
718 std::vector<Node> next_level;
719 for (size_t i = 0; i < current_level.size(); i += 2) {
720 if (i + 1 < current_level.size()) {
721 const auto& left = current_level[i];
722 const auto& right = current_level[i + 1];
723
724 llvm::Value* cmp_val =
725 is_max ? builder.CreateFCmpOGT(left.val, right.val)
726 : builder.CreateFCmpOLT(left.val, right.val);
727
728 llvm::Value* eq_val =
729 builder.CreateFCmpOEQ(left.val, right.val);
730 llvm::Value* cmp_idx =
731 builder.CreateFCmpOLT(left.idx, right.idx);
732 llvm::Value* tie_break = builder.CreateAnd(eq_val, cmp_idx);
733 llvm::Value* cond = builder.CreateOr(cmp_val, tie_break);
734
735 next_level.push_back(
736 {builder.CreateSelect(cond, left.val, right.val),
737 builder.CreateSelect(cond, left.idx, right.idx)});
738 } else {
739 next_level.push_back(current_level[i]);
740 }
741 }
742 current_level = std::move(next_level);
743 }
744 rpn_stack.push_back(current_level[0].idx);
745 return true;
746 }
747 case TokenType::ArgsortN: {
748 const auto& payload = std::get<TokenPayloadStackOp>(token.payload);
749 int n = payload.n;
750 if (n < 1) {
751 return true;
752 }
753 if (n == 1) {
754 rpn_stack.pop_back();
755 rpn_stack.push_back(
756 llvm::ConstantFP::get(builder.getFloatTy(), 0.0));
757 return true;
758 }
759
760 std::vector<llvm::Value*> values(n);
761 std::vector<llvm::Value*> indices(n);
762 for (int i = 0; i < n; ++i) {
763 values[i] = rpn_stack.back();
764 rpn_stack.pop_back();
765 indices[i] = llvm::ConstantFP::get(builder.getFloatTy(),
766 static_cast<double>(n - 1 - i));
767 }
768
769 auto network = get_sorting_network(n);
770 for (const auto& pair : network) {
771 int i1 = pair.first;
772 int i2 = pair.second;
773
774 llvm::Value* v1 = values[i1];
775 llvm::Value* v2 = values[i2];
776 llvm::Value* idx1 = indices[i1];
777 llvm::Value* idx2 = indices[i2];
778
779 llvm::Value* cmp_val = builder.CreateFCmpOGT(v1, v2);
780 llvm::Value* eq_val = builder.CreateFCmpOEQ(v1, v2);
781 llvm::Value* cmp_idx = builder.CreateFCmpOGT(idx1, idx2);
782 llvm::Value* tie_break = builder.CreateAnd(eq_val, cmp_idx);
783 llvm::Value* cond = builder.CreateOr(cmp_val, tie_break);
784
785 values[i1] = builder.CreateSelect(cond, v2, v1);
786 values[i2] = builder.CreateSelect(cond, v1, v2);
787 indices[i1] = builder.CreateSelect(cond, idx2, idx1);
788 indices[i2] = builder.CreateSelect(cond, idx1, idx2);
789 }
790
791 for (int i = n - 1; i >= 0; --i) {
792 rpn_stack.push_back(indices[i]);
793 }
794 return true;
795 }
796
797 // Control Flow (no-op during this pass)
799 case TokenType::Jump:
800 return true;
801
802 default:
803 // Not a common token - let derived class handle it
804 return false;
805 }
806}
807
/**
 * Translates the token stream into LLVM IR for a single output sample.
 *
 * Steps:
 *  1. An empty token list stores a constant 0.0 and returns.
 *  2. One entry-block alloca is created per variable reported by the
 *     analysis results.
 *  3. One LLVM basic block is created per CFG block (named after its label
 *     if it has one, otherwise "b<index>"), plus a common "exit" block.
 *  4. Blocks with more than one predecessor get PHI nodes for their incoming
 *     RPN stack; blocks with a single, already-processed predecessor inherit
 *     that predecessor's final stack.
 *  5. Tokens are dispatched to processCommonToken, then to the variable
 *     load/store handling here, then to processModeSpecificToken.
 *  6. The top-of-stack values of all exiting blocks are merged and handed to
 *     finalizeAndStoreResult.
 *
 * @param x                  32-bit x coordinate of the sample.
 * @param y                  32-bit y coordinate of the sample.
 * @param x_fp               Forwarded to processModeSpecificToken.
 * @param y_fp               Forwarded to processModeSpecificToken.
 * @param no_x_bounds_check  Forwarded to processModeSpecificToken.
 * @throws std::runtime_error on an unhandled token, an unknown variable, a
 *         stack underflow in a variable store or conditional jump, or a
 *         non-empty token list with an empty CFG.
 * @throws std::out_of_range if a successor or predecessor index does not
 *         name an existing block.
 */
void IRGeneratorBase::generateIRFromTokens(llvm::Value* x, llvm::Value* y,
                                           llvm::Value* x_fp, llvm::Value* y_fp,
                                           bool no_x_bounds_check) {
    llvm::Type* float_ty = builder.getFloatTy();
    llvm::Type* i32_ty = builder.getInt32Ty();
    llvm::Function* parent_func = builder.GetInsertBlock()->getParent();

    // Mode 1 requests approximate math; mode 2 (auto) also tries it first.
    const bool use_approx_math = (approx_math == 1 || approx_math == 2);

    if (tokens.empty()) {
        generatePixelStore(llvm::ConstantFP::get(float_ty, 0.0), x, y);
        return;
    }

    std::unordered_map<std::string, llvm::Value*> named_vars;
    const auto& all_vars = analysis_results.getVariableUsageResult().all_vars;

    for (const std::string& var_name : all_vars) {
        named_vars[var_name] = createAllocaInEntry(float_ty, var_name);
    }

    // Resolves a variable slot without inserting a null entry for names the
    // analysis did not report.
    auto lookup_var = [&named_vars](const std::string& var_name) {
        const auto it = named_vars.find(var_name);
        if (it == named_vars.end()) {
            throw std::runtime_error(
                std::format("Unknown variable: {}", var_name));
        }
        return it->second;
    };

    // Pops the top of the stack, reporting underflow instead of invoking
    // undefined behaviour on an empty vector.
    auto pop_operand = [](std::vector<llvm::Value*>& stack, const char* what) {
        if (stack.empty()) {
            throw std::runtime_error(
                std::format("Stack underflow while processing {}.", what));
        }
        llvm::Value* top = stack.back();
        stack.pop_back();
        return top;
    };

    std::map<int, llvm::BasicBlock*> llvm_blocks;
    const auto& cfg_blocks = analysis_results.getCFGBlocks();
    const auto& label_to_block_idx = analysis_results.getLabelToBlockIdx();
    const auto& stack_depth_in = analysis_results.getStackDepthIn();
    const int num_blocks = static_cast<int>(cfg_blocks.size());

    if (num_blocks == 0) {
        throw std::runtime_error("Expression has tokens but no CFG blocks.");
    }

    // Invert the label map once; the first label seen for a block wins,
    // matching a front-to-back scan of the label map.
    std::map<int, std::string> block_labels;
    for (const auto& [label_name, block_idx] : label_to_block_idx) {
        block_labels.try_emplace(block_idx, label_name);
    }

    for (int i = 0; i < num_blocks; ++i) {
        const auto label_it = block_labels.find(i);
        const std::string name = (label_it != block_labels.end())
                                     ? label_it->second
                                     : std::format("b{}", i);
        llvm_blocks[i] = llvm::BasicBlock::Create(context, name, parent_func);
    }
    llvm::BasicBlock* exit_bb =
        llvm::BasicBlock::Create(context, "exit", parent_func);

    // Branch from current block to the first CFG block
    builder.CreateBr(llvm_blocks.at(0));

    // Initial PHI generation for merge blocks
    std::map<int, std::vector<llvm::Value*>> block_initial_stacks;
    for (int i = 0; i < num_blocks; ++i) {
        const auto& preds = cfg_blocks[i].predecessors;
        if (preds.size() <= 1) {
            continue;
        }
        builder.SetInsertPoint(llvm_blocks.at(i));
        const int depth = stack_depth_in[i];
        std::vector<llvm::Value*> initial_stack;
        initial_stack.reserve(depth);
        for (int j = 0; j < depth; ++j) {
            initial_stack.push_back(builder.CreatePHI(
                float_ty, static_cast<unsigned>(preds.size())));
        }
        block_initial_stacks[i] = std::move(initial_stack);
    }

    // Process blocks
    std::map<int, std::vector<llvm::Value*>> block_final_stacks;

    for (int i = 0; i < num_blocks; ++i) {
        const auto& block_info = cfg_blocks[i];
        builder.SetInsertPoint(llvm_blocks.at(i));

        std::vector<llvm::Value*> rpn_stack;
        if (block_info.predecessors.size() == 1) {
            // Inherit the predecessor's stack if it has been processed
            // already; otherwise start empty.
            const int pred_idx = block_info.predecessors[0];
            const auto pred_it = block_final_stacks.find(pred_idx);
            if (pred_it != block_final_stacks.end()) {
                rpn_stack = pred_it->second;
            }
        } else if (block_info.predecessors.size() > 1) {
            rpn_stack = block_initial_stacks.at(i);
        }
        // No predecessors: entry block, empty stack.

        for (int j = block_info.start_token_idx; j < block_info.end_token_idx;
             ++j) {
            const auto& token = tokens[j];

            // Try common tokens first
            if (processCommonToken(token, rpn_stack, float_ty, i32_ty,
                                   use_approx_math)) {
                continue;
            }

            // Variables
            if (token.type == TokenType::VarStore) {
                const auto& payload = std::get<TokenPayloadVar>(token.payload);
                llvm::Value* val_to_store =
                    pop_operand(rpn_stack, "variable store");
                builder.CreateStore(val_to_store, lookup_var(payload.name));
                continue;
            }
            if (token.type == TokenType::VarLoad) {
                const auto& payload = std::get<TokenPayloadVar>(token.payload);
                rpn_stack.push_back(
                    builder.CreateLoad(float_ty, lookup_var(payload.name)));
                continue;
            }

            // Special tokens - delegate to derived class
            if (!processModeSpecificToken(token, rpn_stack, x, y, x_fp, y_fp,
                                          no_x_bounds_check)) {
                throw std::runtime_error(std::format(
                    "Unhandled token type: {}", static_cast<int>(token.type)));
            }
        }

        // Create Terminator
        const auto& succs = block_info.successors;
        if (succs.empty()) {
            builder.CreateBr(exit_bb);
        } else if (succs.size() == 1) {
            builder.CreateBr(llvm_blocks.at(succs[0]));
        } else { // size is 2, from a JUMP
            // The jump consumes its condition: taken when it is > 0.
            llvm::Value* cond_val =
                pop_operand(rpn_stack, "conditional jump");
            llvm::Value* cond = builder.CreateFCmpOGT(
                cond_val, llvm::ConstantFP::get(float_ty, 0.0));
            builder.CreateCondBr(cond, llvm_blocks.at(succs[0]),
                                 llvm_blocks.at(succs[1]));
        }

        block_final_stacks[i] = std::move(rpn_stack);
    }

    // Populate PHI nodes
    for (int i = 0; i < num_blocks; ++i) {
        if (cfg_blocks[i].predecessors.size() <= 1) {
            continue;
        }
        auto& phis = block_initial_stacks.at(i);
        for (int pred_idx : cfg_blocks[i].predecessors) {
            const auto& incoming_stack = block_final_stacks.at(pred_idx);
            llvm::BasicBlock* incoming_block = llvm_blocks.at(pred_idx);
            // NOTE(review): a predecessor stack shorter than the expected
            // depth leaves PHIs without an incoming value for that edge;
            // presumably the analysis pass rules this out - confirm.
            const size_t shared_depth =
                std::min(phis.size(), incoming_stack.size());
            for (size_t j = 0; j < shared_depth; ++j) {
                llvm::cast<llvm::PHINode>(phis[j])->addIncoming(
                    incoming_stack[j], incoming_block);
            }
        }
    }

    // Final Result PHI
    builder.SetInsertPoint(exit_bb);
    // One entry per exiting block; the value is null if its stack is empty.
    std::vector<std::pair<llvm::Value*, llvm::BasicBlock*>> exit_values;
    bool any_result = false;
    for (int i = 0; i < num_blocks; ++i) {
        if (!cfg_blocks[i].successors.empty()) {
            continue;
        }
        const auto& stack = block_final_stacks.at(i);
        llvm::Value* top = stack.empty() ? nullptr : stack.back();
        any_result = any_result || (top != nullptr);
        exit_values.emplace_back(top, llvm_blocks.at(i));
    }

    llvm::Value* undef_result = llvm::UndefValue::get(float_ty);
    llvm::Value* result_val = nullptr;
    if (!any_result) {
        result_val = undef_result;
    } else if (exit_values.size() == 1) {
        result_val = exit_values[0].first;
    } else {
        // Every predecessor of the exit block needs an incoming value, so
        // exiting blocks that produced nothing contribute undef.
        llvm::PHINode* phi = builder.CreatePHI(
            float_ty, static_cast<unsigned>(exit_values.size()), "result_phi");
        for (const auto& [value, block] : exit_values) {
            phi->addIncoming(value != nullptr ? value : undef_result, block);
        }
        result_val = phi;
    }

    // Let derived class handle exit logic (if any) and final store
    finalizeAndStoreResult(result_val, x, y);
}
constexpr unsigned ALIGNMENT
MathOp
Definition Math.hpp:69
@ Sin
Definition Math.hpp:72
@ Tan
Definition Math.hpp:74
@ Atan2
Definition Math.hpp:76
@ Asin
Definition Math.hpp:78
@ Atan
Definition Math.hpp:75
@ Exp
Definition Math.hpp:70
@ Log
Definition Math.hpp:71
@ Acos
Definition Math.hpp:77
@ Cos
Definition Math.hpp:73
constexpr std::vector< std::pair< int, int > > get_sorting_network(int n)
Definition Sorting.hpp:2149
llvm::Value * createIntrinsicCall(llvm::Intrinsic::ID intrinsic_id, Args... args)
const std::map< std::pair< int, std::string >, int > & prop_map
llvm::Value * getFinalCoord(llvm::Value *coord, llvm::Value *max_dim, bool use_mirror)
virtual void defineFunctionSignature()=0
llvm::IRBuilder & builder
std::string func_name
llvm::LLVMContext & context
llvm::Value * generatePixelLoad(int clip_idx, llvm::Value *x, llvm::Value *y, bool mirror)
virtual bool processModeSpecificToken(const Token &token, std::vector< llvm::Value * > &rpn_stack, llvm::Value *x, llvm::Value *y, llvm::Value *x_fp, llvm::Value *y_fp, bool no_x_bounds_check)=0
const std::vector< const VSVideoInfo * > & vi
void generateIRFromTokens(llvm::Value *x, llvm::Value *y, llvm::Value *x_fp, llvm::Value *y_fp, bool no_x_bounds_check)
llvm::Value * rwptrs_arg
const std::vector< Token > & tokens
IRGeneratorBase(const std::vector< Token > &tokens_in, const VSVideoInfo *out_vi, const std::vector< const VSVideoInfo * > &in_vi, int width_in, int height_in, bool mirror, const std::map< std::pair< int, std::string >, int > &p_map, const analysis::ExpressionAnalysisResults &analysis_results_in, llvm::LLVMContext &context_ref, llvm::Module &module_ref, llvm::IRBuilder<> &builder_ref, MathLibraryManager &math_mgr, std::string func_name_in, int approx_math_in)
llvm::Value * generateLoadFromRowPtr(llvm::Value *row_ptr, int clip_idx, llvm::Value *x, int rel_x, bool use_mirror, bool no_x_bounds_check)
void setMemoryInstAttrs(MemInstT *inst, unsigned alignment, int rwptr_index)
llvm::AllocaInst * createAllocaInEntry(llvm::Type *type, const std::string &name)
void assumeAligned(llvm::Value *ptr_value, unsigned alignment)
llvm::Function * func
virtual void generateLoops()=0
std::vector< llvm::Value * > preloaded_base_ptrs
const VSVideoInfo * vo
std::vector< llvm::Value * > preloaded_strides
llvm::Module & module
llvm::MDNode * alias_scope_domain
llvm::Value * props_arg
MathLibraryManager & math_manager
void addLoopMetadata(llvm::BranchInst *loop_br)
virtual void finalizeAndStoreResult(llvm::Value *result_val, llvm::Value *x, llvm::Value *y)=0
const analysis::ExpressionAnalysisResults & analysis_results
void generatePixelStore(llvm::Value *value_to_store, llvm::Value *x, llvm::Value *y)
bool processCommonToken(const Token &token, std::vector< llvm::Value * > &rpn_stack, llvm::Type *float_ty, llvm::Type *i32_ty, bool use_approx_math)
llvm::Value * strides_arg
TokenType type
PayloadVariant payload