VapourSynth-llvmexpr
Loading...
Searching...
No Matches
VkExprExecutor.cpp
Go to the documentation of this file.
1
19
20#include "VkExprExecutor.hpp"
21
23#include "VulkanContext.hpp"
24#include "VulkanMemory.hpp"
25
26#include <algorithm>
27#include <array>
28#include <bit>
29#include <cmath>
30#include <cstring>
31#include <memory>
32#include <mutex>
33#include <queue>
34#include <semaphore>
35#include <stdexcept>
36#include <utility>
37#include <vector>
38
39namespace vkexpr {
40
41namespace {
42
43// NOLINTBEGIN(cppcoreguidelines-avoid-magic-numbers)
/// Expands an IEEE 754 binary16 bit pattern into the binary32 value it encodes.
///
/// Handles signed zero, subnormals, normals, infinities and NaNs. The ten
/// half mantissa bits are placed in the top of the float mantissa, so NaN
/// payload bits are carried over.
///
/// @param half_bits Raw 16-bit half-precision pattern.
/// @return The float holding the same value.
float float_from_half_bits(std::uint16_t half_bits) {
    constexpr std::uint32_t kExponentRebias = 127U - 15U;
    constexpr std::uint32_t kHiddenBit = 0x400U;

    const std::uint32_t sign_field =
        static_cast<std::uint32_t>(half_bits & 0x8000U) << 16;
    const std::uint32_t exponent_field = (half_bits >> 10) & 0x1FU;
    std::uint32_t fraction = half_bits & 0x3FFU;

    // Infinity / NaN: all-ones exponent in both formats.
    if (exponent_field == 0x1FU) {
        return std::bit_cast<float>(sign_field | 0x7F800000U |
                                    (fraction << 13));
    }

    // Normal number: re-bias the exponent and widen the mantissa.
    if (exponent_field != 0U) {
        return std::bit_cast<float>(sign_field |
                                    ((exponent_field + kExponentRebias) << 23) |
                                    (fraction << 13));
    }

    // Signed zero.
    if (fraction == 0U) {
        return std::bit_cast<float>(sign_field);
    }

    // Subnormal half: shift the fraction up until the hidden bit appears,
    // lowering the float exponent by one for every shift. The fraction is
    // non-zero and below the hidden bit here, so at least one shift happens.
    std::uint32_t biased_exponent = kExponentRebias + 1U;
    do {
        fraction <<= 1;
        --biased_exponent;
    } while ((fraction & kHiddenBit) == 0U);
    fraction &= 0x3FFU;

    return std::bit_cast<float>(sign_field | (biased_exponent << 23) |
                                (fraction << 13));
}
70
/// Converts a binary32 float to the nearest IEEE 754 binary16 bit pattern.
///
/// Rounding is round-to-nearest, ties-to-even (the IEEE 754 default), so the
/// result is never more than half a unit in the last place away from the
/// input. Truncating the discarded bits instead would bias every result
/// towards zero.
///
/// Special cases:
///  - NaN maps to a half NaN; the top ten payload bits are kept, and a
///    payload that would become zero is forced to 1 so the result stays NaN.
///  - Infinities, and finite values that round beyond the largest half
///    (65504), map to a signed infinity.
///  - Values below half of the smallest half subnormal (2^-25) map to a
///    signed zero.
///
/// @param value Float to convert.
/// @return The 16-bit half-precision pattern.
std::uint16_t half_bits_from_float(float value) {
    const auto float_bits = std::bit_cast<std::uint32_t>(value);
    const std::uint32_t sign = (float_bits >> 16) & 0x8000U;
    const std::uint32_t magnitude_bits = float_bits & 0x7FFFFFFFU;

    if (magnitude_bits > 0x7F800000U) {
        // NaN: keep the upper payload bits, never let the payload collapse
        // to zero (that would turn the NaN into an infinity).
        const std::uint32_t payload = (magnitude_bits >> 13) & 0x3FFU;
        return static_cast<std::uint16_t>(sign | 0x7C00U |
                                          ((payload != 0U) ? payload : 1U));
    }
    if (magnitude_bits == 0x7F800000U) {
        return static_cast<std::uint16_t>(sign | 0x7C00U);
    }

    // Exponent re-biased from float (127) to half (15).
    const std::int32_t half_exp =
        static_cast<std::int32_t>(magnitude_bits >> 23) - 127 + 15;

    if (half_exp >= 31) {
        // Too large for a finite half.
        return static_cast<std::uint16_t>(sign | 0x7C00U);
    }
    if (half_exp < -10) {
        // Below 2^-25 (this also covers float zero and float subnormals):
        // rounds to zero.
        return static_cast<std::uint16_t>(sign);
    }

    // 24-bit significand with the hidden bit made explicit.
    const std::uint32_t significand = (magnitude_bits & 0x7FFFFFU) | 0x800000U;

    // Shifts right by `shift` bits (1..24), rounding to nearest, ties to even.
    const auto shift_round_even = [](std::uint32_t bits, std::uint32_t shift) {
        const std::uint32_t halfway = std::uint32_t{1} << (shift - 1U);
        const std::uint32_t remainder =
            bits & ((std::uint32_t{1} << shift) - 1U);
        std::uint32_t kept = bits >> shift;
        if (remainder > halfway ||
            (remainder == halfway && (kept & 1U) != 0U)) {
            ++kept;
        }
        return kept;
    };

    if (half_exp <= 0) {
        // Subnormal half: the hidden bit becomes an explicit mantissa bit.
        // A round-up to 0x400 is exactly the encoding of the smallest normal.
        const auto shift = static_cast<std::uint32_t>(14 - half_exp);
        return static_cast<std::uint16_t>(
            sign | shift_round_even(significand, shift));
    }

    // Normal half. The rounded significand still contains the hidden bit
    // (0x400), which supplies the last unit of the exponent field; a mantissa
    // carry therefore bumps the exponent, and reaches 0x7C00 (infinity) when
    // the value rounds past the largest finite half.
    const std::uint32_t rounded = shift_round_even(significand, 13U);
    const std::uint32_t encoded =
        (static_cast<std::uint32_t>(half_exp - 1) << 10) + rounded;
    return static_cast<std::uint16_t>(sign | encoded);
}
101// NOLINTEND(cppcoreguidelines-avoid-magic-numbers)
102
103// NOLINTBEGIN(cppcoreguidelines-pro-bounds-pointer-arithmetic,cppcoreguidelines-pro-type-reinterpret-cast)
104void pack_plane_to_float(std::span<float> dst, const VSFrame* src, int plane,
105 const VSAPI* vsapi) {
106 const int width = vsapi->getFrameWidth(src, plane);
107 const int height = vsapi->getFrameHeight(src, plane);
108 const VSVideoFormat* format = vsapi->getVideoFrameFormat(src);
109
110 if (width <= 0 || height <= 0) {
111 throw std::runtime_error("Invalid plane dimensions.");
112 }
113 if (static_cast<size_t>(width) * static_cast<size_t>(height) !=
114 dst.size()) {
115 throw std::runtime_error("Plane buffer size mismatch.");
116 }
117
118 const std::uint8_t* src_data = vsapi->getReadPtr(src, plane);
119 if (src_data == nullptr) {
120 throw std::runtime_error("Null plane pointer.");
121 }
122
123 const std::ptrdiff_t stride = vsapi->getStride(src, plane);
124 const int bpp = format->bytesPerSample;
125 const bool is_float = (format->sampleType == stFloat);
126
127 if (!is_float) {
128 for (int row = 0; row < height; ++row) {
129 const std::uint8_t* row_ptr =
130 src_data + (static_cast<std::ptrdiff_t>(row) * stride);
131 float* out_row = dst.data() + (static_cast<size_t>(row) *
132 static_cast<size_t>(width));
133 for (int col = 0; col < width; ++col) {
134 if (bpp == 1) {
135 out_row[col] = static_cast<float>(row_ptr[col]);
136 } else if (bpp == 2) {
137 out_row[col] = static_cast<float>(
138 reinterpret_cast<const std::uint16_t*>(row_ptr)[col]);
139 } else if (bpp == 4) {
140 out_row[col] = static_cast<float>(
141 reinterpret_cast<const std::uint32_t*>(row_ptr)[col]);
142 } else {
143 throw std::runtime_error(
144 "Unsupported integer sample size.");
145 }
146 }
147 }
148 return;
149 }
150
151 if (bpp == 4) {
152 for (int row = 0; row < height; ++row) {
153 const std::uint8_t* row_ptr =
154 src_data + (static_cast<std::ptrdiff_t>(row) * stride);
155 float* out_row = dst.data() + (static_cast<size_t>(row) *
156 static_cast<size_t>(width));
157 std::memcpy(out_row, row_ptr,
158 static_cast<size_t>(width) * sizeof(float));
159 }
160 return;
161 }
162 if (bpp == 2) {
163 for (int row = 0; row < height; ++row) {
164 const std::uint8_t* row_ptr =
165 src_data + (static_cast<std::ptrdiff_t>(row) * stride);
166 float* out_row = dst.data() + (static_cast<size_t>(row) *
167 static_cast<size_t>(width));
168 for (int col = 0; col < width; ++col) {
169 std::uint16_t half_bits =
170 reinterpret_cast<const std::uint16_t*>(row_ptr)[col];
171 out_row[col] = float_from_half_bits(half_bits);
172 }
173 }
174 return;
175 }
176
177 throw std::runtime_error("Unsupported float sample size.");
178}
179
180void unpack_float_to_plane(const std::span<const float> src, VSFrame* dst,
181 int plane, const VSAPI* vsapi) {
182 const int width = vsapi->getFrameWidth(dst, plane);
183 const int height = vsapi->getFrameHeight(dst, plane);
184 const VSVideoFormat* format = vsapi->getVideoFrameFormat(dst);
185
186 [[unlikely]] if (width <= 0 || height <= 0) {
187 throw std::runtime_error("Invalid plane dimensions.");
188 }
189 [[unlikely]] if (static_cast<size_t>(width) * static_cast<size_t>(height) !=
190 src.size()) {
191 throw std::runtime_error("Plane buffer size mismatch.");
192 }
193
194 std::uint8_t* dst_data = vsapi->getWritePtr(dst, plane);
195 if (dst_data == nullptr) {
196 throw std::runtime_error("Null plane pointer.");
197 }
198
199 const std::ptrdiff_t stride = vsapi->getStride(dst, plane);
200 const int bpp = format->bytesPerSample;
201 const bool is_float = (format->sampleType == stFloat);
202
203 if (!is_float) {
204 const int bits = format->bitsPerSample;
205 if (bits <= 0 || bits > 31) {
206 throw std::runtime_error("Invalid integer bitsPerSample.");
207 }
208 const int max_val = (1 << bits) - 1;
209
210 for (int row = 0; row < height; ++row) {
211 std::uint8_t* out_row_ptr =
212 dst_data + (static_cast<std::ptrdiff_t>(row) * stride);
213 const float* in_row = src.data() + (static_cast<size_t>(row) *
214 static_cast<size_t>(width));
215
216 for (int col = 0; col < width; ++col) {
217 float value = in_row[col];
218 float clamped =
219 std::clamp(value, 0.0F, static_cast<float>(max_val));
220 int int_val = static_cast<int>(std::nearbyint(clamped));
221
222 if (bpp == 1) {
223 out_row_ptr[col] = static_cast<std::uint8_t>(int_val);
224 } else if (bpp == 2) {
225 reinterpret_cast<std::uint16_t*>(out_row_ptr)[col] =
226 static_cast<std::uint16_t>(int_val);
227 } else if (bpp == 4) {
228 reinterpret_cast<std::uint32_t*>(out_row_ptr)[col] =
229 static_cast<std::uint32_t>(int_val);
230 } else [[unlikely]] {
231 throw std::runtime_error(
232 "Unsupported integer sample size.");
233 }
234 }
235 }
236 return;
237 }
238
239 if (bpp == 4) {
240 for (int row = 0; row < height; ++row) {
241 std::uint8_t* out_row_ptr =
242 dst_data + (static_cast<std::ptrdiff_t>(row) * stride);
243 const float* in_row = src.data() + (static_cast<size_t>(row) *
244 static_cast<size_t>(width));
245 std::memcpy(out_row_ptr, in_row,
246 static_cast<size_t>(width) * sizeof(float));
247 }
248 return;
249 }
250
251 if (bpp == 2) {
252 for (int row = 0; row < height; ++row) {
253 std::uint8_t* out_row_ptr =
254 dst_data + (static_cast<std::ptrdiff_t>(row) * stride);
255 const float* in_row = src.data() + (static_cast<size_t>(row) *
256 static_cast<size_t>(width));
257 for (int col = 0; col < width; ++col) {
258 float value = in_row[col];
259 reinterpret_cast<std::uint16_t*>(out_row_ptr)[col] =
260 half_bits_from_float(value);
261 }
262 }
263 return;
264 }
265
266 throw std::runtime_error("Unsupported float sample size.");
267}
268// NOLINTEND(cppcoreguidelines-pro-bounds-pointer-arithmetic,cppcoreguidelines-pro-type-reinterpret-cast)
269
270} // namespace
271
285
286 struct Stream {
287 std::unique_ptr<VulkanMemory> memory;
288 std::array<std::vector<std::unique_ptr<VulkanComputePipeline>>, 3>
290 std::array<PlaneResources, 3> plane_resources;
291 vk::raii::CommandPool command_pool = nullptr;
292 vk::raii::CommandBuffer command_buffer = nullptr;
293 vk::raii::Fence fence = nullptr;
294
295 Stream() = default;
297 for (auto& res : plane_resources) {
299 }
300 }
301
302 Stream(const Stream&) = delete;
303 Stream& operator=(const Stream&) = delete;
304 Stream(Stream&&) = delete;
305 Stream& operator=(Stream&&) = delete;
306
308 if (res.initialized) {
309 for (auto& buf : res.input_buffers) {
310 memory->destroyBuffer(buf);
311 }
312 for (auto& buf : res.input_staging_buffers) {
313 memory->destroyBuffer(buf);
314 }
315 res.input_buffers.clear();
316 res.input_staging_buffers.clear();
317
318 memory->destroyBuffer(res.output_buffer);
319 memory->destroyBuffer(res.output_staging_buffer);
320
321 for (auto& buf : res.intermediate_buffers) {
322 memory->destroyBuffer(buf);
323 }
324 res.intermediate_buffers.clear();
325
326 res.initialized = false;
327 res.buffer_size = 0;
328 }
329
330 if (res.props_buffer.isValid()) {
331 memory->destroyBuffer(res.props_buffer);
332 memory->destroyBuffer(res.props_staging_buffer);
333 res.props_size = 0;
334 }
335 }
336 };
337
339 int num_inputs = 0;
340 std::uint32_t num_props_floats = 0;
341 std::array<std::vector<std::string>, 3> glsl_stages;
342
343 int num_streams = 0;
344 std::vector<std::unique_ptr<Stream>> streams;
345 std::counting_semaphore<> semaphore{0};
346 std::queue<int> free_stream_indices;
347 std::mutex stream_mutex;
348
349 Impl(int device_id, int num_streams, int num_inputs,
350 std::array<std::vector<std::string>, 3> glsl_stages,
351 std::uint32_t num_props_floats)
352 : context(&VulkanContext::getInstance(device_id)),
356 [[unlikely]] if (num_inputs <= 0) {
357 throw std::runtime_error("VkExprExecutor: num_inputs must be > 0.");
358 }
359 [[unlikely]] if (num_streams <= 0) {
360 throw std::runtime_error(
361 "VkExprExecutor: num_streams must be > 0.");
362 }
363
364 auto& ctx = *context;
365 streams.resize(static_cast<size_t>(num_streams));
366 for (int k = 0; k < num_streams; ++k) {
367 streams[static_cast<size_t>(k)] = std::make_unique<Stream>();
368 auto& stream = *streams[static_cast<size_t>(k)];
369 stream.memory = std::make_unique<VulkanMemory>(ctx);
370 free_stream_indices.push(k);
371
372 vk::CommandPoolCreateInfo pool_info(
373 vk::CommandPoolCreateFlagBits::eResetCommandBuffer,
374 ctx.getQueueFamilyIndex());
375 stream.command_pool =
376 vk::raii::CommandPool(ctx.getDevice(), pool_info);
377
378 vk::CommandBufferAllocateInfo cmd_info(
379 *stream.command_pool, vk::CommandBufferLevel::ePrimary, 1);
380 auto cmd_buffers =
381 vk::raii::CommandBuffers(ctx.getDevice(), cmd_info);
382 stream.command_buffer = std::move(cmd_buffers[0]);
383
384 vk::FenceCreateInfo fence_info;
385 stream.fence = vk::raii::Fence(ctx.getDevice(), fence_info);
386
387 for (int plane = 0; plane < 3; ++plane) {
388 if (this->glsl_stages.at(plane).empty()) {
389 continue;
390 }
391 auto& plane_pipelines = stream.pipelines.at(plane);
392 auto& stages = this->glsl_stages.at(plane);
393 plane_pipelines.resize(stages.size());
394
395 for (size_t s = 0; s < stages.size(); ++s) {
396 plane_pipelines[s] =
397 std::make_unique<VulkanComputePipeline>(
398 ctx, stages[s],
399 static_cast<std::uint32_t>(num_inputs +
400 static_cast<int>(s)),
402 }
403 }
404 }
405 }
406
408 semaphore.acquire();
409 std::lock_guard<std::mutex> lock(stream_mutex);
410 int idx = free_stream_indices.front();
412 return idx;
413 }
414
415 void releaseStreamIndex(int idx) {
416 {
417 std::lock_guard<std::mutex> lock(stream_mutex);
418 free_stream_indices.push(idx);
419 }
420 semaphore.release();
421 }
422
423 void drain() {
424 for (int i = 0; i < num_streams; ++i) {
425 semaphore.acquire();
426 }
427 }
428};
429
431 int device_id, int num_streams, int num_inputs,
432 std::array<std::vector<std::string>, 3> glsl_stages,
433 std::uint32_t num_props_floats)
434 : impl(std::make_unique<Impl>(device_id, num_streams, num_inputs,
435 std::move(glsl_stages), num_props_floats)) {}
436
438 if (impl != nullptr) {
439 impl->drain();
440 }
441}
442
443void VkExprExecutor::processPlane(int plane, int frame_number,
444 std::span<const VSFrame* const> inputs,
445 VSFrame* output, std::span<const float> props,
446 const VSAPI* vsapi) {
447 [[unlikely]] if (impl == nullptr) {
448 throw std::runtime_error("VkExprExecutor is not initialized.");
449 }
450 [[unlikely]] if (plane < 0 || plane > 2) {
451 throw std::runtime_error("Invalid plane index.");
452 }
453 [[unlikely]] if (inputs.size() != static_cast<size_t>(impl->num_inputs)) {
454 throw std::runtime_error(
455 "VkExprExecutor: unexpected number of inputs.");
456 }
457 [[unlikely]] if (impl->glsl_stages.at(plane).empty()) {
458 throw std::runtime_error(
459 "VkExprExecutor: no shader for requested plane.");
460 }
461 const int width = vsapi->getFrameWidth(output, plane);
462 const int height = vsapi->getFrameHeight(output, plane);
463 [[unlikely]] if (width <= 0 || height <= 0) {
464 throw std::runtime_error("Invalid output dimensions.");
465 }
466
467 for (const auto* in : inputs) {
468 [[unlikely]] if (vsapi->getFrameWidth(in, plane) != width ||
469 vsapi->getFrameHeight(in, plane) != height) {
470 throw std::runtime_error("Input/output plane dimension mismatch.");
471 }
472 }
473
474 VkDeviceSize buffer_size = static_cast<VkDeviceSize>(width) *
475 static_cast<VkDeviceSize>(height) *
476 sizeof(float);
477
478 const int stream_idx = impl->acquireStreamIndex();
479 struct StreamReleaser {
480 Impl& impl;
481 int idx;
482 StreamReleaser(Impl& i, int x) : impl(i), idx(x) {}
483 ~StreamReleaser() { impl.releaseStreamIndex(idx); }
484 StreamReleaser(const StreamReleaser&) = delete;
485 StreamReleaser& operator=(const StreamReleaser&) = delete;
486 StreamReleaser(StreamReleaser&&) = delete;
487 StreamReleaser& operator=(StreamReleaser&&) = delete;
488 } releaser(*impl, stream_idx);
489
490 auto& stream = *impl->streams.at(static_cast<size_t>(stream_idx));
491 auto& plane_res = stream.plane_resources.at(plane);
492 auto& stage_sources = impl->glsl_stages.at(plane);
493
494 // (Re)allocate per-plane resources if needed
495 if (!plane_res.initialized || plane_res.buffer_size < buffer_size ||
496 plane_res.input_buffers.size() != inputs.size() ||
497 stream.pipelines.at(plane).size() != stage_sources.size()) {
498
499 stream.freePlaneResources(plane_res);
500
501 plane_res.input_buffers.resize(inputs.size());
502 plane_res.input_staging_buffers.resize(inputs.size());
503
504 for (size_t i = 0; i < inputs.size(); ++i) {
505 plane_res.input_buffers[i] =
506 stream.memory->createGPUBuffer(buffer_size);
507 plane_res.input_staging_buffers[i] =
508 stream.memory->createStagingBuffer(buffer_size, true);
509 }
510
511 plane_res.output_buffer = stream.memory->createGPUBuffer(buffer_size);
512 plane_res.output_staging_buffer =
513 stream.memory->createStagingBuffer(buffer_size, false);
514
515 const size_t num_intermediates =
516 (stage_sources.size() > 0) ? (stage_sources.size() - 1) : 0;
517 plane_res.intermediate_buffers.resize(num_intermediates);
518 for (size_t i = 0; i < num_intermediates; ++i) {
519 plane_res.intermediate_buffers[i] =
520 stream.memory->createGPUBuffer(buffer_size);
521 }
522
523 plane_res.buffer_size = buffer_size;
524 plane_res.initialized = true;
525 }
526
527 // Props buffers (optional)
528 VkDeviceSize props_size = props.size() * sizeof(float);
529 if (props_size > 0) {
530 if (!plane_res.props_buffer.isValid() ||
531 plane_res.props_size < props_size) {
532 if (plane_res.props_buffer.isValid()) {
533 stream.memory->destroyBuffer(plane_res.props_buffer);
534 stream.memory->destroyBuffer(plane_res.props_staging_buffer);
535 }
536
537 plane_res.props_buffer = stream.memory->createGPUBuffer(
538 props_size, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
539 VK_BUFFER_USAGE_TRANSFER_DST_BIT);
540 plane_res.props_staging_buffer =
541 stream.memory->createStagingBuffer(props_size, true);
542 plane_res.props_size = props_size;
543 }
544 }
545
546 // Pack inputs to float32 staging buffers
547 for (size_t i = 0; i < inputs.size(); ++i) {
548 auto* mapped_data = static_cast<float*>(
549 plane_res.input_staging_buffers[i].getMappedData());
550 std::span<float> mapped_span(mapped_data,
551 static_cast<size_t>(width) *
552 static_cast<size_t>(height));
553
554 pack_plane_to_float(mapped_span, inputs[i], plane, vsapi);
555 stream.memory->flushBuffer(plane_res.input_staging_buffers[i],
556 buffer_size);
557 }
558
559 // Upload props to staging
560 if (plane_res.props_buffer.isValid() && props_size > 0) {
561 std::memcpy(plane_res.props_staging_buffer.getMappedData(),
562 props.data(), props_size);
563 stream.memory->flushBuffer(plane_res.props_staging_buffer, props_size);
564 }
565
566 stream.command_buffer.reset();
567 vk::CommandBufferBeginInfo begin_info(
568 vk::CommandBufferUsageFlagBits::eOneTimeSubmit);
569 stream.command_buffer.begin(begin_info);
570
571 // Upload input buffers
572 for (size_t i = 0; i < inputs.size(); ++i) {
573 vk::BufferCopy region(0, 0, buffer_size);
574 stream.command_buffer.copyBuffer(
575 vk::Buffer(plane_res.input_staging_buffers[i].buffer),
576 vk::Buffer(plane_res.input_buffers[i].buffer), region);
577 }
578
579 // Upload props
580 if (plane_res.props_buffer.isValid() && props_size > 0) {
581 vk::BufferCopy region(0, 0, props_size);
582 stream.command_buffer.copyBuffer(
583 vk::Buffer(plane_res.props_staging_buffer.buffer),
584 vk::Buffer(plane_res.props_buffer.buffer), region);
585 }
586
587 // Transfer -> compute barriers for inputs/props
588 std::vector<vk::BufferMemoryBarrier> to_compute_barriers;
589 to_compute_barriers.reserve(inputs.size() + 1);
590 for (size_t i = 0; i < inputs.size(); ++i) {
591 vk::BufferMemoryBarrier b;
592 b.srcAccessMask = vk::AccessFlagBits::eTransferWrite;
593 b.dstAccessMask = vk::AccessFlagBits::eShaderRead;
594 b.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
595 b.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
596 b.buffer = vk::Buffer(plane_res.input_buffers[i].buffer);
597 b.offset = 0;
598 b.size = VK_WHOLE_SIZE;
599 to_compute_barriers.push_back(b);
600 }
601 if (plane_res.props_buffer.isValid() && props_size > 0) {
602 vk::BufferMemoryBarrier b;
603 b.srcAccessMask = vk::AccessFlagBits::eTransferWrite;
604 b.dstAccessMask = vk::AccessFlagBits::eShaderRead;
605 b.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
606 b.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
607 b.buffer = vk::Buffer(plane_res.props_buffer.buffer);
608 b.offset = 0;
609 b.size = VK_WHOLE_SIZE;
610 to_compute_barriers.push_back(b);
611 }
612 if (!to_compute_barriers.empty()) {
613 stream.command_buffer.pipelineBarrier(
614 vk::PipelineStageFlagBits::eTransfer,
615 vk::PipelineStageFlagBits::eComputeShader, {}, {},
616 to_compute_barriers, {});
617 }
618
619 const size_t num_stages = stage_sources.size();
620 for (size_t s = 0; s < num_stages; ++s) {
621 std::vector<VulkanBuffer*> dispatch_inputs;
622 dispatch_inputs.reserve(inputs.size() + s);
623 for (size_t i = 0; i < inputs.size(); ++i) {
624 dispatch_inputs.push_back(&plane_res.input_buffers[i]);
625 }
626 for (size_t k = 0; k < s; ++k) {
627 dispatch_inputs.push_back(&plane_res.intermediate_buffers[k]);
628 }
629
630 VulkanBuffer* dispatch_output =
631 (s == num_stages - 1) ? &plane_res.output_buffer
632 : &plane_res.intermediate_buffers[s];
633
634 stream.pipelines.at(plane).at(s)->recordDispatch(
635 stream.command_buffer, dispatch_inputs, *dispatch_output,
636 (plane_res.props_buffer.isValid() && props_size > 0)
637 ? &plane_res.props_buffer
638 : nullptr,
639 static_cast<std::uint32_t>(width),
640 static_cast<std::uint32_t>(height), frame_number);
641
642 if (s < num_stages - 1) {
643 vk::BufferMemoryBarrier b;
644 b.srcAccessMask = vk::AccessFlagBits::eShaderWrite;
645 b.dstAccessMask = vk::AccessFlagBits::eShaderRead;
646 b.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
647 b.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
648 b.buffer = vk::Buffer(dispatch_output->buffer);
649 b.offset = 0;
650 b.size = VK_WHOLE_SIZE;
651 std::array<vk::BufferMemoryBarrier, 1> bar = {b};
652
653 stream.command_buffer.pipelineBarrier(
654 vk::PipelineStageFlagBits::eComputeShader,
655 vk::PipelineStageFlagBits::eComputeShader, {}, {}, bar, {});
656 }
657 }
658
659 // Compute -> transfer barrier for output, then download
660 vk::BufferMemoryBarrier to_transfer;
661 to_transfer.srcAccessMask = vk::AccessFlagBits::eShaderWrite;
662 to_transfer.dstAccessMask = vk::AccessFlagBits::eTransferRead;
663 to_transfer.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
664 to_transfer.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
665 to_transfer.buffer = vk::Buffer(plane_res.output_buffer.buffer);
666 to_transfer.offset = 0;
667 to_transfer.size = VK_WHOLE_SIZE;
668 std::array<vk::BufferMemoryBarrier, 1> to_transfer_barriers = {to_transfer};
669 stream.command_buffer.pipelineBarrier(
670 vk::PipelineStageFlagBits::eComputeShader,
671 vk::PipelineStageFlagBits::eTransfer, {}, {}, to_transfer_barriers, {});
672
673 vk::BufferCopy download_region(0, 0, buffer_size);
674 stream.command_buffer.copyBuffer(
675 vk::Buffer(plane_res.output_buffer.buffer),
676 vk::Buffer(plane_res.output_staging_buffer.buffer), download_region);
677
678 stream.command_buffer.end();
679
680 vk::SubmitInfo submit_info;
681 submit_info.setCommandBuffers(*stream.command_buffer);
682 impl->context->submit(submit_info, *stream.fence);
683
684 auto result = impl->context->getDevice().waitForFences(*stream.fence,
685 VK_TRUE, UINT64_MAX);
686 if (result != vk::Result::eSuccess) {
687 throw std::runtime_error("Failed to wait for VkExpr fence");
688 }
689 impl->context->getDevice().resetFences(*stream.fence);
690
691 stream.memory->invalidateBuffer(plane_res.output_staging_buffer,
692 buffer_size);
693 const auto* mapped_out = static_cast<const float*>(
694 plane_res.output_staging_buffer.getMappedData());
695
696 std::span<const float> mapped_span(
697 mapped_out, static_cast<size_t>(width) * static_cast<size_t>(height));
698 unpack_float_to_plane(mapped_span, output, plane, vsapi);
699}
700
701} // namespace vkexpr
VkExprExecutor & operator=(const VkExprExecutor &)=delete
VkExprExecutor(int device_id, int num_streams, int num_inputs, std::array< std::vector< std::string >, 3 > glsl_stages, std::uint32_t num_props_floats)
void processPlane(int plane, int frame_number, std::span< const VSFrame *const > inputs, VSFrame *output, std::span< const float > props, const VSAPI *vsapi)
std::vector< VulkanBuffer > input_staging_buffers
std::vector< VulkanBuffer > intermediate_buffers
std::unique_ptr< VulkanMemory > memory
void freePlaneResources(PlaneResources &res) const
std::array< std::vector< std::unique_ptr< VulkanComputePipeline > >, 3 > pipelines
Stream & operator=(Stream &&)=delete
std::array< PlaneResources, 3 > plane_resources
Stream & operator=(const Stream &)=delete
std::array< std::vector< std::string >, 3 > glsl_stages
Impl(int device_id, int num_streams, int num_inputs, std::array< std::vector< std::string >, 3 > glsl_stages, std::uint32_t num_props_floats)
std::queue< int > free_stream_indices
std::vector< std::unique_ptr< Stream > > streams
std::counting_semaphore semaphore