444 std::span<const VSFrame* const> inputs,
445 VSFrame* output, std::span<const float> props,
446 const VSAPI* vsapi) {
447 [[unlikely]]
if (impl ==
nullptr) {
448 throw std::runtime_error(
"VkExprExecutor is not initialized.");
450 [[unlikely]]
if (plane < 0 || plane > 2) {
451 throw std::runtime_error(
"Invalid plane index.");
453 [[unlikely]]
if (inputs.size() !=
static_cast<size_t>(impl->num_inputs)) {
454 throw std::runtime_error(
455 "VkExprExecutor: unexpected number of inputs.");
457 [[unlikely]]
if (impl->glsl_stages.at(plane).empty()) {
458 throw std::runtime_error(
459 "VkExprExecutor: no shader for requested plane.");
461 const int width = vsapi->getFrameWidth(output, plane);
462 const int height = vsapi->getFrameHeight(output, plane);
463 [[unlikely]]
if (width <= 0 || height <= 0) {
464 throw std::runtime_error(
"Invalid output dimensions.");
467 for (
const auto* in : inputs) {
468 [[unlikely]]
if (vsapi->getFrameWidth(in, plane) != width ||
469 vsapi->getFrameHeight(in, plane) != height) {
470 throw std::runtime_error(
"Input/output plane dimension mismatch.");
474 VkDeviceSize buffer_size =
static_cast<VkDeviceSize
>(width) *
475 static_cast<VkDeviceSize
>(height) *
478 const int stream_idx = impl->acquireStreamIndex();
// Scope guard for the stream index obtained from impl->acquireStreamIndex():
// the `releaser` instance declared at the end of this struct calls
// impl.releaseStreamIndex(idx) from its destructor, so the index is handed
// back whenever the enclosing scope is left, including via the exceptions
// thrown further down.
479 struct StreamReleaser {
// Remembers the owning Impl and the index that must be released later.
482 StreamReleaser(
Impl& i,
int x) : impl(i), idx(x) {}
// Gives the remembered index back to the owner on destruction.
483 ~StreamReleaser() { impl.releaseStreamIndex(idx); }
// Copy and move operations are all deleted, so no second guard object can
// end up releasing the same index.
484 StreamReleaser(
const StreamReleaser&) =
delete;
485 StreamReleaser&
operator=(
const StreamReleaser&) =
delete;
486 StreamReleaser(StreamReleaser&&) =
delete;
487 StreamReleaser&
operator=(StreamReleaser&&) =
delete;
488 } releaser(*impl, stream_idx);
490 auto& stream = *impl->streams.at(
static_cast<size_t>(stream_idx));
491 auto& plane_res = stream.plane_resources.at(plane);
492 auto& stage_sources = impl->glsl_stages.at(plane);
495 if (!plane_res.initialized || plane_res.buffer_size < buffer_size ||
496 plane_res.input_buffers.size() != inputs.size() ||
497 stream.pipelines.at(plane).size() != stage_sources.size()) {
499 stream.freePlaneResources(plane_res);
501 plane_res.input_buffers.resize(inputs.size());
502 plane_res.input_staging_buffers.resize(inputs.size());
504 for (
size_t i = 0; i < inputs.size(); ++i) {
505 plane_res.input_buffers[i] =
506 stream.memory->createGPUBuffer(buffer_size);
507 plane_res.input_staging_buffers[i] =
508 stream.memory->createStagingBuffer(buffer_size,
true);
511 plane_res.output_buffer = stream.memory->createGPUBuffer(buffer_size);
512 plane_res.output_staging_buffer =
513 stream.memory->createStagingBuffer(buffer_size,
false);
515 const size_t num_intermediates =
516 (stage_sources.size() > 0) ? (stage_sources.size() - 1) : 0;
517 plane_res.intermediate_buffers.resize(num_intermediates);
518 for (
size_t i = 0; i < num_intermediates; ++i) {
519 plane_res.intermediate_buffers[i] =
520 stream.memory->createGPUBuffer(buffer_size);
523 plane_res.buffer_size = buffer_size;
524 plane_res.initialized =
true;
528 VkDeviceSize props_size = props.size() *
sizeof(float);
529 if (props_size > 0) {
530 if (!plane_res.props_buffer.isValid() ||
531 plane_res.props_size < props_size) {
532 if (plane_res.props_buffer.isValid()) {
533 stream.memory->destroyBuffer(plane_res.props_buffer);
534 stream.memory->destroyBuffer(plane_res.props_staging_buffer);
537 plane_res.props_buffer = stream.memory->createGPUBuffer(
538 props_size, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
539 VK_BUFFER_USAGE_TRANSFER_DST_BIT);
540 plane_res.props_staging_buffer =
541 stream.memory->createStagingBuffer(props_size,
true);
542 plane_res.props_size = props_size;
547 for (
size_t i = 0; i < inputs.size(); ++i) {
548 auto* mapped_data =
static_cast<float*
>(
549 plane_res.input_staging_buffers[i].getMappedData());
550 std::span<float> mapped_span(mapped_data,
551 static_cast<size_t>(width) *
552 static_cast<size_t>(height));
554 pack_plane_to_float(mapped_span, inputs[i], plane, vsapi);
555 stream.memory->flushBuffer(plane_res.input_staging_buffers[i],
560 if (plane_res.props_buffer.isValid() && props_size > 0) {
561 std::memcpy(plane_res.props_staging_buffer.getMappedData(),
562 props.data(), props_size);
563 stream.memory->flushBuffer(plane_res.props_staging_buffer, props_size);
566 stream.command_buffer.reset();
567 vk::CommandBufferBeginInfo begin_info(
568 vk::CommandBufferUsageFlagBits::eOneTimeSubmit);
569 stream.command_buffer.begin(begin_info);
572 for (
size_t i = 0; i < inputs.size(); ++i) {
573 vk::BufferCopy region(0, 0, buffer_size);
574 stream.command_buffer.copyBuffer(
575 vk::Buffer(plane_res.input_staging_buffers[i].buffer),
576 vk::Buffer(plane_res.input_buffers[i].buffer), region);
580 if (plane_res.props_buffer.isValid() && props_size > 0) {
581 vk::BufferCopy region(0, 0, props_size);
582 stream.command_buffer.copyBuffer(
583 vk::Buffer(plane_res.props_staging_buffer.buffer),
584 vk::Buffer(plane_res.props_buffer.buffer), region);
588 std::vector<vk::BufferMemoryBarrier> to_compute_barriers;
589 to_compute_barriers.reserve(inputs.size() + 1);
590 for (
size_t i = 0; i < inputs.size(); ++i) {
591 vk::BufferMemoryBarrier b;
592 b.srcAccessMask = vk::AccessFlagBits::eTransferWrite;
593 b.dstAccessMask = vk::AccessFlagBits::eShaderRead;
594 b.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
595 b.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
596 b.buffer = vk::Buffer(plane_res.input_buffers[i].buffer);
598 b.size = VK_WHOLE_SIZE;
599 to_compute_barriers.push_back(b);
601 if (plane_res.props_buffer.isValid() && props_size > 0) {
602 vk::BufferMemoryBarrier b;
603 b.srcAccessMask = vk::AccessFlagBits::eTransferWrite;
604 b.dstAccessMask = vk::AccessFlagBits::eShaderRead;
605 b.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
606 b.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
607 b.buffer = vk::Buffer(plane_res.props_buffer.buffer);
609 b.size = VK_WHOLE_SIZE;
610 to_compute_barriers.push_back(b);
612 if (!to_compute_barriers.empty()) {
613 stream.command_buffer.pipelineBarrier(
614 vk::PipelineStageFlagBits::eTransfer,
615 vk::PipelineStageFlagBits::eComputeShader, {}, {},
616 to_compute_barriers, {});
619 const size_t num_stages = stage_sources.size();
620 for (
size_t s = 0; s < num_stages; ++s) {
621 std::vector<VulkanBuffer*> dispatch_inputs;
622 dispatch_inputs.reserve(inputs.size() + s);
623 for (
size_t i = 0; i < inputs.size(); ++i) {
624 dispatch_inputs.push_back(&plane_res.input_buffers[i]);
626 for (
size_t k = 0; k < s; ++k) {
627 dispatch_inputs.push_back(&plane_res.intermediate_buffers[k]);
631 (s == num_stages - 1) ? &plane_res.output_buffer
632 : &plane_res.intermediate_buffers[s];
634 stream.pipelines.at(plane).at(s)->recordDispatch(
635 stream.command_buffer, dispatch_inputs, *dispatch_output,
636 (plane_res.props_buffer.isValid() && props_size > 0)
637 ? &plane_res.props_buffer
639 static_cast<std::uint32_t
>(width),
640 static_cast<std::uint32_t
>(height), frame_number);
642 if (s < num_stages - 1) {
643 vk::BufferMemoryBarrier b;
644 b.srcAccessMask = vk::AccessFlagBits::eShaderWrite;
645 b.dstAccessMask = vk::AccessFlagBits::eShaderRead;
646 b.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
647 b.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
648 b.buffer = vk::Buffer(dispatch_output->
buffer);
650 b.size = VK_WHOLE_SIZE;
651 std::array<vk::BufferMemoryBarrier, 1> bar = {b};
653 stream.command_buffer.pipelineBarrier(
654 vk::PipelineStageFlagBits::eComputeShader,
655 vk::PipelineStageFlagBits::eComputeShader, {}, {}, bar, {});
660 vk::BufferMemoryBarrier to_transfer;
661 to_transfer.srcAccessMask = vk::AccessFlagBits::eShaderWrite;
662 to_transfer.dstAccessMask = vk::AccessFlagBits::eTransferRead;
663 to_transfer.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
664 to_transfer.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
665 to_transfer.buffer = vk::Buffer(plane_res.output_buffer.buffer);
666 to_transfer.offset = 0;
667 to_transfer.size = VK_WHOLE_SIZE;
668 std::array<vk::BufferMemoryBarrier, 1> to_transfer_barriers = {to_transfer};
669 stream.command_buffer.pipelineBarrier(
670 vk::PipelineStageFlagBits::eComputeShader,
671 vk::PipelineStageFlagBits::eTransfer, {}, {}, to_transfer_barriers, {});
673 vk::BufferCopy download_region(0, 0, buffer_size);
674 stream.command_buffer.copyBuffer(
675 vk::Buffer(plane_res.output_buffer.buffer),
676 vk::Buffer(plane_res.output_staging_buffer.buffer), download_region);
678 stream.command_buffer.end();
680 vk::SubmitInfo submit_info;
681 submit_info.setCommandBuffers(*stream.command_buffer);
682 impl->context->submit(submit_info, *stream.fence);
684 auto result = impl->context->getDevice().waitForFences(*stream.fence,
685 VK_TRUE, UINT64_MAX);
686 if (result != vk::Result::eSuccess) {
687 throw std::runtime_error(
"Failed to wait for VkExpr fence");
689 impl->context->getDevice().resetFences(*stream.fence);
691 stream.memory->invalidateBuffer(plane_res.output_staging_buffer,
693 const auto* mapped_out =
static_cast<const float*
>(
694 plane_res.output_staging_buffer.getMappedData());
696 std::span<const float> mapped_span(
697 mapped_out,
static_cast<size_t>(width) *
static_cast<size_t>(height));
698 unpack_float_to_plane(mapped_span, output, plane, vsapi);