446 {
447 [[unlikely]] if (impl == nullptr) {
448 throw std::runtime_error("VkExprExecutor is not initialized.");
449 }
450 [[unlikely]] if (plane < 0 || plane > 2) {
451 throw std::runtime_error("Invalid plane index.");
452 }
453 [[unlikely]] if (inputs.size() != static_cast<size_t>(impl->num_inputs)) {
454 throw std::runtime_error(
455 "VkExprExecutor: unexpected number of inputs.");
456 }
457 [[unlikely]] if (impl->glsl_stages.at(plane).empty()) {
458 throw std::runtime_error(
459 "VkExprExecutor: no shader for requested plane.");
460 }
461 const int width = vsapi->getFrameWidth(output, plane);
462 const int height = vsapi->getFrameHeight(output, plane);
463 [[unlikely]] if (width <= 0 || height <= 0) {
464 throw std::runtime_error("Invalid output dimensions.");
465 }
466
467 for (const auto* in : inputs) {
468 [[unlikely]] if (vsapi->getFrameWidth(in, plane) != width ||
469 vsapi->getFrameHeight(in, plane) != height) {
470 throw std::runtime_error("Input/output plane dimension mismatch.");
471 }
472 }
473
474 VkDeviceSize buffer_size = static_cast<VkDeviceSize>(width) *
475 static_cast<VkDeviceSize>(height) *
476 sizeof(float);
477
478 const int stream_idx = impl->acquireStreamIndex();
479 struct StreamReleaser {
480 Impl& impl;
481 int idx;
482 StreamReleaser(Impl& i, int x) : impl(i), idx(x) {}
483 ~StreamReleaser() { impl.releaseStreamIndex(idx); }
484 StreamReleaser(const StreamReleaser&) = delete;
485 StreamReleaser& operator=(const StreamReleaser&) = delete;
486 StreamReleaser(StreamReleaser&&) = delete;
487 StreamReleaser& operator=(StreamReleaser&&) = delete;
488 } releaser(*impl, stream_idx);
489
490 auto& stream = *impl->streams.at(static_cast<size_t>(stream_idx));
491 auto& plane_res = stream.plane_resources.at(plane);
492 auto& stage_sources = impl->glsl_stages.at(plane);
493
494
495 if (!plane_res.initialized || plane_res.buffer_size < buffer_size ||
496 plane_res.input_buffers.size() != inputs.size() ||
497 stream.pipelines.at(plane).size() != stage_sources.size()) {
498
499 stream.freePlaneResources(plane_res);
500
501 plane_res.input_buffers.resize(inputs.size());
502 plane_res.input_staging_buffers.resize(inputs.size());
503
504 for (size_t i = 0; i < inputs.size(); ++i) {
505 plane_res.input_buffers[i] =
506 stream.memory->createGPUBuffer(buffer_size);
507 plane_res.input_staging_buffers[i] =
508 stream.memory->createStagingBuffer(buffer_size, true);
509 }
510
511 plane_res.output_buffer = stream.memory->createGPUBuffer(buffer_size);
512 plane_res.output_staging_buffer =
513 stream.memory->createStagingBuffer(buffer_size, false);
514
515 const size_t num_intermediates =
516 (stage_sources.size() > 0) ? (stage_sources.size() - 1) : 0;
517 plane_res.intermediate_buffers.resize(num_intermediates);
518 for (size_t i = 0; i < num_intermediates; ++i) {
519 plane_res.intermediate_buffers[i] =
520 stream.memory->createGPUBuffer(buffer_size);
521 }
522
523 plane_res.buffer_size = buffer_size;
524 plane_res.initialized = true;
525 }
526
527
528 VkDeviceSize props_size = props.size() * sizeof(float);
529 if (props_size > 0) {
530 if (!plane_res.props_buffer.isValid() ||
531 plane_res.props_size < props_size) {
532 if (plane_res.props_buffer.isValid()) {
533 stream.memory->destroyBuffer(plane_res.props_buffer);
534 stream.memory->destroyBuffer(plane_res.props_staging_buffer);
535 }
536
537 plane_res.props_buffer = stream.memory->createGPUBuffer(
538 props_size, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
539 VK_BUFFER_USAGE_TRANSFER_DST_BIT);
540 plane_res.props_staging_buffer =
541 stream.memory->createStagingBuffer(props_size, true);
542 plane_res.props_size = props_size;
543 }
544 }
545
546
547 for (size_t i = 0; i < inputs.size(); ++i) {
548 auto* mapped_data = static_cast<float*>(
549 plane_res.input_staging_buffers[i].getMappedData());
550 std::span<float> mapped_span(mapped_data,
551 static_cast<size_t>(width) *
552 static_cast<size_t>(height));
553
554 pack_plane_to_float(mapped_span, inputs[i], plane, vsapi);
555 stream.memory->flushBuffer(plane_res.input_staging_buffers[i],
556 buffer_size);
557 }
558
559
560 if (plane_res.props_buffer.isValid() && props_size > 0) {
561 std::memcpy(plane_res.props_staging_buffer.getMappedData(),
562 props.data(), props_size);
563 stream.memory->flushBuffer(plane_res.props_staging_buffer, props_size);
564 }
565
566 stream.command_buffer.reset();
567 vk::CommandBufferBeginInfo begin_info(
568 vk::CommandBufferUsageFlagBits::eOneTimeSubmit);
569 stream.command_buffer.begin(begin_info);
570
571
572 for (size_t i = 0; i < inputs.size(); ++i) {
573 vk::BufferCopy region(0, 0, buffer_size);
574 stream.command_buffer.copyBuffer(
575 vk::Buffer(plane_res.input_staging_buffers[i].buffer),
576 vk::Buffer(plane_res.input_buffers[i].buffer), region);
577 }
578
579
580 if (plane_res.props_buffer.isValid() && props_size > 0) {
581 vk::BufferCopy region(0, 0, props_size);
582 stream.command_buffer.copyBuffer(
583 vk::Buffer(plane_res.props_staging_buffer.buffer),
584 vk::Buffer(plane_res.props_buffer.buffer), region);
585 }
586
587
588 std::vector<vk::BufferMemoryBarrier> to_compute_barriers;
589 to_compute_barriers.reserve(inputs.size() + 1);
590 for (size_t i = 0; i < inputs.size(); ++i) {
591 vk::BufferMemoryBarrier b;
592 b.srcAccessMask = vk::AccessFlagBits::eTransferWrite;
593 b.dstAccessMask = vk::AccessFlagBits::eShaderRead;
594 b.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
595 b.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
596 b.buffer = vk::Buffer(plane_res.input_buffers[i].buffer);
597 b.offset = 0;
598 b.size = VK_WHOLE_SIZE;
599 to_compute_barriers.push_back(b);
600 }
601 if (plane_res.props_buffer.isValid() && props_size > 0) {
602 vk::BufferMemoryBarrier b;
603 b.srcAccessMask = vk::AccessFlagBits::eTransferWrite;
604 b.dstAccessMask = vk::AccessFlagBits::eShaderRead;
605 b.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
606 b.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
607 b.buffer = vk::Buffer(plane_res.props_buffer.buffer);
608 b.offset = 0;
609 b.size = VK_WHOLE_SIZE;
610 to_compute_barriers.push_back(b);
611 }
612 if (!to_compute_barriers.empty()) {
613 stream.command_buffer.pipelineBarrier(
614 vk::PipelineStageFlagBits::eTransfer,
615 vk::PipelineStageFlagBits::eComputeShader, {}, {},
616 to_compute_barriers, {});
617 }
618
619 const size_t num_stages = stage_sources.size();
620 for (size_t s = 0; s < num_stages; ++s) {
621 std::vector<VulkanBuffer*> dispatch_inputs;
622 dispatch_inputs.reserve(inputs.size() + s);
623 for (size_t i = 0; i < inputs.size(); ++i) {
624 dispatch_inputs.push_back(&plane_res.input_buffers[i]);
625 }
626 for (size_t k = 0; k < s; ++k) {
627 dispatch_inputs.push_back(&plane_res.intermediate_buffers[k]);
628 }
629
630 VulkanBuffer* dispatch_output =
631 (s == num_stages - 1) ? &plane_res.output_buffer
632 : &plane_res.intermediate_buffers[s];
633
634 stream.pipelines.at(plane).at(s)->recordDispatch(
635 stream.command_buffer, dispatch_inputs, *dispatch_output,
636 (plane_res.props_buffer.isValid() && props_size > 0)
637 ? &plane_res.props_buffer
638 : nullptr,
639 static_cast<std::uint32_t>(width),
640 static_cast<std::uint32_t>(height), frame_number);
641
642 if (s < num_stages - 1) {
643 vk::BufferMemoryBarrier b;
644 b.srcAccessMask = vk::AccessFlagBits::eShaderWrite;
645 b.dstAccessMask = vk::AccessFlagBits::eShaderRead;
646 b.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
647 b.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
648 b.buffer = vk::Buffer(dispatch_output->buffer);
649 b.offset = 0;
650 b.size = VK_WHOLE_SIZE;
651 std::array<vk::BufferMemoryBarrier, 1> bar = {b};
652
653 stream.command_buffer.pipelineBarrier(
654 vk::PipelineStageFlagBits::eComputeShader,
655 vk::PipelineStageFlagBits::eComputeShader, {}, {}, bar, {});
656 }
657 }
658
659
660 vk::BufferMemoryBarrier to_transfer;
661 to_transfer.srcAccessMask = vk::AccessFlagBits::eShaderWrite;
662 to_transfer.dstAccessMask = vk::AccessFlagBits::eTransferRead;
663 to_transfer.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
664 to_transfer.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
665 to_transfer.buffer = vk::Buffer(plane_res.output_buffer.buffer);
666 to_transfer.offset = 0;
667 to_transfer.size = VK_WHOLE_SIZE;
668 std::array<vk::BufferMemoryBarrier, 1> to_transfer_barriers = {to_transfer};
669 stream.command_buffer.pipelineBarrier(
670 vk::PipelineStageFlagBits::eComputeShader,
671 vk::PipelineStageFlagBits::eTransfer, {}, {}, to_transfer_barriers, {});
672
673 vk::BufferCopy download_region(0, 0, buffer_size);
674 stream.command_buffer.copyBuffer(
675 vk::Buffer(plane_res.output_buffer.buffer),
676 vk::Buffer(plane_res.output_staging_buffer.buffer), download_region);
677
678 stream.command_buffer.end();
679
680 vk::SubmitInfo submit_info;
681 submit_info.setCommandBuffers(*stream.command_buffer);
682 impl->context->submit(submit_info, *stream.fence);
683
684 auto result = impl->context->getDevice().waitForFences(*stream.fence,
685 VK_TRUE, UINT64_MAX);
686 if (result != vk::Result::eSuccess) {
687 throw std::runtime_error("Failed to wait for VkExpr fence");
688 }
689 impl->context->getDevice().resetFences(*stream.fence);
690
691 stream.memory->invalidateBuffer(plane_res.output_staging_buffer,
692 buffer_size);
693 const auto* mapped_out = static_cast<const float*>(
694 plane_res.output_staging_buffer.getMappedData());
695
696 std::span<const float> mapped_span(
697 mapped_out, static_cast<size_t>(width) * static_cast<size_t>(height));
698 unpack_float_to_plane(mapped_span, output, plane, vsapi);
699}