From 110ce1a9e0476f08823d144c5aab01a27413f908 Mon Sep 17 00:00:00 2001
From: Boris Brezillon
Date: Mon, 18 Sep 2023 11:38:13 +0200
Subject: [PATCH 55/64] panfrost: Deduplicate panfrost_launch_grid()

Keep the common bits in the panfrost_launch_grid() helper, and move the
job-backend-specific bits to new {csf,jm}_launch_grid() helpers.
---
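Note: this relies on a JOBX() macro to route the common code to the right
job-backend helper at compile time. The macro itself is not part of this
patch; a minimal sketch of the assumed definition, inferred from the
PAN_USE_CSF guard and the csf_/jm_ prefixes of the helpers below:

  /* Dispatch to the job backend selected at compile time:
   * JOBX(launch_grid) expands to csf_launch_grid() when the CSF
   * (Command Stream Frontend) backend is built, and to
   * jm_launch_grid() for the legacy Job Manager backend. */
  #if PAN_USE_CSF
  #define JOBX(x) csf_##x
  #else
  #define JOBX(x) jm_##x
  #endif

With that, the JOBX(launch_grid)(batch, info) call in the common
panfrost_launch_grid() resolves to the matching backend helper with no
runtime indirection.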
 src/gallium/drivers/panfrost/pan_cmdstream.c | 294 +++++++++----------
 1 file changed, 140 insertions(+), 154 deletions(-)

diff --git a/src/gallium/drivers/panfrost/pan_cmdstream.c b/src/gallium/drivers/panfrost/pan_cmdstream.c
index 5f8822be653..519f44ef7a1 100644
--- a/src/gallium/drivers/panfrost/pan_cmdstream.c
+++ b/src/gallium/drivers/panfrost/pan_cmdstream.c
@@ -3779,33 +3779,14 @@ panfrost_update_point_sprite_shader(struct panfrost_context *ctx,
 }
 
 #if PAN_USE_CSF
-/*
- * Launch grid is the compute equivalent of draw_vbo. Set up the registers for a
- * compute kernel and emit the run_compute command.
- */
 static void
-panfrost_launch_grid(struct pipe_context *pipe,
-                     const struct pipe_grid_info *info)
+csf_launch_grid(struct panfrost_batch *batch, const struct pipe_grid_info *info)
 {
-   struct panfrost_context *ctx = pan_context(pipe);
-
-   /* XXX - shouldn't be necessary with working memory barriers. Affected
-    * test: KHR-GLES31.core.compute_shader.pipeline-post-xfb */
-   panfrost_flush_all_batches(ctx, "Launch grid pre-barrier");
-
-   struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
-
-   ctx->compute_grid = info;
-
-   /* Conservatively assume workgroup size changes every launch */
-   ctx->dirty |= PAN_DIRTY_PARAMS;
-
-   panfrost_update_shader_state(batch, PIPE_SHADER_COMPUTE);
-
    /* Empty compute programs are invalid and don't make sense */
    if (batch->rsd[PIPE_SHADER_COMPUTE] == 0)
       return;
 
+   struct panfrost_context *ctx = batch->ctx;
    struct panfrost_compiled_shader *cs = ctx->prog[PIPE_SHADER_COMPUTE];
 
    ceu_builder *b = batch->ceu_builder;
@@ -3879,10 +3860,147 @@ panfrost_launch_grid(struct pipe_context *pipe,
     */
    ceu_run_compute(b, 10, MALI_TASK_AXIS_Z);
    batch->any_compute = true;
+}
+#else // PAN_USE_CSF
+static void
+jm_launch_grid(struct panfrost_batch *batch, const struct pipe_grid_info *info)
+{
+   struct panfrost_ptr t = pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB);
+
+   /* Invoke according to the grid info */
+
+   unsigned num_wg[3] = {info->grid[0], info->grid[1], info->grid[2]};
+
+   if (info->indirect)
+      num_wg[0] = num_wg[1] = num_wg[2] = 1;
+
+#if PAN_ARCH <= 7
+   panfrost_pack_work_groups_compute(
+      pan_section_ptr(t.cpu, COMPUTE_JOB, INVOCATION), num_wg[0], num_wg[1],
+      num_wg[2], info->block[0], info->block[1], info->block[2], false,
+      info->indirect != NULL);
+
+   pan_section_pack(t.cpu, COMPUTE_JOB, PARAMETERS, cfg) {
+      cfg.job_task_split = util_logbase2_ceil(info->block[0] + 1) +
+                           util_logbase2_ceil(info->block[1] + 1) +
+                           util_logbase2_ceil(info->block[2] + 1);
+   }
+
+   pan_section_pack(t.cpu, COMPUTE_JOB, DRAW, cfg) {
+      cfg.state = batch->rsd[PIPE_SHADER_COMPUTE];
+      cfg.attributes = panfrost_emit_image_attribs(
+         batch, &cfg.attribute_buffers, PIPE_SHADER_COMPUTE);
+      cfg.thread_storage = panfrost_emit_shared_memory(batch, info);
+      cfg.uniform_buffers = batch->uniform_buffers[PIPE_SHADER_COMPUTE];
+      cfg.push_uniforms = batch->push_uniforms[PIPE_SHADER_COMPUTE];
+      cfg.textures = batch->textures[PIPE_SHADER_COMPUTE];
+      cfg.samplers = batch->samplers[PIPE_SHADER_COMPUTE];
+   }
+#else
+   struct panfrost_context *ctx = batch->ctx;
+   struct panfrost_compiled_shader *cs = ctx->prog[PIPE_SHADER_COMPUTE];
+
+   pan_section_pack(t.cpu, COMPUTE_JOB, PAYLOAD, cfg) {
+      cfg.workgroup_size_x = info->block[0];
+      cfg.workgroup_size_y = info->block[1];
+      cfg.workgroup_size_z = info->block[2];
+
+      cfg.workgroup_count_x = num_wg[0];
+      cfg.workgroup_count_y = num_wg[1];
+      cfg.workgroup_count_z = num_wg[2];
+
+      panfrost_emit_shader(batch, &cfg.compute, PIPE_SHADER_COMPUTE,
+                           batch->rsd[PIPE_SHADER_COMPUTE],
+                           panfrost_emit_shared_memory(batch, info));
+
+      /* Workgroups may be merged if the shader does not use barriers
+       * or shared memory. This condition is checked against the
+       * static shared_size at compile-time. We need to check the
+       * variable shared size at launch_grid time, because the
+       * compiler doesn't know about that.
+       */
+      cfg.allow_merging_workgroups = cs->info.cs.allow_merging_workgroups &&
+                                     (info->variable_shared_mem == 0);
+
+      cfg.task_increment = 1;
+      cfg.task_axis = MALI_TASK_AXIS_Z;
+   }
+#endif
+
+   unsigned indirect_dep = 0;
+#if PAN_GPU_INDIRECTS
+   if (info->indirect) {
+      struct pan_indirect_dispatch_info indirect = {
+         .job = t.gpu,
+         .indirect_dim = pan_resource(info->indirect)->image.data.bo->ptr.gpu +
+                         info->indirect_offset,
+         .num_wg_sysval =
+            {
+               batch->num_wg_sysval[0],
+               batch->num_wg_sysval[1],
+               batch->num_wg_sysval[2],
+            },
+      };
+
+      indirect_dep = GENX(pan_indirect_dispatch_emit)(
+         &batch->pool.base, &batch->scoreboard, &indirect);
+   }
+#endif
+
+   panfrost_add_job(&batch->pool.base, &batch->scoreboard,
+                    MALI_JOB_TYPE_COMPUTE, true, false, indirect_dep, 0, &t,
+                    false);
+   batch->any_compute = true;
+}
+#endif // PAN_USE_CSF
+
+/*
+ * Launch grid is the compute equivalent of draw_vbo. Set up the registers for a
+ * compute kernel and emit the run_compute command.
+ */
+static void
+panfrost_launch_grid(struct pipe_context *pipe,
+                     const struct pipe_grid_info *info)
+{
+   struct panfrost_context *ctx = pan_context(pipe);
+
+   /* XXX - shouldn't be necessary with working memory barriers. Affected
+    * test: KHR-GLES31.core.compute_shader.pipeline-post-xfb */
+   panfrost_flush_all_batches(ctx, "Launch grid pre-barrier");
+
+   struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
+
+   if (info->indirect && !PAN_GPU_INDIRECTS) {
+      struct pipe_transfer *transfer;
+      uint32_t *params =
+         pipe_buffer_map_range(pipe, info->indirect, info->indirect_offset,
+                               3 * sizeof(uint32_t), PIPE_MAP_READ, &transfer);
+
+      struct pipe_grid_info direct = *info;
+      direct.indirect = NULL;
+      direct.grid[0] = params[0];
+      direct.grid[1] = params[1];
+      direct.grid[2] = params[2];
+      pipe_buffer_unmap(pipe, transfer);
+
+      if (params[0] && params[1] && params[2])
+         panfrost_launch_grid(pipe, &direct);
+
+      return;
+   }
+
+   ctx->compute_grid = info;
+
+   /* Conservatively assume workgroup size changes every launch */
+   ctx->dirty |= PAN_DIRTY_PARAMS;
+
+   panfrost_update_shader_state(batch, PIPE_SHADER_COMPUTE);
+
+   JOBX(launch_grid)(batch, info);
 
    panfrost_flush_all_batches(ctx, "Launch grid post-barrier");
 }
-#endif
+
 /*
  * Entrypoint for draws on JM/CSF Mali. Depending on generation, this requires
  * emitting jobs for indirect drawing, transform feedback, vertex shading, and
@@ -4265,138 +4383,6 @@ panfrost_direct_draw(struct panfrost_batch *batch,
 #endif
 }
 
-#if !PAN_USE_CSF
-/*
- * Launch grid is the compute equivalent of draw_vbo, so in this routine, we
- * construct the COMPUTE job and add it to the job chain.
- */
-static void
-panfrost_launch_grid(struct pipe_context *pipe,
-                     const struct pipe_grid_info *info)
-{
-   struct panfrost_context *ctx = pan_context(pipe);
-
-   /* XXX - shouldn't be necessary with working memory barriers. Affected
-    * test: KHR-GLES31.core.compute_shader.pipeline-post-xfb */
-   panfrost_flush_all_batches(ctx, "Launch grid pre-barrier");
-
-   struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
-
-   if (info->indirect && !PAN_GPU_INDIRECTS) {
-      struct pipe_transfer *transfer;
-      uint32_t *params =
-         pipe_buffer_map_range(pipe, info->indirect, info->indirect_offset,
-                               3 * sizeof(uint32_t), PIPE_MAP_READ, &transfer);
-
-      struct pipe_grid_info direct = *info;
-      direct.indirect = NULL;
-      direct.grid[0] = params[0];
-      direct.grid[1] = params[1];
-      direct.grid[2] = params[2];
-      pipe_buffer_unmap(pipe, transfer);
-
-      if (params[0] && params[1] && params[2])
-         panfrost_launch_grid(pipe, &direct);
-
-      return;
-   }
-
-   ctx->compute_grid = info;
-
-   struct panfrost_ptr t = pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB);
-
-   /* Invoke according to the grid info */
-
-   unsigned num_wg[3] = {info->grid[0], info->grid[1], info->grid[2]};
-
-   if (info->indirect)
-      num_wg[0] = num_wg[1] = num_wg[2] = 1;
-
-   /* Conservatively assume workgroup size changes every launch */
-   ctx->dirty |= PAN_DIRTY_PARAMS;
-
-   panfrost_update_shader_state(batch, PIPE_SHADER_COMPUTE);
-
-#if PAN_ARCH <= 7
-   panfrost_pack_work_groups_compute(
-      pan_section_ptr(t.cpu, COMPUTE_JOB, INVOCATION), num_wg[0], num_wg[1],
-      num_wg[2], info->block[0], info->block[1], info->block[2], false,
-      info->indirect != NULL);
-
-   pan_section_pack(t.cpu, COMPUTE_JOB, PARAMETERS, cfg) {
-      cfg.job_task_split = util_logbase2_ceil(info->block[0] + 1) +
-                           util_logbase2_ceil(info->block[1] + 1) +
-                           util_logbase2_ceil(info->block[2] + 1);
-   }
-
-   pan_section_pack(t.cpu, COMPUTE_JOB, DRAW, cfg) {
-      cfg.state = batch->rsd[PIPE_SHADER_COMPUTE];
-      cfg.attributes = panfrost_emit_image_attribs(
-         batch, &cfg.attribute_buffers, PIPE_SHADER_COMPUTE);
-      cfg.thread_storage = panfrost_emit_shared_memory(batch, info);
-      cfg.uniform_buffers = batch->uniform_buffers[PIPE_SHADER_COMPUTE];
-      cfg.push_uniforms = batch->push_uniforms[PIPE_SHADER_COMPUTE];
-      cfg.textures = batch->textures[PIPE_SHADER_COMPUTE];
-      cfg.samplers = batch->samplers[PIPE_SHADER_COMPUTE];
-   }
-#else
-   struct panfrost_compiled_shader *cs = ctx->prog[PIPE_SHADER_COMPUTE];
-
-   pan_section_pack(t.cpu, COMPUTE_JOB, PAYLOAD, cfg) {
-      cfg.workgroup_size_x = info->block[0];
-      cfg.workgroup_size_y = info->block[1];
-      cfg.workgroup_size_z = info->block[2];
-
-      cfg.workgroup_count_x = num_wg[0];
-      cfg.workgroup_count_y = num_wg[1];
-      cfg.workgroup_count_z = num_wg[2];
-
-      panfrost_emit_shader(batch, &cfg.compute, PIPE_SHADER_COMPUTE,
-                           batch->rsd[PIPE_SHADER_COMPUTE],
-                           panfrost_emit_shared_memory(batch, info));
-
-      /* Workgroups may be merged if the shader does not use barriers
-       * or shared memory. This condition is checked against the
-       * static shared_size at compile-time. We need to check the
-       * variable shared size at launch_grid time, because the
-       * compiler doesn't know about that.
-       */
-      cfg.allow_merging_workgroups = cs->info.cs.allow_merging_workgroups &&
-                                     (info->variable_shared_mem == 0);
-
-      cfg.task_increment = 1;
-      cfg.task_axis = MALI_TASK_AXIS_Z;
-   }
-#endif
-
-   unsigned indirect_dep = 0;
-#if PAN_GPU_INDIRECTS
-   if (info->indirect) {
-      struct pan_indirect_dispatch_info indirect = {
-         .job = t.gpu,
-         .indirect_dim = pan_resource(info->indirect)->image.data.bo->ptr.gpu +
-                         info->indirect_offset,
-         .num_wg_sysval =
-            {
-               batch->num_wg_sysval[0],
-               batch->num_wg_sysval[1],
-               batch->num_wg_sysval[2],
-            },
-      };
-
-      indirect_dep = GENX(pan_indirect_dispatch_emit)(
-         &batch->pool.base, &batch->scoreboard, &indirect);
-   }
-#endif
-
-   panfrost_add_job(&batch->pool.base, &batch->scoreboard,
-                    MALI_JOB_TYPE_COMPUTE, true, false, indirect_dep, 0, &t,
-                    false);
-   batch->any_compute = true;
-   panfrost_flush_all_batches(ctx, "Launch grid post-barrier");
-}
-#endif
-
 #if PAN_USE_CSF
 static inline void
 csf_prepare_first_draw(struct panfrost_batch *batch)
-- 
2.42.0