/*
 * Copyright © 2018 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "nir_instr_set.h"
#include "nir_search_helpers.h"
#include "nir_builder.h"
#include "util/u_vector.h"

/* Partial redundancy elimination of compares
 *
 * Searches for comparisons of the form 'a cmp b' that dominate arithmetic
 * instructions like 'b - a'.  The comparison is replaced by the arithmetic
 * instruction, and the result is compared with zero.  For example,
 *
 *    vec1 32 ssa_111 = flt 0.37, ssa_110.w
 *    if ssa_111 {
 *            block block_1:
 *            vec1 32 ssa_112 = fadd ssa_110.w, -0.37
 *            ...
 *
 * becomes
 *
 *    vec1 32 ssa_111 = fadd ssa_110.w, -0.37
 *    vec1 32 ssa_112 = flt 0.0, ssa_111
 *    if ssa_112 {
 *            block block_1:
 *            ...
 */
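
/* The mirrored form is handled as well.  As an illustrative sketch (the SSA
 * names are hypothetical, and the compare must feed an if-statement, as
 * required below): a comparison whose left-hand operand matches the add,
 *
 *    vec1 32 ssa_111 = flt ssa_110.w, 0.37
 *    if ssa_111 {
 *            block block_1:
 *            vec1 32 ssa_112 = fadd ssa_110.w, -0.37
 *            ...
 *
 * is instead rewritten with the zero on the right:
 *
 *    vec1 32 ssa_111 = fadd ssa_110.w, -0.37
 *    vec1 32 ssa_112 = flt ssa_111, 0.0
 *    if ssa_112 {
 *            block block_1:
 *            ...
 */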

struct block_queue {
   /**
    * Stack of blocks from the current location in the CFG to the entry point
    * of the function.
    *
    * This is sort of a poor man's dominator tree.
    */
   struct exec_list blocks;

   /** List of freed block_instructions structures that can be reused. */
   struct exec_list reusable_blocks;
};

struct block_instructions {
   struct exec_node node;

   /**
    * Set of comparison instructions from the block that are candidates for
    * being replaced by add instructions.
    */
   struct u_vector instructions;
};

static void
block_queue_init(struct block_queue *bq)
{
   exec_list_make_empty(&bq->blocks);
   exec_list_make_empty(&bq->reusable_blocks);
}

static void
block_queue_finish(struct block_queue *bq)
{
   struct block_instructions *n;

   while ((n = (struct block_instructions *)
               exec_list_pop_head(&bq->blocks)) != NULL) {
      u_vector_finish(&n->instructions);
      free(n);
   }

   while ((n = (struct block_instructions *)
               exec_list_pop_head(&bq->reusable_blocks)) != NULL) {
      free(n);
   }
}
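
/* push_block and pop_block bracket the processing of a single basic block
 * during the depth-first walk of the dominance tree in comparison_pre_block.
 * The u_vector of candidate compares is initialized on every push and
 * finished on every pop, so a structure recycled from reusable_blocks always
 * starts out empty.
 */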
static struct block_instructions *
push_block(struct block_queue *bq)
{
   struct block_instructions *bi =
      (struct block_instructions *) exec_list_pop_head(&bq->reusable_blocks);

   if (bi == NULL) {
      bi = calloc(1, sizeof(struct block_instructions));

      if (bi == NULL)
         return NULL;
   }

   if (!u_vector_init(&bi->instructions,
                      sizeof(nir_alu_instr *),
                      8 * sizeof(nir_alu_instr *))) {
      free(bi);
      return NULL;
   }

   exec_list_push_tail(&bq->blocks, &bi->node);

   return bi;
}

static void
pop_block(struct block_queue *bq, struct block_instructions *bi)
{
   u_vector_finish(&bi->instructions);
   exec_node_remove(&bi->node);
   exec_list_push_head(&bq->reusable_blocks, &bi->node);
}

static void
add_instruction_for_block(struct block_instructions *bi,
                          nir_alu_instr *alu)
{
   nir_alu_instr **data = u_vector_add(&bi->instructions);

   *data = alu;
}

static void
rewrite_compare_instruction(nir_builder *bld, nir_alu_instr *orig_cmp,
                            nir_alu_instr *orig_add, bool zero_on_left)
{
   void *const mem_ctx = ralloc_parent(orig_cmp);

   bld->cursor = nir_before_instr(&orig_cmp->instr);

   /* This is somewhat tricky.  The compare instruction may be something like
    * (fcmp, a, b) while the add instruction is something like (fadd, fneg(a),
    * b).  This is problematic because the SSA value for the fneg(a) may not
    * exist yet at the compare instruction.
    *
    * We fabricate the operands of the new add.  This is done using
    * information provided by zero_on_left.  If zero_on_left is true, we know
    * the resulting compare instruction is (fcmp, 0.0, (fadd, x, y)).  If the
    * original compare instruction was (fcmp, a, b), x = b and y = -a.  If
    * zero_on_left is false, the resulting compare instruction is (fcmp,
    * (fadd, x, y), 0.0) and x = a and y = -b.
    */
   nir_ssa_def *const a = nir_ssa_for_alu_src(bld, orig_cmp, 0);
   nir_ssa_def *const b = nir_ssa_for_alu_src(bld, orig_cmp, 1);

   nir_ssa_def *const fadd = zero_on_left
      ? nir_fadd(bld, b, nir_fneg(bld, a))
      : nir_fadd(bld, a, nir_fneg(bld, b));

   nir_ssa_def *const zero =
      nir_imm_floatN_t(bld, 0.0, orig_add->dest.dest.ssa.bit_size);

   nir_ssa_def *const cmp = zero_on_left
      ? nir_build_alu(bld, orig_cmp->op, zero, fadd, NULL, NULL)
      : nir_build_alu(bld, orig_cmp->op, fadd, zero, NULL, NULL);

   /* Generating extra moves of the results is the easy way to make sure the
    * writemasks match the original instructions.  Later optimization passes
    * will clean these up.  This is similar to nir_replace_instr (in
    * nir_search.c).
    */
   nir_alu_instr *mov_add = nir_alu_instr_create(mem_ctx, nir_op_mov);
   mov_add->dest.write_mask = orig_add->dest.write_mask;
   nir_ssa_dest_init(&mov_add->instr, &mov_add->dest.dest,
                     orig_add->dest.dest.ssa.num_components,
                     orig_add->dest.dest.ssa.bit_size, NULL);
   mov_add->src[0].src = nir_src_for_ssa(fadd);

   nir_builder_instr_insert(bld, &mov_add->instr);

   nir_alu_instr *mov_cmp = nir_alu_instr_create(mem_ctx, nir_op_mov);
   mov_cmp->dest.write_mask = orig_cmp->dest.write_mask;
   nir_ssa_dest_init(&mov_cmp->instr, &mov_cmp->dest.dest,
                     orig_cmp->dest.dest.ssa.num_components,
                     orig_cmp->dest.dest.ssa.bit_size, NULL);
   mov_cmp->src[0].src = nir_src_for_ssa(cmp);

   nir_builder_instr_insert(bld, &mov_cmp->instr);

   nir_ssa_def_rewrite_uses(&orig_cmp->dest.dest.ssa,
                            &mov_cmp->dest.dest.ssa);
   nir_ssa_def_rewrite_uses(&orig_add->dest.dest.ssa,
                            &mov_add->dest.dest.ssa);

   /* We know these have no more uses because we just rewrote them all, so we
    * can remove them.
    */
   nir_instr_remove(&orig_cmp->instr);
   nir_instr_remove(&orig_add->instr);
}
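
/* As a concrete illustration of the four permutations matched below
 * (hypothetical values a and b): a dominating 'flt a, b' pairs with
 * 'fadd a, -b' or 'fadd -b, a', which rewrite to 'flt ssa_new, 0.0', and
 * with 'fadd b, -a' or 'fadd -a, b', which rewrite to 'flt 0.0, ssa_new',
 * where ssa_new is the value of the hoisted fadd.
 */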
static bool
comparison_pre_block(nir_block *block, struct block_queue *bq, nir_builder *bld)
{
   bool progress = false;

   struct block_instructions *bi = push_block(bq);
   if (bi == NULL)
      return false;

   /* Starting with the current block, examine each instruction.  If the
    * instruction is a comparison that matches the '±a cmp ±b' pattern, add
    * it to the block_instructions::instructions set.  If the instruction is
    * an add instruction, walk up the block queue looking at the stored
    * instructions.  If a matching comparison is found, move the addition and
    * replace the comparison with a different comparison based on the result
    * of the addition.  All of the blocks in the queue are guaranteed to be
    * dominators of the current block.
    *
    * After processing the current block, recurse into the blocks dominated
    * by the current block.
    */
   nir_foreach_instr_safe(instr, block) {
      if (instr->type != nir_instr_type_alu)
         continue;

      nir_alu_instr *const alu = nir_instr_as_alu(instr);

      if (alu->dest.dest.ssa.num_components != 1)
         continue;

      if (alu->dest.saturate)
         continue;

      static const uint8_t swizzle[NIR_MAX_VEC_COMPONENTS] = {0};

      switch (alu->op) {
      case nir_op_fadd: {
         /* If the instruction is fadd, check it against comparison
          * instructions that dominate it.
          */
         struct block_instructions *b =
            (struct block_instructions *)
            exec_list_get_head_raw(&bq->blocks);

         while (b->node.next != NULL) {
            nir_alu_instr **a;
            bool rewrote_compare = false;

            u_vector_foreach(a, &b->instructions) {
               nir_alu_instr *const cmp = *a;

               if (cmp == NULL)
                  continue;

               /* The operands of both instructions are, with some liberty,
                * commutative.  Check all four permutations.  The third and
                * fourth permutations are negations of the first two.
                */
               if ((nir_alu_srcs_equal(cmp, alu, 0, 0) &&
                    nir_alu_srcs_negative_equal(cmp, alu, 1, 1)) ||
                   (nir_alu_srcs_equal(cmp, alu, 0, 1) &&
                    nir_alu_srcs_negative_equal(cmp, alu, 1, 0))) {
                  /* These are the cases where (A cmp B) matches either
                   * (A + -B) or (-B + A).
                   *
                   *    A cmp B <=> A + -B cmp 0
                   */
                  rewrite_compare_instruction(bld, cmp, alu, false);

                  *a = NULL;
                  rewrote_compare = true;
                  break;
               } else if ((nir_alu_srcs_equal(cmp, alu, 1, 0) &&
                           nir_alu_srcs_negative_equal(cmp, alu, 0, 1)) ||
                          (nir_alu_srcs_equal(cmp, alu, 1, 1) &&
                           nir_alu_srcs_negative_equal(cmp, alu, 0, 0))) {
                  /* This is the case where (A cmp B) matches (B + -A) or
                   * (-A + B).
                   *
                   *    A cmp B <=> 0 cmp B + -A
                   */
                  rewrite_compare_instruction(bld, cmp, alu, true);

                  *a = NULL;
                  rewrote_compare = true;
                  break;
               }
            }

            /* Bail after a compare in the most dominating block is found.
             * This is necessary because 'alu' has been removed from the
             * instruction stream.  Should there be a matching compare in
             * another block, calling rewrite_compare_instruction again will
             * try to operate on a node that is not in the list as if it were
             * in the list.
             *
             * FINISHME: There may be opportunity for additional optimization
             * here.  I discovered this problem due to a shader in Guacamelee.
             * It may be possible to rewrite the matching compares that are
             * encountered later to reuse the result from the compare that
             * was first rewritten.  It's also possible that this is just
             * taken care of by calling the optimization pass repeatedly.
             */
            if (rewrote_compare) {
               progress = true;
               break;
            }

            b = (struct block_instructions *) b->node.next;
         }

         break;
      }

      case nir_op_flt:
      case nir_op_fge:
      case nir_op_fneu:
      case nir_op_feq:
         /* If the instruction is a comparison that is used by an
          * if-statement and neither operand is the immediate value zero,
          * add it to the set.
          */
         if (is_used_by_if(alu) &&
             is_not_const_zero(NULL, alu, 0, 1, swizzle) &&
             is_not_const_zero(NULL, alu, 1, 1, swizzle))
            add_instruction_for_block(bi, alu);

         break;

      default:
         break;
      }
   }

   for (unsigned i = 0; i < block->num_dom_children; i++) {
      nir_block *child = block->dom_children[i];

      if (comparison_pre_block(child, bq, bld))
         progress = true;
   }

   pop_block(bq, bi);

   return progress;
}

bool
nir_opt_comparison_pre_impl(nir_function_impl *impl)
{
   struct block_queue bq;
   nir_builder bld;

   block_queue_init(&bq);
   nir_builder_init(&bld, impl);

   nir_metadata_require(impl, nir_metadata_dominance);

   const bool progress =
      comparison_pre_block(nir_start_block(impl), &bq, &bld);

   block_queue_finish(&bq);

   if (progress) {
      nir_metadata_preserve(impl, nir_metadata_block_index |
                                  nir_metadata_dominance);
   } else {
      nir_metadata_preserve(impl, nir_metadata_all);
   }

   return progress;
}

bool
nir_opt_comparison_pre(nir_shader *shader)
{
   bool progress = false;

   nir_foreach_function(function, shader) {
      if (function->impl)
         progress |= nir_opt_comparison_pre_impl(function->impl);
   }

   return progress;
}
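
/* A sketch of typical usage, not taken from this file: like other NIR
 * optimization passes, this one is normally run from a driver's optimization
 * loop until no pass reports progress, e.g.
 *
 *    bool progress;
 *    do {
 *       progress = false;
 *       progress |= nir_opt_comparison_pre(shader);
 *       // ... other nir_opt_* passes ...
 *    } while (progress);
 *
 * Repeated invocation also covers the FINISHME note above, where matching
 * compares encountered after the first rewrite are skipped within a single
 * invocation.
 */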