WARNING: This server is unstable and will be retired in the next days. If you want to keep this forum available, please request immediately a migration on the Nabble Support forum. Forums that don't receive any migration request will be deleted forever.

[PATCH] Hoist adjacent pointer loads

View: New views
20 Messages — Rating Filter:   Alert me  
< Prev | 1 - 2 | Next >

[PATCH] Hoist adjacent pointer loads

by William J. Schmidt :: Rate this Message:

| View Threaded | Show Only this Message

This patch was posted for comment back in February during stage 4.  It
addresses a performance issue noted in the EEMBC routelookup benchmark
on a common idiom:

  if (...)
    x = y->left;
  else
    x = y->right;

If the two loads can be hoisted out of the if/else, the if/else can be
replaced by a conditional move instruction on architectures that support
one.  Because this speculates one of the loads, the patch constrains the
optimization to avoid introducing page faults.

Bootstrapped and regression tested on powerpc-unknown-linux-gnu with no
new failures.  The patch provides significant improvement to the
routelookup benchmark, and is neutral on SPEC cpu2000/cpu2006.

One question is what optimization level should be required for this.
Because of the speculation, -O3 might be in order.  I don't believe
-Ofast is required as there is no potential correctness issue involved.
Right now the patch doesn't check the optimization level (like the rest
of the phi-opt transforms), which is likely a poor choice.

Ok for trunk?

Thanks,
Bill


2012-05-03  Bill Schmidt  <wschmidt@...>

        * tree-ssa-phiopt.c (tree_ssa_phiopt_worker): Add argument to forward
        declaration.
        (hoist_adjacent_loads, gate_hoist_loads): New forward declarations.
        (tree_ssa_phiopt): Call gate_hoist_loads.
        (tree_ssa_cs_elim): Add parm to tree_ssa_phiopt_worker call.
        (tree_ssa_phiopt_worker): Add do_hoist_loads to formal arg list; call
        hoist_adjacent_loads.
        (local_reg_dependence): New function.
        (local_mem_dependence): Likewise.
        (hoist_adjacent_loads): Likewise.
        (gate_hoist_loads): Likewise.
        * common.opt (fhoist-adjacent-loads): New switch.
        * Makefile.in (tree-ssa-phiopt.o): Added dependencies.
        * params.def (PARAM_MIN_CMOVE_STRUCT_ALIGN): New param.


Index: gcc/tree-ssa-phiopt.c
===================================================================
--- gcc/tree-ssa-phiopt.c (revision 187057)
+++ gcc/tree-ssa-phiopt.c (working copy)
@@ -37,9 +37,17 @@ along with GCC; see the file COPYING3.  If not see
 #include "cfgloop.h"
 #include "tree-data-ref.h"
 #include "tree-pretty-print.h"
+#include "gimple-pretty-print.h"
+#include "insn-config.h"
+#include "expr.h"
+#include "optabs.h"
 
+#ifndef HAVE_conditional_move
+#define HAVE_conditional_move (0)
+#endif
+
 static unsigned int tree_ssa_phiopt (void);
-static unsigned int tree_ssa_phiopt_worker (bool);
+static unsigned int tree_ssa_phiopt_worker (bool, bool);
 static bool conditional_replacement (basic_block, basic_block,
      edge, edge, gimple, tree, tree);
 static int value_replacement (basic_block, basic_block,
@@ -53,6 +61,9 @@ static bool cond_store_replacement (basic_block, b
 static bool cond_if_else_store_replacement (basic_block, basic_block, basic_block);
 static struct pointer_set_t * get_non_trapping (void);
 static void replace_phi_edge_with_variable (basic_block, edge, gimple, tree);
+static void hoist_adjacent_loads (basic_block, basic_block,
+  basic_block, basic_block);
+static bool gate_hoist_loads (void);
 
 /* This pass tries to replaces an if-then-else block with an
    assignment.  We have four kinds of transformations.  Some of these
@@ -138,12 +149,56 @@ static void replace_phi_edge_with_variable (basic_
      bb2:
        x = PHI <x' (bb0), ...>;
 
-   A similar transformation is done for MAX_EXPR.  */
+   A similar transformation is done for MAX_EXPR.
 
+
+   This pass also performs a fifth transformation of a slightly different
+   flavor.
+
+   Adjacent Load Hoisting
+   ----------------------
+  
+   This transformation replaces
+
+     bb0:
+       if (...) goto bb2; else goto bb1;
+     bb1:
+       x1 = (<expr>).field1;
+       goto bb3;
+     bb2:
+       x2 = (<expr>).field2;
+     bb3:
+       # x = PHI <x1, x2>;
+
+   with
+
+     bb0:
+       x1 = (<expr>).field1;
+       x2 = (<expr>).field2;
+       if (...) goto bb2; else goto bb1;
+     bb1:
+       goto bb3;
+     bb2:
+     bb3:
+       # x = PHI <x1, x2>;
+
+   The purpose of this transformation is to enable generation of conditional
+   move instructions such as Intel CMOVE or PowerPC ISEL.  Because one of
+   the loads is speculative, the transformation is restricted to very
+   specific cases to avoid introducing a page fault.  We are looking for
+   the common idiom:
+
+     if (...)
+       x = y->left;
+     else
+       x = y->right;
+
+   where left and right are typically adjacent pointers in a tree structure.  */
+
 static unsigned int
 tree_ssa_phiopt (void)
 {
-  return tree_ssa_phiopt_worker (false);
+  return tree_ssa_phiopt_worker (false, gate_hoist_loads ());
 }
 
 /* This pass tries to transform conditional stores into unconditional
@@ -190,7 +245,7 @@ tree_ssa_phiopt (void)
 static unsigned int
 tree_ssa_cs_elim (void)
 {
-  return tree_ssa_phiopt_worker (true);
+  return tree_ssa_phiopt_worker (true, false);
 }
 
 /* Return the singleton PHI in the SEQ of PHIs for edges E0 and E1. */
@@ -227,9 +282,11 @@ static tree condstoretemp;
 /* The core routine of conditional store replacement and normal
    phi optimizations.  Both share much of the infrastructure in how
    to match applicable basic block patterns.  DO_STORE_ELIM is true
-   when we want to do conditional store replacement, false otherwise.  */
+   when we want to do conditional store replacement, false otherwise.
+   DO_HOIST_LOADS is true when we want to hoist adjacent loads out
+   of diamond control flow patterns, false otherwise.  */
 static unsigned int
-tree_ssa_phiopt_worker (bool do_store_elim)
+tree_ssa_phiopt_worker (bool do_store_elim, bool do_hoist_loads)
 {
   basic_block bb;
   basic_block *bb_order;
@@ -312,6 +369,20 @@ static unsigned int
     cfgchanged = true;
   continue;
  }
+      else if (do_hoist_loads
+ && EDGE_SUCC (bb1, 0)->dest == EDGE_SUCC (bb2, 0)->dest)
+ {
+  basic_block bb3 = EDGE_SUCC (bb1, 0)->dest;
+
+  if (single_succ_p (bb1)
+      && single_succ_p (bb2)
+      && single_pred_p (bb1)
+      && single_pred_p (bb2)
+      && EDGE_COUNT (bb->succs) == 2
+      && EDGE_COUNT (bb3->preds) == 2)
+    hoist_adjacent_loads (bb, bb1, bb2, bb3);
+  continue;
+ }
       else
  continue;      
 
@@ -1707,6 +1778,220 @@ cond_if_else_store_replacement (basic_block then_b
   return ok;
 }
 
+/* Return TRUE iff EXPR contains any SSA names that are defined in BB.  */
+
+static bool
+local_reg_dependence (tree expr, basic_block bb)
+{
+  int i;
+
+  if (TREE_CODE (expr) == SSA_NAME)
+    {
+      if (SSA_NAME_IS_DEFAULT_DEF (expr))
+ return false;
+      else
+ return (gimple_bb (SSA_NAME_DEF_STMT (expr)) == bb);
+    }
+
+  for (i = 0; i < TREE_OPERAND_LENGTH (expr); i++)
+    if (TREE_OPERAND (expr, i)
+ && local_reg_dependence (TREE_OPERAND (expr, i), bb))
+      return true;
+
+  return false;
+}
+
+/* Return TRUE if STMT has a VUSE whose corresponding VDEF is in BB.  */
+
+static bool
+local_mem_dependence (gimple stmt, basic_block bb)
+{
+  tree vuse = gimple_vuse (stmt);
+  gimple def;
+
+  if (!vuse)
+    return false;
+
+  def = SSA_NAME_DEF_STMT (vuse);
+  return (def && gimple_bb (def) == bb);
+}
+
+/* Given a "diamond" control-flow pattern where BB0 tests a condition,
+   BB1 and BB2 are "then" and "else" blocks dependent on this test,
+   and BB3 rejoins control flow following BB1 and BB2, look for
+   opportunities to hoist loads as follows.  If BB3 contains a PHI of
+   two loads, one each occurring in BB1 and BB2, and the loads are
+   provably of adjacent fields in the same structure, then move both
+   loads into BB0.  Of course this can only be done if there are no
+   dependencies preventing such motion.
+
+   One of the hoisted loads will always be speculative, so the
+   transformation is currently conservative:
+
+    - The fields must be strictly adjacent.
+    - Hoisting is only done on aligned pointer fields.
+    - The two fields must occupy a single memory block that is
+      guaranteed to not cross a page boundary.
+
+    The last is difficult to prove, as such memory blocks should be
+    aligned on the minimum of the stack alignment boundary and the
+    alignment guaranteed by heap allocation interfaces.  Thus we rely
+    on a parameter for the alignment value.
+
+    Provided a good value is used for the last case, the first two
+    restrictions can probably be relaxed.  */
+
+static void
+hoist_adjacent_loads (basic_block bb0, basic_block bb1,
+      basic_block bb2, basic_block bb3)
+{
+  int param_align = PARAM_VALUE (PARAM_MIN_CMOVE_STRUCT_ALIGN);
+  gimple_stmt_iterator gsi;
+
+  /* Walk the phis in bb3 looking for an opportunity.  We are looking
+     for phis of two SSA names, one each of which is defined in bb1 and
+     bb2.  */
+  for (gsi = gsi_start_phis (bb3); !gsi_end_p (gsi); gsi_next (&gsi))
+    {
+      gimple phi_stmt = gsi_stmt (gsi);
+      gimple def1, def2;
+      tree arg1, arg2, ref1, ref2, field1, field2;
+      tree tree_offset1, tree_offset2, tree_size1, tree_size2;
+      int offset1, offset2, size1, size2, block_offset;
+      unsigned align1, align2;
+      gimple_stmt_iterator gsi2;
+
+      if (gimple_phi_num_args (phi_stmt) != 2)
+ continue;
+
+      arg1 = gimple_phi_arg_def (phi_stmt, 0);
+      arg2 = gimple_phi_arg_def (phi_stmt, 1);
+      
+      if (TREE_CODE (arg1) != SSA_NAME
+  || TREE_CODE (arg2) != SSA_NAME
+  || SSA_NAME_IS_DEFAULT_DEF (arg1)
+  || SSA_NAME_IS_DEFAULT_DEF (arg2)
+  /* FORNOW: Only do this optimization for pointer types.  */
+  || !POINTER_TYPE_P (TREE_TYPE (arg1)))
+ continue;
+
+      def1 = SSA_NAME_DEF_STMT (arg1);
+      def2 = SSA_NAME_DEF_STMT (arg2);
+
+      if ((gimple_bb (def1) != bb1 || gimple_bb (def2) != bb2)
+  && (gimple_bb (def2) != bb1 || gimple_bb (def1) != bb2))
+ continue;
+
+      /* Unless -fhoist-adjacent-loads was specified, check the mode
+ of the arguments to be sure a conditional move can be generated
+ for it.  */
+      if (flag_hoist_adjacent_loads != 1
+  && !optab_handler (cmov_optab, TYPE_MODE (TREE_TYPE (arg1))))
+ continue;
+
+      /* Both statements must be assignments whose RHS is a COMPONENT_REF.  */
+      if (!gimple_assign_single_p (def1)
+  || !gimple_assign_single_p (def2))
+ continue;
+
+      ref1 = gimple_assign_rhs1 (def1);
+      ref2 = gimple_assign_rhs1 (def2);
+
+      if (TREE_CODE (ref1) != COMPONENT_REF
+  || TREE_CODE (ref2) != COMPONENT_REF)
+ continue;
+
+      /* The zeroth operand of the two component references must be
+ identical.  It is not sufficient to compare get_base_address of
+ the two references, because this could allow for different
+ elements of the same array in the two trees.  It is not safe to
+ assume that the existence of one array element implies the
+ existence of a different one.  */
+      if (!operand_equal_p (TREE_OPERAND (ref1, 0), TREE_OPERAND (ref2, 0), 0))
+ continue;
+
+      field1 = TREE_OPERAND (ref1, 1);
+      field2 = TREE_OPERAND (ref2, 1);
+
+      tree_offset1 = DECL_FIELD_BIT_OFFSET (field1);
+      tree_offset2 = DECL_FIELD_BIT_OFFSET (field2);
+      tree_size1 = DECL_SIZE_UNIT (field1);
+      tree_size2 = DECL_SIZE_UNIT (field2);
+
+      if (TREE_CODE (tree_offset1) != INTEGER_CST
+  || TREE_CODE (tree_offset2) != INTEGER_CST
+  || TREE_CODE (tree_size1) != INTEGER_CST
+  || TREE_CODE (tree_size2) != INTEGER_CST)
+ continue;
+
+      /* FORNOW: The two field references must be byte-aligned, naturally
+ aligned within the record, and adjacent.  */
+      offset1 = TREE_INT_CST_LOW (tree_offset1);
+      offset2 = TREE_INT_CST_LOW (tree_offset2);
+      size1 = TREE_INT_CST_LOW (tree_size1);
+      size2 = TREE_INT_CST_LOW (tree_size2);
+      align1 = TYPE_ALIGN (TREE_TYPE (arg1));
+      align2 = TYPE_ALIGN (TREE_TYPE (arg2));
+
+      if (offset1 % BITS_PER_UNIT != 0
+  || offset2 % BITS_PER_UNIT != 0
+  || offset1 % align1 != 0
+  || offset2 % align2 != 0)
+ continue;
+
+      offset1 /= BITS_PER_UNIT;
+      offset2 /= BITS_PER_UNIT;
+
+      if (offset1 + size1 != offset2
+  && offset2 + size2 != offset1)
+ continue;
+
+      /* The two field references must fit within a block of memory that
+ is guaranteed to be on the same page.  */
+      block_offset = MIN (offset1, offset2) % param_align;
+
+      if (block_offset + size1 + size2 > param_align)
+ continue;
+
+      /* The two expressions cannot be dependent upon SSA names
+ or vdefs defined in bb1/bb2.  */
+      if (local_reg_dependence (TREE_OPERAND (ref1, 0), bb1)
+  || local_reg_dependence (TREE_OPERAND (ref2, 0), bb2)
+  || local_mem_dependence (def1, bb1)
+  || local_mem_dependence (def2, bb2))
+ continue;
+
+      /* The conditions are satisfied; hoist the loads from bb1 and bb2 into
+ bb0.  */
+      gsi2 = gsi_for_stmt (def1);
+      gsi_move_to_bb_end (&gsi2, bb0);
+      gsi2 = gsi_for_stmt (def2);
+      gsi_move_to_bb_end (&gsi2, bb0);
+
+      if (dump_file && (dump_flags & TDF_DETAILS))
+ {
+  fprintf (dump_file,
+   "\nHoisting adjacent loads from %d and %d into %d: \n",
+   bb1->index, bb2->index, bb0->index);
+  print_gimple_stmt (dump_file, def1, 0, TDF_VOPS|TDF_MEMSYMS);
+  print_gimple_stmt (dump_file, def2, 0, TDF_VOPS|TDF_MEMSYMS);
+ }
+    }
+}
+
+/* Determine whether we should attempt to hoist adjacent loads out of
+   diamond patterns in pass_phiopt.  Always hoist loads if
+   -fhoist-adjacent-loads is specified, or if the target machine has
+   a conditional move instruction and -fno-hoist-adjacent-loads is
+   not specified.  */
+
+static bool
+gate_hoist_loads (void)
+{
+  return (flag_hoist_adjacent_loads == 1
+  || (HAVE_conditional_move && flag_hoist_adjacent_loads != 0));
+}
+
 /* Always do these optimizations if we have SSA
    trees to work on.  */
 static bool
Index: gcc/common.opt
===================================================================
--- gcc/common.opt (revision 187057)
+++ gcc/common.opt (working copy)
@@ -1186,6 +1186,11 @@ fgraphite-identity
 Common Report Var(flag_graphite_identity) Optimization
 Enable Graphite Identity transformation
 
+fhoist-adjacent-loads
+Common Report Var(flag_hoist_adjacent_loads) Init(-1) Optimization
+Enable hoisting adjacent loads to encourage generating conditional move
+instructions
+
 floop-parallelize-all
 Common Report Var(flag_loop_parallelize_all) Optimization
 Mark all loops as parallel
Index: gcc/Makefile.in
===================================================================
--- gcc/Makefile.in (revision 187057)
+++ gcc/Makefile.in (working copy)
@@ -2351,7 +2351,8 @@ tree-ssa-phiopt.o : tree-ssa-phiopt.c $(CONFIG_H)
    $(TM_H) $(GGC_H) $(TREE_H) $(TM_P_H) $(BASIC_BLOCK_H) \
    $(TREE_FLOW_H) $(TREE_PASS_H) $(TREE_DUMP_H) langhooks.h $(FLAGS_H) \
    $(DIAGNOSTIC_H) $(TIMEVAR_H) pointer-set.h domwalk.h $(CFGLOOP_H) \
-   $(TREE_DATA_REF_H) tree-pretty-print.h
+   $(TREE_DATA_REF_H) tree-pretty-print.h gimple-pretty-print.h \
+   insn-config.h $(EXPR_H) $(OPTABS_H)
 tree-nrv.o : tree-nrv.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \
    $(TM_H) $(TREE_H) $(FUNCTION_H) $(BASIC_BLOCK_H) $(FLAGS_H) \
    $(DIAGNOSTIC_H) $(TREE_FLOW_H) $(TIMEVAR_H) $(TREE_DUMP_H) $(TREE_PASS_H) \
Index: gcc/params.def
===================================================================
--- gcc/params.def (revision 187057)
+++ gcc/params.def (working copy)
@@ -979,6 +979,13 @@ DEFPARAM (PARAM_MAX_TRACKED_STRLENS,
   "track string lengths",
   1000, 0, 0)
 
+/* For adjacent load hoisting transformation in tree-ssa-phiopt.c.  */
+DEFPARAM (PARAM_MIN_CMOVE_STRUCT_ALIGN,
+  "min-cmove-struct-align",
+  "Minimum byte alignment to assume for structures in the stack "
+  "or heap when considering load hoisting for conditional moves",
+  16, 8, 256)
+
 /*
 Local variables:
 mode:c



Re: [PATCH] Hoist adjacent pointer loads

by Jeff Law :: Rate this Message:

| View Threaded | Show Only this Message

On 05/03/2012 08:33 AM, William J. Schmidt wrote:

> This patch was posted for comment back in February during stage 4.  It
> addresses a performance issue noted in the EEMBC routelookup benchmark
> on a common idiom:
>
>    if (...)
>      x = y->left;
>    else
>      x = y->right;
>
> If the two loads can be hoisted out of the if/else, the if/else can be
> replaced by a conditional move instruction on architectures that support
> one.  Because this speculates one of the loads, the patch constrains the
> optimization to avoid introducing page faults.
>
> Bootstrapped and regression tested on powerpc-unknown-linux-gnu with no
> new failures.  The patch provides significant improvement to the
> routelookup benchmark, and is neutral on SPEC cpu2000/cpu2006.
>
> One question is what optimization level should be required for this.
> Because of the speculation, -O3 might be in order.  I don't believe
> -Ofast is required as there is no potential correctness issue involved.
> Right now the patch doesn't check the optimization level (like the rest
> of the phi-opt transforms), which is likely a poor choice.
Doesn't this need to be conditionalized on the memory model that's
currently active?

jeff

Re: [PATCH] Hoist adjacent pointer loads

by William J. Schmidt :: Rate this Message:

| View Threaded | Show Only this Message

On Thu, 2012-05-03 at 09:40 -0600, Jeff Law wrote:

> On 05/03/2012 08:33 AM, William J. Schmidt wrote:
> > This patch was posted for comment back in February during stage 4.  It
> > addresses a performance issue noted in the EEMBC routelookup benchmark
> > on a common idiom:
> >
> >    if (...)
> >      x = y->left;
> >    else
> >      x = y->right;
> >
> > If the two loads can be hoisted out of the if/else, the if/else can be
> > replaced by a conditional move instruction on architectures that support
> > one.  Because this speculates one of the loads, the patch constrains the
> > optimization to avoid introducing page faults.
> >
> > Bootstrapped and regression tested on powerpc-unknown-linux-gnu with no
> > new failures.  The patch provides significant improvement to the
> > routelookup benchmark, and is neutral on SPEC cpu2000/cpu2006.
> >
> > One question is what optimization level should be required for this.
> > Because of the speculation, -O3 might be in order.  I don't believe
> > -Ofast is required as there is no potential correctness issue involved.
> > Right now the patch doesn't check the optimization level (like the rest
> > of the phi-opt transforms), which is likely a poor choice.
> Doesn't this need to be conditionalized on the memory model that's
> currently active?
>
Yes and no.  What's important is that you don't want to introduce page
faults (or less urgently, cache misses) by speculating the load.  So the
patch is currently extremely constrained, and likely will always stay
that way.  Only fields that are pointers and that are strictly adjacent
are hoisted, and only if they're in the same 16-byte block.  (The number
16 is a parameter that can be adjusted.)

Hopefully I didn't miss your point -- let me know if I did and I'll try
again. :)

Thanks,
Bill

> jeff
>


Re: [PATCH] Hoist adjacent pointer loads

by Jeff Law :: Rate this Message:

| View Threaded | Show Only this Message

On 05/03/2012 10:47 AM, William J. Schmidt wrote:

>>
> Yes and no.  What's important is that you don't want to introduce page
> faults (or less urgently, cache misses) by speculating the load.  So the
> patch is currently extremely constrained, and likely will always stay
> that way.  Only fields that are pointers and that are strictly adjacent
> are hoisted, and only if they're in the same 16-byte block.  (The number
> 16 is a parameter that can be adjusted.)
>
> Hopefully I didn't miss your point -- let me know if I did and I'll try
> again. :)
You missed the point :-)

Under the C++11 memory model you can't introduce new data races on
objects which might be visible to multiple threads.  This requirement
can restrict speculation in many cases.  Furthermore, it sounds like C11
will have similar constraints.

I believe there's a wiki page which touches on these kinds of issues.

That doesn't mean we can't ever do the optimization, just that we have
to be more careful than we have in the past when mucking around with
memory optimizations.

jeff

Re: [PATCH] Hoist adjacent pointer loads

by William J. Schmidt :: Rate this Message:

| View Threaded | Show Only this Message



On Thu, 2012-05-03 at 11:44 -0600, Jeff Law wrote:

> On 05/03/2012 10:47 AM, William J. Schmidt wrote:
> >>
> > Yes and no.  What's important is that you don't want to introduce page
> > faults (or less urgently, cache misses) by speculating the load.  So the
> > patch is currently extremely constrained, and likely will always stay
> > that way.  Only fields that are pointers and that are strictly adjacent
> > are hoisted, and only if they're in the same 16-byte block.  (The number
> > 16 is a parameter that can be adjusted.)
> >
> > Hopefully I didn't miss your point -- let me know if I did and I'll try
> > again. :)
> You missed the point :-)
>
> Under the C++11 memory model you can't introduce new data races on
> objects which might be visible to multiple threads.  This requirement
> can restrict speculation in many cases.  Furthermore, it sounds like C11
> will have similar constraints.
>
> I believe there's a wiki page which touches on these kinds of issues.
>
> That doesn't mean we can't ever do the optimization, just that we have
> to be more careful than we have in the past when mucking around with
> memory optimizations.

OK, thanks!  Looks like I have some reading to do about the new memory
models.

However, from the wiki page I see:  "A speculative load which has its
results thrown away are considered to not have changed the semantics of
the program, and are therefore allowed."  That seems to cover the case
here: the load is hoisted, but if the path where it was originally
loaded is not executed, its result is discarded.

If needed, though, what flags/detection mechanisms are available for
determining that the load speculation should be disabled?

Thanks,
Bill
>
> jeff
>


Re: [PATCH] Hoist adjacent pointer loads

by Jeff Law :: Rate this Message:

| View Threaded | Show Only this Message

On 05/03/2012 12:11 PM, William J. Schmidt wrote:
>
>
> However, from the wiki page I see:  "A speculative load which has its
> results thrown away are considered to not have changed the semantics of
> the program, and are therefore allowed."  That seems to cover the case
> here: the load is hoisted, but if the path where it was originally
> loaded is not executed, its result is discarded.
Ah, yes, I remember that discussion now.  Ignore my concerns :-)


>
> If needed, though, what flags/detection mechanisms are available for
> determining that the load speculation should be disabled?
I think the flags are something like -fallow-load-data-races
-fallow-store-data-races and the like.

jeff

Ping: [PATCH] Hoist adjacent pointer loads

by William J. Schmidt :: Rate this Message:

| View Threaded | Show Only this Message

Ping.

Thanks,
Bill

On Thu, 2012-05-03 at 09:33 -0500, William J. Schmidt wrote:

> This patch was posted for comment back in February during stage 4.  It
> addresses a performance issue noted in the EEMBC routelookup benchmark
> on a common idiom:
>
>   if (...)
>     x = y->left;
>   else
>     x = y->right;
>
> If the two loads can be hoisted out of the if/else, the if/else can be
> replaced by a conditional move instruction on architectures that support
> one.  Because this speculates one of the loads, the patch constrains the
> optimization to avoid introducing page faults.
>
> Bootstrapped and regression tested on powerpc-unknown-linux-gnu with no
> new failures.  The patch provides significant improvement to the
> routelookup benchmark, and is neutral on SPEC cpu2000/cpu2006.
>
> One question is what optimization level should be required for this.
> Because of the speculation, -O3 might be in order.  I don't believe
> -Ofast is required as there is no potential correctness issue involved.
> Right now the patch doesn't check the optimization level (like the rest
> of the phi-opt transforms), which is likely a poor choice.
>
> Ok for trunk?
>
> Thanks,
> Bill
>
>
> 2012-05-03  Bill Schmidt  <wschmidt@...>
>
> * tree-ssa-phiopt.c (tree_ssa_phiopt_worker): Add argument to forward
> declaration.
> (hoist_adjacent_loads, gate_hoist_loads): New forward declarations.
> (tree_ssa_phiopt): Call gate_hoist_loads.
> (tree_ssa_cs_elim): Add parm to tree_ssa_phiopt_worker call.
> (tree_ssa_phiopt_worker): Add do_hoist_loads to formal arg list; call
> hoist_adjacent_loads.
> (local_reg_dependence): New function.
> (local_mem_dependence): Likewise.
> (hoist_adjacent_loads): Likewise.
> (gate_hoist_loads): Likewise.
> * common.opt (fhoist-adjacent-loads): New switch.
> * Makefile.in (tree-ssa-phiopt.o): Added dependencies.
> * params.def (PARAM_MIN_CMOVE_STRUCT_ALIGN): New param.
>
>
> Index: gcc/tree-ssa-phiopt.c
> ===================================================================
> --- gcc/tree-ssa-phiopt.c (revision 187057)
> +++ gcc/tree-ssa-phiopt.c (working copy)
> @@ -37,9 +37,17 @@ along with GCC; see the file COPYING3.  If not see
>  #include "cfgloop.h"
>  #include "tree-data-ref.h"
>  #include "tree-pretty-print.h"
> +#include "gimple-pretty-print.h"
> +#include "insn-config.h"
> +#include "expr.h"
> +#include "optabs.h"
>
> +#ifndef HAVE_conditional_move
> +#define HAVE_conditional_move (0)
> +#endif
> +
>  static unsigned int tree_ssa_phiopt (void);
> -static unsigned int tree_ssa_phiopt_worker (bool);
> +static unsigned int tree_ssa_phiopt_worker (bool, bool);
>  static bool conditional_replacement (basic_block, basic_block,
>       edge, edge, gimple, tree, tree);
>  static int value_replacement (basic_block, basic_block,
> @@ -53,6 +61,9 @@ static bool cond_store_replacement (basic_block, b
>  static bool cond_if_else_store_replacement (basic_block, basic_block, basic_block);
>  static struct pointer_set_t * get_non_trapping (void);
>  static void replace_phi_edge_with_variable (basic_block, edge, gimple, tree);
> +static void hoist_adjacent_loads (basic_block, basic_block,
> +  basic_block, basic_block);
> +static bool gate_hoist_loads (void);
>
>  /* This pass tries to replaces an if-then-else block with an
>     assignment.  We have four kinds of transformations.  Some of these
> @@ -138,12 +149,56 @@ static void replace_phi_edge_with_variable (basic_
>       bb2:
>         x = PHI <x' (bb0), ...>;
>
> -   A similar transformation is done for MAX_EXPR.  */
> +   A similar transformation is done for MAX_EXPR.
>
> +
> +   This pass also performs a fifth transformation of a slightly different
> +   flavor.
> +
> +   Adjacent Load Hoisting
> +   ----------------------
> +  
> +   This transformation replaces
> +
> +     bb0:
> +       if (...) goto bb2; else goto bb1;
> +     bb1:
> +       x1 = (<expr>).field1;
> +       goto bb3;
> +     bb2:
> +       x2 = (<expr>).field2;
> +     bb3:
> +       # x = PHI <x1, x2>;
> +
> +   with
> +
> +     bb0:
> +       x1 = (<expr>).field1;
> +       x2 = (<expr>).field2;
> +       if (...) goto bb2; else goto bb1;
> +     bb1:
> +       goto bb3;
> +     bb2:
> +     bb3:
> +       # x = PHI <x1, x2>;
> +
> +   The purpose of this transformation is to enable generation of conditional
> +   move instructions such as Intel CMOVE or PowerPC ISEL.  Because one of
> +   the loads is speculative, the transformation is restricted to very
> +   specific cases to avoid introducing a page fault.  We are looking for
> +   the common idiom:
> +
> +     if (...)
> +       x = y->left;
> +     else
> +       x = y->right;
> +
> +   where left and right are typically adjacent pointers in a tree structure.  */
> +
>  static unsigned int
>  tree_ssa_phiopt (void)
>  {
> -  return tree_ssa_phiopt_worker (false);
> +  return tree_ssa_phiopt_worker (false, gate_hoist_loads ());
>  }
>
>  /* This pass tries to transform conditional stores into unconditional
> @@ -190,7 +245,7 @@ tree_ssa_phiopt (void)
>  static unsigned int
>  tree_ssa_cs_elim (void)
>  {
> -  return tree_ssa_phiopt_worker (true);
> +  return tree_ssa_phiopt_worker (true, false);
>  }
>
>  /* Return the singleton PHI in the SEQ of PHIs for edges E0 and E1. */
> @@ -227,9 +282,11 @@ static tree condstoretemp;
>  /* The core routine of conditional store replacement and normal
>     phi optimizations.  Both share much of the infrastructure in how
>     to match applicable basic block patterns.  DO_STORE_ELIM is true
> -   when we want to do conditional store replacement, false otherwise.  */
> +   when we want to do conditional store replacement, false otherwise.
> +   DO_HOIST_LOADS is true when we want to hoist adjacent loads out
> +   of diamond control flow patterns, false otherwise.  */
>  static unsigned int
> -tree_ssa_phiopt_worker (bool do_store_elim)
> +tree_ssa_phiopt_worker (bool do_store_elim, bool do_hoist_loads)
>  {
>    basic_block bb;
>    basic_block *bb_order;
> @@ -312,6 +369,20 @@ static unsigned int
>      cfgchanged = true;
>    continue;
>   }
> +      else if (do_hoist_loads
> + && EDGE_SUCC (bb1, 0)->dest == EDGE_SUCC (bb2, 0)->dest)
> + {
> +  basic_block bb3 = EDGE_SUCC (bb1, 0)->dest;
> +
> +  if (single_succ_p (bb1)
> +      && single_succ_p (bb2)
> +      && single_pred_p (bb1)
> +      && single_pred_p (bb2)
> +      && EDGE_COUNT (bb->succs) == 2
> +      && EDGE_COUNT (bb3->preds) == 2)
> +    hoist_adjacent_loads (bb, bb1, bb2, bb3);
> +  continue;
> + }
>        else
>   continue;      
>
> @@ -1707,6 +1778,220 @@ cond_if_else_store_replacement (basic_block then_b
>    return ok;
>  }
>
> +/* Return TRUE iff EXPR contains any SSA names that are defined in BB.  */
> +
> +static bool
> +local_reg_dependence (tree expr, basic_block bb)
> +{
> +  int i;
> +
> +  if (TREE_CODE (expr) == SSA_NAME)
> +    {
> +      if (SSA_NAME_IS_DEFAULT_DEF (expr))
> + return false;
> +      else
> + return (gimple_bb (SSA_NAME_DEF_STMT (expr)) == bb);
> +    }
> +
> +  for (i = 0; i < TREE_OPERAND_LENGTH (expr); i++)
> +    if (TREE_OPERAND (expr, i)
> + && local_reg_dependence (TREE_OPERAND (expr, i), bb))
> +      return true;
> +
> +  return false;
> +}
> +
> +/* Return TRUE if STMT has a VUSE whose corresponding VDEF is in BB.  */
> +
> +static bool
> +local_mem_dependence (gimple stmt, basic_block bb)
> +{
> +  tree vuse = gimple_vuse (stmt);
> +  gimple def;
> +
> +  if (!vuse)
> +    return false;
> +
> +  def = SSA_NAME_DEF_STMT (vuse);
> +  return (def && gimple_bb (def) == bb);
> +}
> +
> +/* Given a "diamond" control-flow pattern where BB0 tests a condition,
> +   BB1 and BB2 are "then" and "else" blocks dependent on this test,
> +   and BB3 rejoins control flow following BB1 and BB2, look for
> +   opportunities to hoist loads as follows.  If BB3 contains a PHI of
> +   two loads, one each occurring in BB1 and BB2, and the loads are
> +   provably of adjacent fields in the same structure, then move both
> +   loads into BB0.  Of course this can only be done if there are no
> +   dependencies preventing such motion.
> +
> +   One of the hoisted loads will always be speculative, so the
> +   transformation is currently conservative:
> +
> +    - The fields must be strictly adjacent.
> +    - Hoisting is only done on aligned pointer fields.
> +    - The two fields must occupy a single memory block that is
> +      guaranteed to not cross a page boundary.
> +
> +    The last is difficult to prove, as such memory blocks should be
> +    aligned on the minimum of the stack alignment boundary and the
> +    alignment guaranteed by heap allocation interfaces.  Thus we rely
> +    on a parameter for the alignment value.
> +
> +    Provided a good value is used for the last case, the first two
> +    restrictions can probably be relaxed.  */
> +
> +static void
> +hoist_adjacent_loads (basic_block bb0, basic_block bb1,
> +      basic_block bb2, basic_block bb3)
> +{
> +  int param_align = PARAM_VALUE (PARAM_MIN_CMOVE_STRUCT_ALIGN);
> +  gimple_stmt_iterator gsi;
> +
> +  /* Walk the phis in bb3 looking for an opportunity.  We are looking
> +     for phis of two SSA names, one each of which is defined in bb1 and
> +     bb2.  */
> +  for (gsi = gsi_start_phis (bb3); !gsi_end_p (gsi); gsi_next (&gsi))
> +    {
> +      gimple phi_stmt = gsi_stmt (gsi);
> +      gimple def1, def2;
> +      tree arg1, arg2, ref1, ref2, field1, field2;
> +      tree tree_offset1, tree_offset2, tree_size1, tree_size2;
> +      int offset1, offset2, size1, size2, block_offset;
> +      unsigned align1, align2;
> +      gimple_stmt_iterator gsi2;
> +
> +      if (gimple_phi_num_args (phi_stmt) != 2)
> + continue;
> +
> +      arg1 = gimple_phi_arg_def (phi_stmt, 0);
> +      arg2 = gimple_phi_arg_def (phi_stmt, 1);
> +      
> +      if (TREE_CODE (arg1) != SSA_NAME
> +  || TREE_CODE (arg2) != SSA_NAME
> +  || SSA_NAME_IS_DEFAULT_DEF (arg1)
> +  || SSA_NAME_IS_DEFAULT_DEF (arg2)
> +  /* FORNOW: Only do this optimization for pointer types.  */
> +  || !POINTER_TYPE_P (TREE_TYPE (arg1)))
> + continue;
> +
> +      def1 = SSA_NAME_DEF_STMT (arg1);
> +      def2 = SSA_NAME_DEF_STMT (arg2);
> +
> +      if ((gimple_bb (def1) != bb1 || gimple_bb (def2) != bb2)
> +  && (gimple_bb (def2) != bb1 || gimple_bb (def1) != bb2))
> + continue;
> +
> +      /* Unless -fhoist-adjacent-loads was specified, check the mode
> + of the arguments to be sure a conditional move can be generated
> + for it.  */
> +      if (flag_hoist_adjacent_loads != 1
> +  && !optab_handler (cmov_optab, TYPE_MODE (TREE_TYPE (arg1))))
> + continue;
> +
> +      /* Both statements must be assignments whose RHS is a COMPONENT_REF.  */
> +      if (!gimple_assign_single_p (def1)
> +  || !gimple_assign_single_p (def2))
> + continue;
> +
> +      ref1 = gimple_assign_rhs1 (def1);
> +      ref2 = gimple_assign_rhs1 (def2);
> +
> +      if (TREE_CODE (ref1) != COMPONENT_REF
> +  || TREE_CODE (ref2) != COMPONENT_REF)
> + continue;
> +
> +      /* The zeroth operand of the two component references must be
> + identical.  It is not sufficient to compare get_base_address of
> + the two references, because this could allow for different
> + elements of the same array in the two trees.  It is not safe to
> + assume that the existence of one array element implies the
> + existence of a different one.  */
> +      if (!operand_equal_p (TREE_OPERAND (ref1, 0), TREE_OPERAND (ref2, 0), 0))
> + continue;
> +
> +      field1 = TREE_OPERAND (ref1, 1);
> +      field2 = TREE_OPERAND (ref2, 1);
> +
> +      tree_offset1 = DECL_FIELD_BIT_OFFSET (field1);
> +      tree_offset2 = DECL_FIELD_BIT_OFFSET (field2);
> +      tree_size1 = DECL_SIZE_UNIT (field1);
> +      tree_size2 = DECL_SIZE_UNIT (field2);
> +
> +      if (TREE_CODE (tree_offset1) != INTEGER_CST
> +  || TREE_CODE (tree_offset2) != INTEGER_CST
> +  || TREE_CODE (tree_size1) != INTEGER_CST
> +  || TREE_CODE (tree_size2) != INTEGER_CST)
> + continue;
> +
> +      /* FORNOW: The two field references must be byte-aligned, naturally
> + aligned within the record, and adjacent.  */
> +      offset1 = TREE_INT_CST_LOW (tree_offset1);
> +      offset2 = TREE_INT_CST_LOW (tree_offset2);
> +      size1 = TREE_INT_CST_LOW (tree_size1);
> +      size2 = TREE_INT_CST_LOW (tree_size2);
> +      align1 = TYPE_ALIGN (TREE_TYPE (arg1));
> +      align2 = TYPE_ALIGN (TREE_TYPE (arg2));
> +
> +      if (offset1 % BITS_PER_UNIT != 0
> +  || offset2 % BITS_PER_UNIT != 0
> +  || offset1 % align1 != 0
> +  || offset2 % align2 != 0)
> + continue;
> +
> +      offset1 /= BITS_PER_UNIT;
> +      offset2 /= BITS_PER_UNIT;
> +
> +      if (offset1 + size1 != offset2
> +  && offset2 + size2 != offset1)
> + continue;
> +
> +      /* The two field references must fit within a block of memory that
> + is guaranteed to be on the same page.  */
> +      block_offset = MIN (offset1, offset2) % param_align;
> +
> +      if (block_offset + size1 + size2 > param_align)
> + continue;
> +
> +      /* The two expressions cannot be dependent upon SSA names
> + or vdefs defined in bb1/bb2.  */
> +      if (local_reg_dependence (TREE_OPERAND (ref1, 0), bb1)
> +  || local_reg_dependence (TREE_OPERAND (ref2, 0), bb2)
> +  || local_mem_dependence (def1, bb1)
> +  || local_mem_dependence (def2, bb2))
> + continue;
> +
> +      /* The conditions are satisfied; hoist the loads from bb1 and bb2 into
> + bb0.  */
> +      gsi2 = gsi_for_stmt (def1);
> +      gsi_move_to_bb_end (&gsi2, bb0);
> +      gsi2 = gsi_for_stmt (def2);
> +      gsi_move_to_bb_end (&gsi2, bb0);
> +
> +      if (dump_file && (dump_flags & TDF_DETAILS))
> + {
> +  fprintf (dump_file,
> +   "\nHoisting adjacent loads from %d and %d into %d: \n",
> +   bb1->index, bb2->index, bb0->index);
> +  print_gimple_stmt (dump_file, def1, 0, TDF_VOPS|TDF_MEMSYMS);
> +  print_gimple_stmt (dump_file, def2, 0, TDF_VOPS|TDF_MEMSYMS);
> + }
> +    }
> +}
> +
> +/* Determine whether we should attempt to hoist adjacent loads out of
> +   diamond patterns in pass_phiopt.  Always hoist loads if
> +   -fhoist-adjacent-loads is specified, or if the target machine has
> +   a conditional move instruction and -fno-hoist-adjacent-loads is
> +   not specified.  */
> +
> +static bool
> +gate_hoist_loads (void)
> +{
> +  return (flag_hoist_adjacent_loads == 1
> +  || (HAVE_conditional_move && flag_hoist_adjacent_loads != 0));
> +}
> +
>  /* Always do these optimizations if we have SSA
>     trees to work on.  */
>  static bool
> Index: gcc/common.opt
> ===================================================================
> --- gcc/common.opt (revision 187057)
> +++ gcc/common.opt (working copy)
> @@ -1186,6 +1186,11 @@ fgraphite-identity
>  Common Report Var(flag_graphite_identity) Optimization
>  Enable Graphite Identity transformation
>
> +fhoist-adjacent-loads
> +Common Report Var(flag_hoist_adjacent_loads) Init(-1) Optimization
> +Enable hoisting adjacent loads to encourage generating conditional move
> +instructions
> +
>  floop-parallelize-all
>  Common Report Var(flag_loop_parallelize_all) Optimization
>  Mark all loops as parallel
> Index: gcc/Makefile.in
> ===================================================================
> --- gcc/Makefile.in (revision 187057)
> +++ gcc/Makefile.in (working copy)
> @@ -2351,7 +2351,8 @@ tree-ssa-phiopt.o : tree-ssa-phiopt.c $(CONFIG_H)
>     $(TM_H) $(GGC_H) $(TREE_H) $(TM_P_H) $(BASIC_BLOCK_H) \
>     $(TREE_FLOW_H) $(TREE_PASS_H) $(TREE_DUMP_H) langhooks.h $(FLAGS_H) \
>     $(DIAGNOSTIC_H) $(TIMEVAR_H) pointer-set.h domwalk.h $(CFGLOOP_H) \
> -   $(TREE_DATA_REF_H) tree-pretty-print.h
> +   $(TREE_DATA_REF_H) tree-pretty-print.h gimple-pretty-print.h \
> +   insn-config.h $(EXPR_H) $(OPTABS_H)
>  tree-nrv.o : tree-nrv.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \
>     $(TM_H) $(TREE_H) $(FUNCTION_H) $(BASIC_BLOCK_H) $(FLAGS_H) \
>     $(DIAGNOSTIC_H) $(TREE_FLOW_H) $(TIMEVAR_H) $(TREE_DUMP_H) $(TREE_PASS_H) \
> Index: gcc/params.def
> ===================================================================
> --- gcc/params.def (revision 187057)
> +++ gcc/params.def (working copy)
> @@ -979,6 +979,13 @@ DEFPARAM (PARAM_MAX_TRACKED_STRLENS,
>    "track string lengths",
>    1000, 0, 0)
>
> +/* For adjacent load hoisting transformation in tree-ssa-phiopt.c.  */
> +DEFPARAM (PARAM_MIN_CMOVE_STRUCT_ALIGN,
> +  "min-cmove-struct-align",
> +  "Minimum byte alignment to assume for structures in the stack "
> +  "or heap when considering load hoisting for conditional moves",
> +  16, 8, 256)
> +
>  /*
>  Local variables:
>  mode:c
>
>


Re: [PATCH] Hoist adjacent pointer loads

by Richard Guenther-2 :: Rate this Message:

| View Threaded | Show Only this Message

On Thu, May 3, 2012 at 4:33 PM, William J. Schmidt
<wschmidt@...> wrote:

> This patch was posted for comment back in February during stage 4.  It
> addresses a performance issue noted in the EEMBC routelookup benchmark
> on a common idiom:
>
>  if (...)
>    x = y->left;
>  else
>    x = y->right;
>
> If the two loads can be hoisted out of the if/else, the if/else can be
> replaced by a conditional move instruction on architectures that support
> one.  Because this speculates one of the loads, the patch constrains the
> optimization to avoid introducing page faults.
>
> Bootstrapped and regression tested on powerpc-unknown-linux-gnu with no
> new failures.  The patch provides significant improvement to the
> routelookup benchmark, and is neutral on SPEC cpu2000/cpu2006.
>
> One question is what optimization level should be required for this.
> Because of the speculation, -O3 might be in order.  I don't believe
> -Ofast is required as there is no potential correctness issue involved.
> Right now the patch doesn't check the optimization level (like the rest
> of the phi-opt transforms), which is likely a poor choice.
>
> Ok for trunk?
>
> Thanks,
> Bill
>
>
> 2012-05-03  Bill Schmidt  <wschmidt@...>
>
>        * tree-ssa-phiopt.c (tree_ssa_phiopt_worker): Add argument to forward
>        declaration.
>        (hoist_adjacent_loads, gate_hoist_loads): New forward declarations.
>        (tree_ssa_phiopt): Call gate_hoist_loads.
>        (tree_ssa_cs_elim): Add parm to tree_ssa_phiopt_worker call.
>        (tree_ssa_phiopt_worker): Add do_hoist_loads to formal arg list; call
>        hoist_adjacent_loads.
>        (local_reg_dependence): New function.
>        (local_mem_dependence): Likewise.
>        (hoist_adjacent_loads): Likewise.
>        (gate_hoist_loads): Likewise.
>        * common.opt (fhoist-adjacent-loads): New switch.
>        * Makefile.in (tree-ssa-phiopt.o): Added dependencies.
>        * params.def (PARAM_MIN_CMOVE_STRUCT_ALIGN): New param.
>
>
> Index: gcc/tree-ssa-phiopt.c
> ===================================================================
> --- gcc/tree-ssa-phiopt.c       (revision 187057)
> +++ gcc/tree-ssa-phiopt.c       (working copy)
> @@ -37,9 +37,17 @@ along with GCC; see the file COPYING3.  If not see
>  #include "cfgloop.h"
>  #include "tree-data-ref.h"
>  #include "tree-pretty-print.h"
> +#include "gimple-pretty-print.h"
> +#include "insn-config.h"
> +#include "expr.h"
> +#include "optabs.h"
>
> +#ifndef HAVE_conditional_move
> +#define HAVE_conditional_move (0)
> +#endif
> +
>  static unsigned int tree_ssa_phiopt (void);
> -static unsigned int tree_ssa_phiopt_worker (bool);
> +static unsigned int tree_ssa_phiopt_worker (bool, bool);
>  static bool conditional_replacement (basic_block, basic_block,
>                                     edge, edge, gimple, tree, tree);
>  static int value_replacement (basic_block, basic_block,
> @@ -53,6 +61,9 @@ static bool cond_store_replacement (basic_block, b
>  static bool cond_if_else_store_replacement (basic_block, basic_block, basic_block);
>  static struct pointer_set_t * get_non_trapping (void);
>  static void replace_phi_edge_with_variable (basic_block, edge, gimple, tree);
> +static void hoist_adjacent_loads (basic_block, basic_block,
> +                                 basic_block, basic_block);
> +static bool gate_hoist_loads (void);
>
>  /* This pass tries to replaces an if-then-else block with an
>    assignment.  We have four kinds of transformations.  Some of these
> @@ -138,12 +149,56 @@ static void replace_phi_edge_with_variable (basic_
>      bb2:
>        x = PHI <x' (bb0), ...>;
>
> -   A similar transformation is done for MAX_EXPR.  */
> +   A similar transformation is done for MAX_EXPR.
>
> +
> +   This pass also performs a fifth transformation of a slightly different
> +   flavor.
> +
> +   Adjacent Load Hoisting
> +   ----------------------
> +
> +   This transformation replaces
> +
> +     bb0:
> +       if (...) goto bb2; else goto bb1;
> +     bb1:
> +       x1 = (<expr>).field1;
> +       goto bb3;
> +     bb2:
> +       x2 = (<expr>).field2;
> +     bb3:
> +       # x = PHI <x1, x2>;
> +
> +   with
> +
> +     bb0:
> +       x1 = (<expr>).field1;
> +       x2 = (<expr>).field2;
> +       if (...) goto bb2; else goto bb1;
> +     bb1:
> +       goto bb3;
> +     bb2:
> +     bb3:
> +       # x = PHI <x1, x2>;
> +
> +   The purpose of this transformation is to enable generation of conditional
> +   move instructions such as Intel CMOVE or PowerPC ISEL.  Because one of
> +   the loads is speculative, the transformation is restricted to very
> +   specific cases to avoid introducing a page fault.  We are looking for
> +   the common idiom:
> +
> +     if (...)
> +       x = y->left;
> +     else
> +       x = y->right;
> +
> +   where left and right are typically adjacent pointers in a tree structure.  */
> +
>  static unsigned int
>  tree_ssa_phiopt (void)
>  {
> -  return tree_ssa_phiopt_worker (false);
> +  return tree_ssa_phiopt_worker (false, gate_hoist_loads ());
>  }
>
>  /* This pass tries to transform conditional stores into unconditional
> @@ -190,7 +245,7 @@ tree_ssa_phiopt (void)
>  static unsigned int
>  tree_ssa_cs_elim (void)
>  {
> -  return tree_ssa_phiopt_worker (true);
> +  return tree_ssa_phiopt_worker (true, false);
>  }
>
>  /* Return the singleton PHI in the SEQ of PHIs for edges E0 and E1. */
> @@ -227,9 +282,11 @@ static tree condstoretemp;
>  /* The core routine of conditional store replacement and normal
>    phi optimizations.  Both share much of the infrastructure in how
>    to match applicable basic block patterns.  DO_STORE_ELIM is true
> -   when we want to do conditional store replacement, false otherwise.  */
> +   when we want to do conditional store replacement, false otherwise.
> +   DO_HOIST_LOADS is true when we want to hoist adjacent loads out
> +   of diamond control flow patterns, false otherwise.  */
>  static unsigned int
> -tree_ssa_phiopt_worker (bool do_store_elim)
> +tree_ssa_phiopt_worker (bool do_store_elim, bool do_hoist_loads)
>  {
>   basic_block bb;
>   basic_block *bb_order;
> @@ -312,6 +369,20 @@ static unsigned int
>            cfgchanged = true;
>          continue;
>        }
> +      else if (do_hoist_loads
> +                && EDGE_SUCC (bb1, 0)->dest == EDGE_SUCC (bb2, 0)->dest)
> +       {
> +         basic_block bb3 = EDGE_SUCC (bb1, 0)->dest;
> +
> +         if (single_succ_p (bb1)
> +             && single_succ_p (bb2)
> +             && single_pred_p (bb1)
> +             && single_pred_p (bb2)

and same pred?

> +             && EDGE_COUNT (bb->succs) == 2
> +             && EDGE_COUNT (bb3->preds) == 2)
> +           hoist_adjacent_loads (bb, bb1, bb2, bb3);
> +         continue;
> +       }
>       else
>        continue;
>
> @@ -1707,6 +1778,220 @@ cond_if_else_store_replacement (basic_block then_b
>   return ok;
>  }
>
> +/* Return TRUE iff EXPR contains any SSA names that are defined in BB.  */
> +
> +static bool
> +local_reg_dependence (tree expr, basic_block bb)
> +{
> +  int i;
> +
> +  if (TREE_CODE (expr) == SSA_NAME)
> +    {
> +      if (SSA_NAME_IS_DEFAULT_DEF (expr))
> +       return false;
> +      else
> +       return (gimple_bb (SSA_NAME_DEF_STMT (expr)) == bb);
> +    }
> +
> +  for (i = 0; i < TREE_OPERAND_LENGTH (expr); i++)
> +    if (TREE_OPERAND (expr, i)
> +       && local_reg_dependence (TREE_OPERAND (expr, i), bb))
> +      return true;

This looks odd - what are you trying to catch here?

> +  return false;
> +}
> +
> +/* Return TRUE if STMT has a VUSE whose corresponding VDEF is in BB.  */
> +
> +static bool
> +local_mem_dependence (gimple stmt, basic_block bb)
> +{
> +  tree vuse = gimple_vuse (stmt);
> +  gimple def;
> +
> +  if (!vuse)
> +    return false;
> +
> +  def = SSA_NAME_DEF_STMT (vuse);
> +  return (def && gimple_bb (def) == bb);
> +}
> +
> +/* Given a "diamond" control-flow pattern where BB0 tests a condition,
> +   BB1 and BB2 are "then" and "else" blocks dependent on this test,
> +   and BB3 rejoins control flow following BB1 and BB2, look for
> +   opportunities to hoist loads as follows.  If BB3 contains a PHI of
> +   two loads, one each occurring in BB1 and BB2, and the loads are
> +   provably of adjacent fields in the same structure, then move both
> +   loads into BB0.  Of course this can only be done if there are no
> +   dependencies preventing such motion.
> +
> +   One of the hoisted loads will always be speculative, so the
> +   transformation is currently conservative:
> +
> +    - The fields must be strictly adjacent.
> +    - Hoisting is only done on aligned pointer fields.
> +    - The two fields must occupy a single memory block that is
> +      guaranteed to not cross a page boundary.
> +
> +    The last is difficult to prove, as such memory blocks should be
> +    aligned on the minimum of the stack alignment boundary and the
> +    alignment guaranteed by heap allocation interfaces.  Thus we rely
> +    on a parameter for the alignment value.
> +
> +    Provided a good value is used for the last case, the first two
> +    restrictions can probably be relaxed.  */
> +
> +static void
> +hoist_adjacent_loads (basic_block bb0, basic_block bb1,
> +                     basic_block bb2, basic_block bb3)
> +{
> +  int param_align = PARAM_VALUE (PARAM_MIN_CMOVE_STRUCT_ALIGN);
> +  gimple_stmt_iterator gsi;
> +
> +  /* Walk the phis in bb3 looking for an opportunity.  We are looking
> +     for phis of two SSA names, one each of which is defined in bb1 and
> +     bb2.  */
> +  for (gsi = gsi_start_phis (bb3); !gsi_end_p (gsi); gsi_next (&gsi))
> +    {
> +      gimple phi_stmt = gsi_stmt (gsi);
> +      gimple def1, def2;
> +      tree arg1, arg2, ref1, ref2, field1, field2;
> +      tree tree_offset1, tree_offset2, tree_size1, tree_size2;
> +      int offset1, offset2, size1, size2, block_offset;
> +      unsigned align1, align2;
> +      gimple_stmt_iterator gsi2;
> +
> +      if (gimple_phi_num_args (phi_stmt) != 2)
> +       continue;
> +
> +      arg1 = gimple_phi_arg_def (phi_stmt, 0);
> +      arg2 = gimple_phi_arg_def (phi_stmt, 1);
> +
> +      if (TREE_CODE (arg1) != SSA_NAME
> +         || TREE_CODE (arg2) != SSA_NAME
> +         || SSA_NAME_IS_DEFAULT_DEF (arg1)
> +         || SSA_NAME_IS_DEFAULT_DEF (arg2)
> +         /* FORNOW: Only do this optimization for pointer types.  */
> +         || !POINTER_TYPE_P (TREE_TYPE (arg1)))

I'd like to see it enabled generally, not only for pointer types.

> +       continue;
> +
> +      def1 = SSA_NAME_DEF_STMT (arg1);
> +      def2 = SSA_NAME_DEF_STMT (arg2);
> +
> +      if ((gimple_bb (def1) != bb1 || gimple_bb (def2) != bb2)
> +         && (gimple_bb (def2) != bb1 || gimple_bb (def1) != bb2))
> +       continue;
> +
> +      /* Unless -fhoist-adjacent-loads was specified, check the mode
> +        of the arguments to be sure a conditional move can be generated
> +        for it.  */
> +      if (flag_hoist_adjacent_loads != 1
> +         && !optab_handler (cmov_optab, TYPE_MODE (TREE_TYPE (arg1))))
> +       continue;
> +
> +      /* Both statements must be assignments whose RHS is a COMPONENT_REF.  */
> +      if (!gimple_assign_single_p (def1)
> +         || !gimple_assign_single_p (def2))
> +       continue;
> +
> +      ref1 = gimple_assign_rhs1 (def1);
> +      ref2 = gimple_assign_rhs1 (def2);
> +
> +      if (TREE_CODE (ref1) != COMPONENT_REF
> +         || TREE_CODE (ref2) != COMPONENT_REF)
> +       continue;
> +
> +      /* The zeroth operand of the two component references must be
> +        identical.  It is not sufficient to compare get_base_address of
> +        the two references, because this could allow for different
> +        elements of the same array in the two trees.  It is not safe to
> +        assume that the existence of one array element implies the
> +        existence of a different one.  */
> +      if (!operand_equal_p (TREE_OPERAND (ref1, 0), TREE_OPERAND (ref2, 0), 0))
> +       continue;
> +
> +      field1 = TREE_OPERAND (ref1, 1);
> +      field2 = TREE_OPERAND (ref2, 1);
> +
> +      tree_offset1 = DECL_FIELD_BIT_OFFSET (field1);
> +      tree_offset2 = DECL_FIELD_BIT_OFFSET (field2);
> +      tree_size1 = DECL_SIZE_UNIT (field1);
> +      tree_size2 = DECL_SIZE_UNIT (field2);
> +
> +      if (TREE_CODE (tree_offset1) != INTEGER_CST
> +         || TREE_CODE (tree_offset2) != INTEGER_CST

DECL_FIELD_BIT_OFFSETs are always INTEGER_CSTs.  You fail to
verify that DECL_FIELD_OFFSET of the two fields matches.

> +         || TREE_CODE (tree_size1) != INTEGER_CST
> +         || TREE_CODE (tree_size2) != INTEGER_CST)
> +       continue;
> +
> +      /* FORNOW: The two field references must be byte-aligned, naturally
> +        aligned within the record, and adjacent.  */
> +      offset1 = TREE_INT_CST_LOW (tree_offset1);
> +      offset2 = TREE_INT_CST_LOW (tree_offset2);
> +      size1 = TREE_INT_CST_LOW (tree_size1);
> +      size2 = TREE_INT_CST_LOW (tree_size2);

You should have checked host_integerp above if you just handle TREE_INT_CST_LOW.

> +      align1 = TYPE_ALIGN (TREE_TYPE (arg1));
> +      align2 = TYPE_ALIGN (TREE_TYPE (arg2));

You should be using DECL_ALIGN of the field-decls here.

> +      if (offset1 % BITS_PER_UNIT != 0
> +         || offset2 % BITS_PER_UNIT != 0
> +         || offset1 % align1 != 0
> +         || offset2 % align2 != 0)
> +       continue;
> +
> +      offset1 /= BITS_PER_UNIT;
> +      offset2 /= BITS_PER_UNIT;
> +
> +      if (offset1 + size1 != offset2
> +         && offset2 + size2 != offset1)
> +       continue;

I think it's more natural to re-write all of the above to verify that
field1 follows
field2 by means of DECL_CHAIN (ignoring intermediate non-FIELD_DECLS)
(or the other way around) and to only verify alignment of field1 and the extent
of both accesses.  Note that field alignment can be a tricky business,
but DECL_ALIGN of the first FIELD_DECL should be a reasonable hint
(larger alignment cannot be easily detected because all information is relative
to a possibly parent struct FIELD_DECL you may not even see).

> +      /* The two field references must fit within a block of memory that
> +        is guaranteed to be on the same page.  */
> +      block_offset = MIN (offset1, offset2) % param_align;
> +
> +      if (block_offset + size1 + size2 > param_align)
> +       continue;
> +
> +      /* The two expressions cannot be dependent upon SSA names
> +        or vdefs defined in bb1/bb2.  */
> +      if (local_reg_dependence (TREE_OPERAND (ref1, 0), bb1)
> +         || local_reg_dependence (TREE_OPERAND (ref2, 0), bb2)
> +         || local_mem_dependence (def1, bb1)
> +         || local_mem_dependence (def2, bb2))
> +       continue;

You should be using FOR_EACH_SSA_USE_OPERAND here on the two
definition statements (and accordingly adjust those helpers, or even better,
trivially inline them).  I think you want a can_hoist_to_bb_end (stmt, bb)
helper, which is really what you check here.  Maybe that already exists
somewhere even.

> +      /* The conditions are satisfied; hoist the loads from bb1 and bb2 into
> +        bb0.  */
> +      gsi2 = gsi_for_stmt (def1);
> +      gsi_move_to_bb_end (&gsi2, bb0);
> +      gsi2 = gsi_for_stmt (def2);
> +      gsi_move_to_bb_end (&gsi2, bb0);

Should you be paying attention to the order of the loads here, thus load the
first memory part first?

Are we sure we always transform the diamond to a conditional move after
this transform, in the same pass?  I am concerned on how this conflicts
with the patch floating around that enables sinking of loads - what do we do
to avoid undoing this transform?

Thanks,
Richard.

> +      if (dump_file && (dump_flags & TDF_DETAILS))
> +       {
> +         fprintf (dump_file,
> +                  "\nHoisting adjacent loads from %d and %d into %d: \n",
> +                  bb1->index, bb2->index, bb0->index);
> +         print_gimple_stmt (dump_file, def1, 0, TDF_VOPS|TDF_MEMSYMS);
> +         print_gimple_stmt (dump_file, def2, 0, TDF_VOPS|TDF_MEMSYMS);
> +       }
> +    }
> +}
> +
> +/* Determine whether we should attempt to hoist adjacent loads out of
> +   diamond patterns in pass_phiopt.  Always hoist loads if
> +   -fhoist-adjacent-loads is specified, or if the target machine has
> +   a conditional move instruction and -fno-hoist-adjacent-loads is
> +   not specified.  */
> +
> +static bool
> +gate_hoist_loads (void)
> +{
> +  return (flag_hoist_adjacent_loads == 1
> +         || (HAVE_conditional_move && flag_hoist_adjacent_loads != 0));
> +}
> +
>  /* Always do these optimizations if we have SSA
>    trees to work on.  */
>  static bool
> Index: gcc/common.opt
> ===================================================================
> --- gcc/common.opt      (revision 187057)
> +++ gcc/common.opt      (working copy)
> @@ -1186,6 +1186,11 @@ fgraphite-identity
>  Common Report Var(flag_graphite_identity) Optimization
>  Enable Graphite Identity transformation
>
> +fhoist-adjacent-loads
> +Common Report Var(flag_hoist_adjacent_loads) Init(-1) Optimization
> +Enable hoisting adjacent loads to encourage generating conditional move
> +instructions
> +
>  floop-parallelize-all
>  Common Report Var(flag_loop_parallelize_all) Optimization
>  Mark all loops as parallel
> Index: gcc/Makefile.in
> ===================================================================
> --- gcc/Makefile.in     (revision 187057)
> +++ gcc/Makefile.in     (working copy)
> @@ -2351,7 +2351,8 @@ tree-ssa-phiopt.o : tree-ssa-phiopt.c $(CONFIG_H)
>    $(TM_H) $(GGC_H) $(TREE_H) $(TM_P_H) $(BASIC_BLOCK_H) \
>    $(TREE_FLOW_H) $(TREE_PASS_H) $(TREE_DUMP_H) langhooks.h $(FLAGS_H) \
>    $(DIAGNOSTIC_H) $(TIMEVAR_H) pointer-set.h domwalk.h $(CFGLOOP_H) \
> -   $(TREE_DATA_REF_H) tree-pretty-print.h
> +   $(TREE_DATA_REF_H) tree-pretty-print.h gimple-pretty-print.h \
> +   insn-config.h $(EXPR_H) $(OPTABS_H)
>  tree-nrv.o : tree-nrv.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \
>    $(TM_H) $(TREE_H) $(FUNCTION_H) $(BASIC_BLOCK_H) $(FLAGS_H) \
>    $(DIAGNOSTIC_H) $(TREE_FLOW_H) $(TIMEVAR_H) $(TREE_DUMP_H) $(TREE_PASS_H) \
> Index: gcc/params.def
> ===================================================================
> --- gcc/params.def      (revision 187057)
> +++ gcc/params.def      (working copy)
> @@ -979,6 +979,13 @@ DEFPARAM (PARAM_MAX_TRACKED_STRLENS,
>          "track string lengths",
>          1000, 0, 0)
>
> +/* For adjacent load hoisting transformation in tree-ssa-phiopt.c.  */
> +DEFPARAM (PARAM_MIN_CMOVE_STRUCT_ALIGN,
> +         "min-cmove-struct-align",
> +         "Minimum byte alignment to assume for structures in the stack "
> +         "or heap when considering load hoisting for conditional moves",
> +         16, 8, 256)
> +
>  /*
>  Local variables:
>  mode:c
>
>

Re: [PATCH] Hoist adjacent pointer loads

by William J. Schmidt :: Rate this Message:

| View Threaded | Show Only this Message

On Mon, 2012-05-21 at 14:17 +0200, Richard Guenther wrote:

> On Thu, May 3, 2012 at 4:33 PM, William J. Schmidt
> <wschmidt@...> wrote:
> > This patch was posted for comment back in February during stage 4.  It
> > addresses a performance issue noted in the EEMBC routelookup benchmark
> > on a common idiom:
> >
> >  if (...)
> >    x = y->left;
> >  else
> >    x = y->right;
> >
> > If the two loads can be hoisted out of the if/else, the if/else can be
> > replaced by a conditional move instruction on architectures that support
> > one.  Because this speculates one of the loads, the patch constrains the
> > optimization to avoid introducing page faults.
> >
> > Bootstrapped and regression tested on powerpc-unknown-linux-gnu with no
> > new failures.  The patch provides significant improvement to the
> > routelookup benchmark, and is neutral on SPEC cpu2000/cpu2006.
> >
> > One question is what optimization level should be required for this.
> > Because of the speculation, -O3 might be in order.  I don't believe
> > -Ofast is required as there is no potential correctness issue involved.
> > Right now the patch doesn't check the optimization level (like the rest
> > of the phi-opt transforms), which is likely a poor choice.
> >
> > Ok for trunk?
> >
> > Thanks,
> > Bill
> >
> >
> > 2012-05-03  Bill Schmidt  <wschmidt@...>
> >
> >        * tree-ssa-phiopt.c (tree_ssa_phiopt_worker): Add argument to forward
> >        declaration.
> >        (hoist_adjacent_loads, gate_hoist_loads): New forward declarations.
> >        (tree_ssa_phiopt): Call gate_hoist_loads.
> >        (tree_ssa_cs_elim): Add parm to tree_ssa_phiopt_worker call.
> >        (tree_ssa_phiopt_worker): Add do_hoist_loads to formal arg list; call
> >        hoist_adjacent_loads.
> >        (local_reg_dependence): New function.
> >        (local_mem_dependence): Likewise.
> >        (hoist_adjacent_loads): Likewise.
> >        (gate_hoist_loads): Likewise.
> >        * common.opt (fhoist-adjacent-loads): New switch.
> >        * Makefile.in (tree-ssa-phiopt.o): Added dependencies.
> >        * params.def (PARAM_MIN_CMOVE_STRUCT_ALIGN): New param.
> >
> >
> > Index: gcc/tree-ssa-phiopt.c
> > ===================================================================
> > --- gcc/tree-ssa-phiopt.c       (revision 187057)
> > +++ gcc/tree-ssa-phiopt.c       (working copy)
> > @@ -37,9 +37,17 @@ along with GCC; see the file COPYING3.  If not see
> >  #include "cfgloop.h"
> >  #include "tree-data-ref.h"
> >  #include "tree-pretty-print.h"
> > +#include "gimple-pretty-print.h"
> > +#include "insn-config.h"
> > +#include "expr.h"
> > +#include "optabs.h"
> >
> > +#ifndef HAVE_conditional_move
> > +#define HAVE_conditional_move (0)
> > +#endif
> > +
> >  static unsigned int tree_ssa_phiopt (void);
> > -static unsigned int tree_ssa_phiopt_worker (bool);
> > +static unsigned int tree_ssa_phiopt_worker (bool, bool);
> >  static bool conditional_replacement (basic_block, basic_block,
> >                                     edge, edge, gimple, tree, tree);
> >  static int value_replacement (basic_block, basic_block,
> > @@ -53,6 +61,9 @@ static bool cond_store_replacement (basic_block, b
> >  static bool cond_if_else_store_replacement (basic_block, basic_block, basic_block);
> >  static struct pointer_set_t * get_non_trapping (void);
> >  static void replace_phi_edge_with_variable (basic_block, edge, gimple, tree);
> > +static void hoist_adjacent_loads (basic_block, basic_block,
> > +                                 basic_block, basic_block);
> > +static bool gate_hoist_loads (void);
> >
> >  /* This pass tries to replaces an if-then-else block with an
> >    assignment.  We have four kinds of transformations.  Some of these
> > @@ -138,12 +149,56 @@ static void replace_phi_edge_with_variable (basic_
> >      bb2:
> >        x = PHI <x' (bb0), ...>;
> >
> > -   A similar transformation is done for MAX_EXPR.  */
> > +   A similar transformation is done for MAX_EXPR.
> >
> > +
> > +   This pass also performs a fifth transformation of a slightly different
> > +   flavor.
> > +
> > +   Adjacent Load Hoisting
> > +   ----------------------
> > +
> > +   This transformation replaces
> > +
> > +     bb0:
> > +       if (...) goto bb2; else goto bb1;
> > +     bb1:
> > +       x1 = (<expr>).field1;
> > +       goto bb3;
> > +     bb2:
> > +       x2 = (<expr>).field2;
> > +     bb3:
> > +       # x = PHI <x1, x2>;
> > +
> > +   with
> > +
> > +     bb0:
> > +       x1 = (<expr>).field1;
> > +       x2 = (<expr>).field2;
> > +       if (...) goto bb2; else goto bb1;
> > +     bb1:
> > +       goto bb3;
> > +     bb2:
> > +     bb3:
> > +       # x = PHI <x1, x2>;
> > +
> > +   The purpose of this transformation is to enable generation of conditional
> > +   move instructions such as Intel CMOVE or PowerPC ISEL.  Because one of
> > +   the loads is speculative, the transformation is restricted to very
> > +   specific cases to avoid introducing a page fault.  We are looking for
> > +   the common idiom:
> > +
> > +     if (...)
> > +       x = y->left;
> > +     else
> > +       x = y->right;
> > +
> > +   where left and right are typically adjacent pointers in a tree structure.  */
> > +
> >  static unsigned int
> >  tree_ssa_phiopt (void)
> >  {
> > -  return tree_ssa_phiopt_worker (false);
> > +  return tree_ssa_phiopt_worker (false, gate_hoist_loads ());
> >  }
> >
> >  /* This pass tries to transform conditional stores into unconditional
> > @@ -190,7 +245,7 @@ tree_ssa_phiopt (void)
> >  static unsigned int
> >  tree_ssa_cs_elim (void)
> >  {
> > -  return tree_ssa_phiopt_worker (true);
> > +  return tree_ssa_phiopt_worker (true, false);
> >  }
> >
> >  /* Return the singleton PHI in the SEQ of PHIs for edges E0 and E1. */
> > @@ -227,9 +282,11 @@ static tree condstoretemp;
> >  /* The core routine of conditional store replacement and normal
> >    phi optimizations.  Both share much of the infrastructure in how
> >    to match applicable basic block patterns.  DO_STORE_ELIM is true
> > -   when we want to do conditional store replacement, false otherwise.  */
> > +   when we want to do conditional store replacement, false otherwise.
> > +   DO_HOIST_LOADS is true when we want to hoist adjacent loads out
> > +   of diamond control flow patterns, false otherwise.  */
> >  static unsigned int
> > -tree_ssa_phiopt_worker (bool do_store_elim)
> > +tree_ssa_phiopt_worker (bool do_store_elim, bool do_hoist_loads)
> >  {
> >   basic_block bb;
> >   basic_block *bb_order;
> > @@ -312,6 +369,20 @@ static unsigned int
> >            cfgchanged = true;
> >          continue;
> >        }
> > +      else if (do_hoist_loads
> > +                && EDGE_SUCC (bb1, 0)->dest == EDGE_SUCC (bb2, 0)->dest)
> > +       {
> > +         basic_block bb3 = EDGE_SUCC (bb1, 0)->dest;
> > +
> > +         if (single_succ_p (bb1)
> > +             && single_succ_p (bb2)
> > +             && single_pred_p (bb1)
> > +             && single_pred_p (bb2)
>
> and same pred?

bb1 and bb2 are known to be successors of bb, so this is handled.

>
> > +             && EDGE_COUNT (bb->succs) == 2
> > +             && EDGE_COUNT (bb3->preds) == 2)
> > +           hoist_adjacent_loads (bb, bb1, bb2, bb3);
> > +         continue;
> > +       }
> >       else
> >        continue;
> >
> > @@ -1707,6 +1778,220 @@ cond_if_else_store_replacement (basic_block then_b
> >   return ok;
> >  }
> >
> > +/* Return TRUE iff EXPR contains any SSA names that are defined in BB.  */
> > +
> > +static bool
> > +local_reg_dependence (tree expr, basic_block bb)
> > +{
> > +  int i;
> > +
> > +  if (TREE_CODE (expr) == SSA_NAME)
> > +    {
> > +      if (SSA_NAME_IS_DEFAULT_DEF (expr))
> > +       return false;
> > +      else
> > +       return (gimple_bb (SSA_NAME_DEF_STMT (expr)) == bb);
> > +    }
> > +
> > +  for (i = 0; i < TREE_OPERAND_LENGTH (expr); i++)
> > +    if (TREE_OPERAND (expr, i)
> > +       && local_reg_dependence (TREE_OPERAND (expr, i), bb))
> > +      return true;
>
> This looks odd - what are you trying to catch here?

My initial thought was, we can't hoist an expr relying on an SSA name
defined in the block it's being hoisted from.  Given all the other
requirements on the refs, though, I suppose that's impossible.  (If the
refs are identical except for the FIELD_DECL, they can't be locally
dependent on any SSA names.)  Should be able to remove this.  I was
adding code for the memory dependence and got carried away.

>
> > +  return false;
> > +}
> > +
> > +/* Return TRUE if STMT has a VUSE whose corresponding VDEF is in BB.  */
> > +
> > +static bool
> > +local_mem_dependence (gimple stmt, basic_block bb)
> > +{
> > +  tree vuse = gimple_vuse (stmt);
> > +  gimple def;
> > +
> > +  if (!vuse)
> > +    return false;
> > +
> > +  def = SSA_NAME_DEF_STMT (vuse);
> > +  return (def && gimple_bb (def) == bb);
> > +}
> > +
> > +/* Given a "diamond" control-flow pattern where BB0 tests a condition,
> > +   BB1 and BB2 are "then" and "else" blocks dependent on this test,
> > +   and BB3 rejoins control flow following BB1 and BB2, look for
> > +   opportunities to hoist loads as follows.  If BB3 contains a PHI of
> > +   two loads, one each occurring in BB1 and BB2, and the loads are
> > +   provably of adjacent fields in the same structure, then move both
> > +   loads into BB0.  Of course this can only be done if there are no
> > +   dependencies preventing such motion.
> > +
> > +   One of the hoisted loads will always be speculative, so the
> > +   transformation is currently conservative:
> > +
> > +    - The fields must be strictly adjacent.
> > +    - Hoisting is only done on aligned pointer fields.
> > +    - The two fields must occupy a single memory block that is
> > +      guaranteed to not cross a page boundary.
> > +
> > +    The last is difficult to prove, as such memory blocks should be
> > +    aligned on the minimum of the stack alignment boundary and the
> > +    alignment guaranteed by heap allocation interfaces.  Thus we rely
> > +    on a parameter for the alignment value.
> > +
> > +    Provided a good value is used for the last case, the first two
> > +    restrictions can probably be relaxed.  */
> > +
> > +static void
> > +hoist_adjacent_loads (basic_block bb0, basic_block bb1,
> > +                     basic_block bb2, basic_block bb3)
> > +{
> > +  int param_align = PARAM_VALUE (PARAM_MIN_CMOVE_STRUCT_ALIGN);
> > +  gimple_stmt_iterator gsi;
> > +
> > +  /* Walk the phis in bb3 looking for an opportunity.  We are looking
> > +     for phis of two SSA names, one each of which is defined in bb1 and
> > +     bb2.  */
> > +  for (gsi = gsi_start_phis (bb3); !gsi_end_p (gsi); gsi_next (&gsi))
> > +    {
> > +      gimple phi_stmt = gsi_stmt (gsi);
> > +      gimple def1, def2;
> > +      tree arg1, arg2, ref1, ref2, field1, field2;
> > +      tree tree_offset1, tree_offset2, tree_size1, tree_size2;
> > +      int offset1, offset2, size1, size2, block_offset;
> > +      unsigned align1, align2;
> > +      gimple_stmt_iterator gsi2;
> > +
> > +      if (gimple_phi_num_args (phi_stmt) != 2)
> > +       continue;
> > +
> > +      arg1 = gimple_phi_arg_def (phi_stmt, 0);
> > +      arg2 = gimple_phi_arg_def (phi_stmt, 1);
> > +
> > +      if (TREE_CODE (arg1) != SSA_NAME
> > +         || TREE_CODE (arg2) != SSA_NAME
> > +         || SSA_NAME_IS_DEFAULT_DEF (arg1)
> > +         || SSA_NAME_IS_DEFAULT_DEF (arg2)
> > +         /* FORNOW: Only do this optimization for pointer types.  */
> > +         || !POINTER_TYPE_P (TREE_TYPE (arg1)))
>
> I'd like to see it enabled generally, not only for pointer types.

OK.  I'll revise the patch and check some performance numbers.  Just
trying to be conservative given that I can't test all the various kinds
of hardware out there...

>
> > +       continue;
> > +
> > +      def1 = SSA_NAME_DEF_STMT (arg1);
> > +      def2 = SSA_NAME_DEF_STMT (arg2);
> > +
> > +      if ((gimple_bb (def1) != bb1 || gimple_bb (def2) != bb2)
> > +         && (gimple_bb (def2) != bb1 || gimple_bb (def1) != bb2))
> > +       continue;
> > +
> > +      /* Unless -fhoist-adjacent-loads was specified, check the mode
> > +        of the arguments to be sure a conditional move can be generated
> > +        for it.  */
> > +      if (flag_hoist_adjacent_loads != 1
> > +         && !optab_handler (cmov_optab, TYPE_MODE (TREE_TYPE (arg1))))
> > +       continue;
> > +
> > +      /* Both statements must be assignments whose RHS is a COMPONENT_REF.  */
> > +      if (!gimple_assign_single_p (def1)
> > +         || !gimple_assign_single_p (def2))
> > +       continue;
> > +
> > +      ref1 = gimple_assign_rhs1 (def1);
> > +      ref2 = gimple_assign_rhs1 (def2);
> > +
> > +      if (TREE_CODE (ref1) != COMPONENT_REF
> > +         || TREE_CODE (ref2) != COMPONENT_REF)
> > +       continue;
> > +
> > +      /* The zeroth operand of the two component references must be
> > +        identical.  It is not sufficient to compare get_base_address of
> > +        the two references, because this could allow for different
> > +        elements of the same array in the two trees.  It is not safe to
> > +        assume that the existence of one array element implies the
> > +        existence of a different one.  */
> > +      if (!operand_equal_p (TREE_OPERAND (ref1, 0), TREE_OPERAND (ref2, 0), 0))
> > +       continue;
> > +
> > +      field1 = TREE_OPERAND (ref1, 1);
> > +      field2 = TREE_OPERAND (ref2, 1);
> > +
> > +      tree_offset1 = DECL_FIELD_BIT_OFFSET (field1);
> > +      tree_offset2 = DECL_FIELD_BIT_OFFSET (field2);
> > +      tree_size1 = DECL_SIZE_UNIT (field1);
> > +      tree_size2 = DECL_SIZE_UNIT (field2);
> > +
> > +      if (TREE_CODE (tree_offset1) != INTEGER_CST
> > +         || TREE_CODE (tree_offset2) != INTEGER_CST
>
> DECL_FIELD_BIT_OFFSETs are always INTEGER_CSTs.  You fail to
> verify that DECL_FIELD_OFFSET of the two fields matches.

OK, will change.

>
> > +         || TREE_CODE (tree_size1) != INTEGER_CST
> > +         || TREE_CODE (tree_size2) != INTEGER_CST)
> > +       continue;
> > +
> > +      /* FORNOW: The two field references must be byte-aligned, naturally
> > +        aligned within the record, and adjacent.  */
> > +      offset1 = TREE_INT_CST_LOW (tree_offset1);
> > +      offset2 = TREE_INT_CST_LOW (tree_offset2);
> > +      size1 = TREE_INT_CST_LOW (tree_size1);
> > +      size2 = TREE_INT_CST_LOW (tree_size2);
>
> You should have checked host_integerp above if you just handle TREE_INT_CST_LOW.

Whoops.  Yes.

>
> > +      align1 = TYPE_ALIGN (TREE_TYPE (arg1));
> > +      align2 = TYPE_ALIGN (TREE_TYPE (arg2));
>
> You should be using DECL_ALIGN of the field-decls here.

OK.

>
> > +      if (offset1 % BITS_PER_UNIT != 0
> > +         || offset2 % BITS_PER_UNIT != 0
> > +         || offset1 % align1 != 0
> > +         || offset2 % align2 != 0)
> > +       continue;
> > +
> > +      offset1 /= BITS_PER_UNIT;
> > +      offset2 /= BITS_PER_UNIT;
> > +
> > +      if (offset1 + size1 != offset2
> > +         && offset2 + size2 != offset1)
> > +       continue;
>
> I think it's more natural to re-write all of the above to verify that
> field1 follows
> field2 by means of DECL_CHAIN (ignoring intermediate non-FIELD_DECLS)
> (or the other way around) and to only verify alignment of field1 and the extent
> of both accesses.  Note that field alignment can be a tricky business,
> but DECL_ALIGN of the first FIELD_DECL should be a reasonable hint
> (larger alignment cannot be easily detected because all information is relative
> to a possibly parent struct FIELD_DECL you may not even see).

OK, sounds good.  I'll rewrite it in this fashion.

>
> > +      /* The two field references must fit within a block of memory that
> > +        is guaranteed to be on the same page.  */
> > +      block_offset = MIN (offset1, offset2) % param_align;
> > +
> > +      if (block_offset + size1 + size2 > param_align)
> > +       continue;
> > +
> > +      /* The two expressions cannot be dependent upon SSA names
> > +        or vdefs defined in bb1/bb2.  */
> > +      if (local_reg_dependence (TREE_OPERAND (ref1, 0), bb1)
> > +         || local_reg_dependence (TREE_OPERAND (ref2, 0), bb2)
> > +         || local_mem_dependence (def1, bb1)
> > +         || local_mem_dependence (def2, bb2))
> > +       continue;
>
> You should be using FOR_EACH_SSA_USE_OPERAND here on the two
> definition statements (and accordingly adjust those helpers, or even better,
> trivially inline them).  I think you want a can_hoist_to_bb_end (stmt, bb)
> helper, which is really what you check here.  Maybe that already exists
> somewhere even.

If I remove local_reg_dependence and just leave the local_mem_dependence
checks in place, I believe that should be sufficient, right?  Given the
constraints, checking virtual dependences ought to be enough.

>
> > +      /* The conditions are satisfied; hoist the loads from bb1 and bb2 into
> > +        bb0.  */
> > +      gsi2 = gsi_for_stmt (def1);
> > +      gsi_move_to_bb_end (&gsi2, bb0);
> > +      gsi2 = gsi_for_stmt (def2);
> > +      gsi_move_to_bb_end (&gsi2, bb0);
>
> Should you be paying attention to the order of the loads here, thus load the
> first memory part first?

Perhaps so, in case of a cache miss.  I know of some existing hardware
that will begin a cache fill at the point of the miss and wrap it
around, so such hardware would benefit.

>
> Are we sure we always transform the diamond to a conditional move after
> this transform, in the same pass?  I am concerned on how this conflicts
> with the patch floating around that enables sinking of loads - what do we do
> to avoid undoing this transform?

Hm, I was somewhat relying on the existing rule of no load-sinking.
However, I think we're still OK.  There is a late pass of phiopt long
after the sink pass which will re-do the optimization even if sinking
undoes it.  Right now we rely on RTL if-conversion to generate the
conditional move.  I'm open to suggestions if that doesn't seem like
enough of a guarantee.

Thanks,
Bill

>
> Thanks,
> Richard.
>
> > +      if (dump_file && (dump_flags & TDF_DETAILS))
> > +       {
> > +         fprintf (dump_file,
> > +                  "\nHoisting adjacent loads from %d and %d into %d: \n",
> > +                  bb1->index, bb2->index, bb0->index);
> > +         print_gimple_stmt (dump_file, def1, 0, TDF_VOPS|TDF_MEMSYMS);
> > +         print_gimple_stmt (dump_file, def2, 0, TDF_VOPS|TDF_MEMSYMS);
> > +       }
> > +    }
> > +}
> > +
> > +/* Determine whether we should attempt to hoist adjacent loads out of
> > +   diamond patterns in pass_phiopt.  Always hoist loads if
> > +   -fhoist-adjacent-loads is specified, or if the target machine has
> > +   a conditional move instruction and -fno-hoist-adjacent-loads is
> > +   not specified.  */
> > +
> > +static bool
> > +gate_hoist_loads (void)
> > +{
> > +  return (flag_hoist_adjacent_loads == 1
> > +         || (HAVE_conditional_move && flag_hoist_adjacent_loads != 0));
> > +}
> > +
> >  /* Always do these optimizations if we have SSA
> >    trees to work on.  */
> >  static bool
> > Index: gcc/common.opt
> > ===================================================================
> > --- gcc/common.opt      (revision 187057)
> > +++ gcc/common.opt      (working copy)
> > @@ -1186,6 +1186,11 @@ fgraphite-identity
> >  Common Report Var(flag_graphite_identity) Optimization
> >  Enable Graphite Identity transformation
> >
> > +fhoist-adjacent-loads
> > +Common Report Var(flag_hoist_adjacent_loads) Init(-1) Optimization
> > +Enable hoisting adjacent loads to encourage generating conditional move
> > +instructions
> > +
> >  floop-parallelize-all
> >  Common Report Var(flag_loop_parallelize_all) Optimization
> >  Mark all loops as parallel
> > Index: gcc/Makefile.in
> > ===================================================================
> > --- gcc/Makefile.in     (revision 187057)
> > +++ gcc/Makefile.in     (working copy)
> > @@ -2351,7 +2351,8 @@ tree-ssa-phiopt.o : tree-ssa-phiopt.c $(CONFIG_H)
> >    $(TM_H) $(GGC_H) $(TREE_H) $(TM_P_H) $(BASIC_BLOCK_H) \
> >    $(TREE_FLOW_H) $(TREE_PASS_H) $(TREE_DUMP_H) langhooks.h $(FLAGS_H) \
> >    $(DIAGNOSTIC_H) $(TIMEVAR_H) pointer-set.h domwalk.h $(CFGLOOP_H) \
> > -   $(TREE_DATA_REF_H) tree-pretty-print.h
> > +   $(TREE_DATA_REF_H) tree-pretty-print.h gimple-pretty-print.h \
> > +   insn-config.h $(EXPR_H) $(OPTABS_H)
> >  tree-nrv.o : tree-nrv.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \
> >    $(TM_H) $(TREE_H) $(FUNCTION_H) $(BASIC_BLOCK_H) $(FLAGS_H) \
> >    $(DIAGNOSTIC_H) $(TREE_FLOW_H) $(TIMEVAR_H) $(TREE_DUMP_H) $(TREE_PASS_H) \
> > Index: gcc/params.def
> > ===================================================================
> > --- gcc/params.def      (revision 187057)
> > +++ gcc/params.def      (working copy)
> > @@ -979,6 +979,13 @@ DEFPARAM (PARAM_MAX_TRACKED_STRLENS,
> >          "track string lengths",
> >          1000, 0, 0)
> >
> > +/* For adjacent load hoisting transformation in tree-ssa-phiopt.c.  */
> > +DEFPARAM (PARAM_MIN_CMOVE_STRUCT_ALIGN,
> > +         "min-cmove-struct-align",
> > +         "Minimum byte alignment to assume for structures in the stack "
> > +         "or heap when considering load hoisting for conditional moves",
> > +         16, 8, 256)
> > +
> >  /*
> >  Local variables:
> >  mode:c
> >
> >
>


Re: [PATCH] Hoist adjacent pointer loads

by William J. Schmidt :: Rate this Message:

| View Threaded | Show Only this Message

Here's a revision of the hoist-adjacent-loads patch.  Besides hopefully
addressing all your comments, I added a gate of at least -O2 for this
transformation.  Let me know if you prefer a different minimum opt
level.

I'm still running SPEC tests to make sure there are no regressions when
opening this up to non-pointer arguments.  The code bootstraps on
powerpc64-unknown-linux-gnu with no regressions.  Assuming the SPEC
numbers come out as expected, is this ok?

Thanks,
Bill


2012-05-22  Bill Schmidt  <wschmidt@...>

        * tree-ssa-phiopt.c (tree_ssa_phiopt_worker): Add argument to forward
        declaration.
        (hoist_adjacent_loads, gate_hoist_loads): New forward declarations.
        (tree_ssa_phiopt): Call gate_hoist_loads.
        (tree_ssa_cs_elim): Add parm to tree_ssa_phiopt_worker call.
        (tree_ssa_phiopt_worker): Add do_hoist_loads to formal arg list; call
        hoist_adjacent_loads.
        (local_mem_dependence): New function.
        (hoist_adjacent_loads): Likewise.
        (gate_hoist_loads): Likewise.
        * common.opt (fhoist-adjacent-loads): New switch.
        * Makefile.in (tree-ssa-phiopt.o): Added dependencies.
        * params.def (PARAM_MIN_CMOVE_STRUCT_ALIGN): New param.


Index: gcc/tree-ssa-phiopt.c
===================================================================
--- gcc/tree-ssa-phiopt.c (revision 187728)
+++ gcc/tree-ssa-phiopt.c (working copy)
@@ -37,9 +37,17 @@ along with GCC; see the file COPYING3.  If not see
 #include "cfgloop.h"
 #include "tree-data-ref.h"
 #include "tree-pretty-print.h"
+#include "gimple-pretty-print.h"
+#include "insn-config.h"
+#include "expr.h"
+#include "optabs.h"
 
+#ifndef HAVE_conditional_move
+#define HAVE_conditional_move (0)
+#endif
+
 static unsigned int tree_ssa_phiopt (void);
-static unsigned int tree_ssa_phiopt_worker (bool);
+static unsigned int tree_ssa_phiopt_worker (bool, bool);
 static bool conditional_replacement (basic_block, basic_block,
      edge, edge, gimple, tree, tree);
 static int value_replacement (basic_block, basic_block,
@@ -53,6 +61,9 @@ static bool cond_store_replacement (basic_block, b
 static bool cond_if_else_store_replacement (basic_block, basic_block, basic_block);
 static struct pointer_set_t * get_non_trapping (void);
 static void replace_phi_edge_with_variable (basic_block, edge, gimple, tree);
+static void hoist_adjacent_loads (basic_block, basic_block,
+  basic_block, basic_block);
+static bool gate_hoist_loads (void);
 
 /* This pass tries to replaces an if-then-else block with an
    assignment.  We have four kinds of transformations.  Some of these
@@ -138,12 +149,56 @@ static void replace_phi_edge_with_variable (basic_
      bb2:
        x = PHI <x' (bb0), ...>;
 
-   A similar transformation is done for MAX_EXPR.  */
+   A similar transformation is done for MAX_EXPR.
 
+
+   This pass also performs a fifth transformation of a slightly different
+   flavor.
+
+   Adjacent Load Hoisting
+   ----------------------
+  
+   This transformation replaces
+
+     bb0:
+       if (...) goto bb2; else goto bb1;
+     bb1:
+       x1 = (<expr>).field1;
+       goto bb3;
+     bb2:
+       x2 = (<expr>).field2;
+     bb3:
+       # x = PHI <x1, x2>;
+
+   with
+
+     bb0:
+       x1 = (<expr>).field1;
+       x2 = (<expr>).field2;
+       if (...) goto bb2; else goto bb1;
+     bb1:
+       goto bb3;
+     bb2:
+     bb3:
+       # x = PHI <x1, x2>;
+
+   The purpose of this transformation is to enable generation of conditional
+   move instructions such as Intel CMOVE or PowerPC ISEL.  Because one of
+   the loads is speculative, the transformation is restricted to very
+   specific cases to avoid introducing a page fault.  We are looking for
+   the common idiom:
+
+     if (...)
+       x = y->left;
+     else
+       x = y->right;
+
+   where left and right are typically adjacent pointers in a tree structure.  */
+
 static unsigned int
 tree_ssa_phiopt (void)
 {
-  return tree_ssa_phiopt_worker (false);
+  return tree_ssa_phiopt_worker (false, gate_hoist_loads ());
 }
 
 /* This pass tries to transform conditional stores into unconditional
@@ -190,7 +245,7 @@ tree_ssa_phiopt (void)
 static unsigned int
 tree_ssa_cs_elim (void)
 {
-  return tree_ssa_phiopt_worker (true);
+  return tree_ssa_phiopt_worker (true, false);
 }
 
 /* Return the singleton PHI in the SEQ of PHIs for edges E0 and E1. */
@@ -227,9 +282,11 @@ static tree condstoretemp;
 /* The core routine of conditional store replacement and normal
    phi optimizations.  Both share much of the infrastructure in how
    to match applicable basic block patterns.  DO_STORE_ELIM is true
-   when we want to do conditional store replacement, false otherwise.  */
+   when we want to do conditional store replacement, false otherwise.
+   DO_HOIST_LOADS is true when we want to hoist adjacent loads out
+   of diamond control flow patterns, false otherwise.  */
 static unsigned int
-tree_ssa_phiopt_worker (bool do_store_elim)
+tree_ssa_phiopt_worker (bool do_store_elim, bool do_hoist_loads)
 {
   basic_block bb;
   basic_block *bb_order;
@@ -312,6 +369,20 @@ static unsigned int
     cfgchanged = true;
   continue;
  }
+      else if (do_hoist_loads
+ && EDGE_SUCC (bb1, 0)->dest == EDGE_SUCC (bb2, 0)->dest)
+ {
+  basic_block bb3 = EDGE_SUCC (bb1, 0)->dest;
+
+  if (single_succ_p (bb1)
+      && single_succ_p (bb2)
+      && single_pred_p (bb1)
+      && single_pred_p (bb2)
+      && EDGE_COUNT (bb->succs) == 2
+      && EDGE_COUNT (bb3->preds) == 2)
+    hoist_adjacent_loads (bb, bb1, bb2, bb3);
+  continue;
+ }
       else
  continue;      
 
@@ -1707,6 +1778,206 @@ cond_if_else_store_replacement (basic_block then_b
   return ok;
 }
 
+/* Return TRUE if STMT has a VUSE whose corresponding VDEF is in BB.  */
+
+static bool
+local_mem_dependence (gimple stmt, basic_block bb)
+{
+  tree vuse = gimple_vuse (stmt);
+  gimple def;
+
+  if (!vuse)
+    return false;
+
+  def = SSA_NAME_DEF_STMT (vuse);
+  return (def && gimple_bb (def) == bb);
+}
+
+/* Given a "diamond" control-flow pattern where BB0 tests a condition,
+   BB1 and BB2 are "then" and "else" blocks dependent on this test,
+   and BB3 rejoins control flow following BB1 and BB2, look for
+   opportunities to hoist loads as follows.  If BB3 contains a PHI of
+   two loads, one each occurring in BB1 and BB2, and the loads are
+   provably of adjacent fields in the same structure, then move both
+   loads into BB0.  Of course this can only be done if there are no
+   dependencies preventing such motion.
+
+   One of the hoisted loads will always be speculative, so the
+   transformation is currently conservative:
+
+    - The fields must be strictly adjacent.
+    - The two fields must occupy a single memory block that is
+      guaranteed to not cross a page boundary.
+
+    The last is difficult to prove, as such memory blocks should be
+    aligned on the minimum of the stack alignment boundary and the
+    alignment guaranteed by heap allocation interfaces.  Thus we rely
+    on a parameter for the alignment value.
+
+    Provided a good value is used for the last case, the first
+    restriction could possibly be relaxed.  */
+
+static void
+hoist_adjacent_loads (basic_block bb0, basic_block bb1,
+      basic_block bb2, basic_block bb3)
+{
+  int param_align = PARAM_VALUE (PARAM_MIN_CMOVE_STRUCT_ALIGN);
+  gimple_stmt_iterator gsi;
+
+  /* Walk the phis in bb3 looking for an opportunity.  We are looking
+     for phis of two SSA names, one each of which is defined in bb1 and
+     bb2.  */
+  for (gsi = gsi_start_phis (bb3); !gsi_end_p (gsi); gsi_next (&gsi))
+    {
+      gimple phi_stmt = gsi_stmt (gsi);
+      gimple def1, def2, defswap;
+      tree arg1, arg2, ref1, ref2, field1, field2, fieldswap;
+      tree tree_offset1, tree_offset2, tree_size2, next;
+      int offset1, offset2, size2, block_offset;
+      unsigned align1;
+      gimple_stmt_iterator gsi2;
+
+      if (gimple_phi_num_args (phi_stmt) != 2)
+ continue;
+
+      arg1 = gimple_phi_arg_def (phi_stmt, 0);
+      arg2 = gimple_phi_arg_def (phi_stmt, 1);
+      
+      if (TREE_CODE (arg1) != SSA_NAME
+  || TREE_CODE (arg2) != SSA_NAME
+  || SSA_NAME_IS_DEFAULT_DEF (arg1)
+  || SSA_NAME_IS_DEFAULT_DEF (arg2))
+ continue;
+
+      def1 = SSA_NAME_DEF_STMT (arg1);
+      def2 = SSA_NAME_DEF_STMT (arg2);
+
+      if ((gimple_bb (def1) != bb1 || gimple_bb (def2) != bb2)
+  && (gimple_bb (def2) != bb1 || gimple_bb (def1) != bb2))
+ continue;
+
+      /* Unless -fhoist-adjacent-loads was specified, check the mode
+ of the arguments to be sure a conditional move can be generated
+ for it.  */
+      if (flag_hoist_adjacent_loads != 1
+  && !optab_handler (cmov_optab, TYPE_MODE (TREE_TYPE (arg1))))
+ continue;
+
+      /* Both statements must be assignments whose RHS is a COMPONENT_REF.  */
+      if (!gimple_assign_single_p (def1)
+  || !gimple_assign_single_p (def2))
+ continue;
+
+      ref1 = gimple_assign_rhs1 (def1);
+      ref2 = gimple_assign_rhs1 (def2);
+
+      if (TREE_CODE (ref1) != COMPONENT_REF
+  || TREE_CODE (ref2) != COMPONENT_REF)
+ continue;
+
+      /* The zeroth operand of the two component references must be
+ identical.  It is not sufficient to compare get_base_address of
+ the two references, because this could allow for different
+ elements of the same array in the two trees.  It is not safe to
+ assume that the existence of one array element implies the
+ existence of a different one.  */
+      if (!operand_equal_p (TREE_OPERAND (ref1, 0), TREE_OPERAND (ref2, 0), 0))
+ continue;
+
+      field1 = TREE_OPERAND (ref1, 1);
+      field2 = TREE_OPERAND (ref2, 1);
+
+      /* Check for field adjacency, and ensure field1 comes first.  */
+      for (next = DECL_CHAIN (field1);
+   next && TREE_CODE (next) != FIELD_DECL;
+   next = DECL_CHAIN (next))
+ ;
+
+      if (!next || !operand_equal_p (next, field2, 0))
+ {
+  for (next = DECL_CHAIN (field2);
+       next && TREE_CODE (next) != FIELD_DECL;
+       next = DECL_CHAIN (next))
+    ;
+
+  if (!next || !operand_equal_p (next, field1, 0))
+    continue;
+
+  fieldswap = field1;
+  field1 = field2;
+  field2 = fieldswap;
+  defswap = def1;
+  def1 = def2;
+  def2 = defswap;
+ }
+
+      /* Check for proper alignment of the first field.  */
+      tree_offset1 = bit_position (field1);
+      tree_offset2 = bit_position (field2);
+      tree_size2 = DECL_SIZE_UNIT (field2);
+
+      if (TREE_CODE (tree_size2) != INTEGER_CST
+  || !host_integerp (tree_offset1, 0)
+  || !host_integerp (tree_offset2, 0)
+  || !host_integerp (tree_size2, 0))
+ continue;
+
+      offset1 = TREE_INT_CST_LOW (tree_offset1);
+      offset2 = TREE_INT_CST_LOW (tree_offset2);
+      size2 = TREE_INT_CST_LOW (tree_size2);
+      align1 = DECL_ALIGN (field1);
+
+      if (offset1 % BITS_PER_UNIT != 0 || offset1 % align1 != 0)
+ continue;
+
+      offset1 /= BITS_PER_UNIT;
+      offset2 /= BITS_PER_UNIT;
+
+      /* The two field references must fit within a block of memory that
+ is guaranteed to be on the same page.  */
+      block_offset = offset1 % param_align;
+
+      if (block_offset + offset2 - offset1 + size2 > param_align)
+ continue;
+
+      /* The two expressions cannot be dependent upon vdefs defined
+ in bb1/bb2.  */
+      if (local_mem_dependence (def1, bb1) || local_mem_dependence (def2, bb2))
+ continue;
+
+      /* The conditions are satisfied; hoist the loads from bb1 and bb2 into
+ bb0.  We hoist the first one first so that a cache miss is handled
+         efficiently regardless of hardware cache-fill policy.  */
+      gsi2 = gsi_for_stmt (def1);
+      gsi_move_to_bb_end (&gsi2, bb0);
+      gsi2 = gsi_for_stmt (def2);
+      gsi_move_to_bb_end (&gsi2, bb0);
+
+      if (dump_file && (dump_flags & TDF_DETAILS))
+ {
+  fprintf (dump_file,
+   "\nHoisting adjacent loads from %d and %d into %d: \n",
+   bb1->index, bb2->index, bb0->index);
+  print_gimple_stmt (dump_file, def1, 0, TDF_VOPS|TDF_MEMSYMS);
+  print_gimple_stmt (dump_file, def2, 0, TDF_VOPS|TDF_MEMSYMS);
+ }
+    }
+}
+
+/* Determine whether we should attempt to hoist adjacent loads out of
+   diamond patterns in pass_phiopt.  Always hoist loads if
+   -fhoist-adjacent-loads is specified, or if the target machine has
+   a conditional move instruction and -fno-hoist-adjacent-loads is
+   not specified.  */
+
+static bool
+gate_hoist_loads (void)
+{
+  return ((optimize > 1)
+  && (flag_hoist_adjacent_loads == 1
+      || (HAVE_conditional_move && flag_hoist_adjacent_loads != 0)));
+}
+
 /* Always do these optimizations if we have SSA
    trees to work on.  */
 static bool
Index: gcc/common.opt
===================================================================
--- gcc/common.opt (revision 187728)
+++ gcc/common.opt (working copy)
@@ -1186,6 +1186,11 @@ fgraphite-identity
 Common Report Var(flag_graphite_identity) Optimization
 Enable Graphite Identity transformation
 
+fhoist-adjacent-loads
+Common Report Var(flag_hoist_adjacent_loads) Init(-1) Optimization
+Enable hoisting adjacent loads to encourage generating conditional move
+instructions
+
 floop-parallelize-all
 Common Report Var(flag_loop_parallelize_all) Optimization
 Mark all loops as parallel
Index: gcc/Makefile.in
===================================================================
--- gcc/Makefile.in (revision 187728)
+++ gcc/Makefile.in (working copy)
@@ -2355,7 +2355,8 @@ tree-ssa-phiopt.o : tree-ssa-phiopt.c $(CONFIG_H)
    $(TM_H) $(GGC_H) $(TREE_H) $(TM_P_H) $(BASIC_BLOCK_H) \
    $(TREE_FLOW_H) $(TREE_PASS_H) $(TREE_DUMP_H) langhooks.h $(FLAGS_H) \
    $(DIAGNOSTIC_H) $(TIMEVAR_H) pointer-set.h domwalk.h $(CFGLOOP_H) \
-   $(TREE_DATA_REF_H) tree-pretty-print.h
+   $(TREE_DATA_REF_H) tree-pretty-print.h gimple-pretty-print.h \
+   insn-config.h $(EXPR_H) $(OPTABS_H)
 tree-nrv.o : tree-nrv.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \
    $(TM_H) $(TREE_H) $(FUNCTION_H) $(BASIC_BLOCK_H) $(FLAGS_H) \
    $(DIAGNOSTIC_H) $(TREE_FLOW_H) $(TIMEVAR_H) $(TREE_DUMP_H) $(TREE_PASS_H) \
Index: gcc/params.def
===================================================================
--- gcc/params.def (revision 187728)
+++ gcc/params.def (working copy)
@@ -979,6 +979,13 @@ DEFPARAM (PARAM_MAX_TRACKED_STRLENS,
   "track string lengths",
   1000, 0, 0)
 
+/* For adjacent load hoisting transformation in tree-ssa-phiopt.c.  */
+DEFPARAM (PARAM_MIN_CMOVE_STRUCT_ALIGN,
+  "min-cmove-struct-align",
+  "Minimum byte alignment to assume for structures in the stack "
+  "or heap when considering load hoisting for conditional moves",
+  16, 8, 256)
+
 /*
 Local variables:
 mode:c



Re: [PATCH] Hoist adjacent pointer loads

by Richard Biener :: Rate this Message:

| View Threaded | Show Only this Message

On Tue, 22 May 2012, William J. Schmidt wrote:

> Here's a revision of the hoist-adjacent-loads patch.  Besides hopefully
> addressing all your comments, I added a gate of at least -O2 for this
> transformation.  Let me know if you prefer a different minimum opt
> level.
>
> I'm still running SPEC tests to make sure there are no regressions when
> opening this up to non-pointer arguments.  The code bootstraps on
> powerpc64-unknown-linux-gnu with no regressions.  Assuming the SPEC
> numbers come out as expected, is this ok?
>
> Thanks,
> Bill
>
>
> 2012-05-22  Bill Schmidt  <wschmidt@...>
>
> * tree-ssa-phiopt.c (tree_ssa_phiopt_worker): Add argument to forward
> declaration.
> (hoist_adjacent_loads, gate_hoist_loads): New forward declarations.
> (tree_ssa_phiopt): Call gate_hoist_loads.
> (tree_ssa_cs_elim): Add parm to tree_ssa_phiopt_worker call.
> (tree_ssa_phiopt_worker): Add do_hoist_loads to formal arg list; call
> hoist_adjacent_loads.
> (local_mem_dependence): New function.
> (hoist_adjacent_loads): Likewise.
> (gate_hoist_loads): Likewise.
> * common.opt (fhoist-adjacent-loads): New switch.
> * Makefile.in (tree-ssa-phiopt.o): Added dependencies.
> * params.def (PARAM_MIN_CMOVE_STRUCT_ALIGN): New param.
>
>
> Index: gcc/tree-ssa-phiopt.c
> ===================================================================
> --- gcc/tree-ssa-phiopt.c (revision 187728)
> +++ gcc/tree-ssa-phiopt.c (working copy)
> @@ -37,9 +37,17 @@ along with GCC; see the file COPYING3.  If not see
>  #include "cfgloop.h"
>  #include "tree-data-ref.h"
>  #include "tree-pretty-print.h"
> +#include "gimple-pretty-print.h"
> +#include "insn-config.h"
> +#include "expr.h"
> +#include "optabs.h"
>  
> +#ifndef HAVE_conditional_move
> +#define HAVE_conditional_move (0)
> +#endif
> +
>  static unsigned int tree_ssa_phiopt (void);
> -static unsigned int tree_ssa_phiopt_worker (bool);
> +static unsigned int tree_ssa_phiopt_worker (bool, bool);
>  static bool conditional_replacement (basic_block, basic_block,
>       edge, edge, gimple, tree, tree);
>  static int value_replacement (basic_block, basic_block,
> @@ -53,6 +61,9 @@ static bool cond_store_replacement (basic_block, b
>  static bool cond_if_else_store_replacement (basic_block, basic_block, basic_block);
>  static struct pointer_set_t * get_non_trapping (void);
>  static void replace_phi_edge_with_variable (basic_block, edge, gimple, tree);
> +static void hoist_adjacent_loads (basic_block, basic_block,
> +  basic_block, basic_block);
> +static bool gate_hoist_loads (void);
>  
>  /* This pass tries to replaces an if-then-else block with an
>     assignment.  We have four kinds of transformations.  Some of these
> @@ -138,12 +149,56 @@ static void replace_phi_edge_with_variable (basic_
>       bb2:
>         x = PHI <x' (bb0), ...>;
>  
> -   A similar transformation is done for MAX_EXPR.  */
> +   A similar transformation is done for MAX_EXPR.
>  
> +
> +   This pass also performs a fifth transformation of a slightly different
> +   flavor.
> +
> +   Adjacent Load Hoisting
> +   ----------------------
> +  
> +   This transformation replaces
> +
> +     bb0:
> +       if (...) goto bb2; else goto bb1;
> +     bb1:
> +       x1 = (<expr>).field1;
> +       goto bb3;
> +     bb2:
> +       x2 = (<expr>).field2;
> +     bb3:
> +       # x = PHI <x1, x2>;
> +
> +   with
> +
> +     bb0:
> +       x1 = (<expr>).field1;
> +       x2 = (<expr>).field2;
> +       if (...) goto bb2; else goto bb1;
> +     bb1:
> +       goto bb3;
> +     bb2:
> +     bb3:
> +       # x = PHI <x1, x2>;
> +
> +   The purpose of this transformation is to enable generation of conditional
> +   move instructions such as Intel CMOVE or PowerPC ISEL.  Because one of
> +   the loads is speculative, the transformation is restricted to very
> +   specific cases to avoid introducing a page fault.  We are looking for
> +   the common idiom:
> +
> +     if (...)
> +       x = y->left;
> +     else
> +       x = y->right;
> +
> +   where left and right are typically adjacent pointers in a tree structure.  */
> +
>  static unsigned int
>  tree_ssa_phiopt (void)
>  {
> -  return tree_ssa_phiopt_worker (false);
> +  return tree_ssa_phiopt_worker (false, gate_hoist_loads ());
>  }
>  
>  /* This pass tries to transform conditional stores into unconditional
> @@ -190,7 +245,7 @@ tree_ssa_phiopt (void)
>  static unsigned int
>  tree_ssa_cs_elim (void)
>  {
> -  return tree_ssa_phiopt_worker (true);
> +  return tree_ssa_phiopt_worker (true, false);
>  }
>  
>  /* Return the singleton PHI in the SEQ of PHIs for edges E0 and E1. */
> @@ -227,9 +282,11 @@ static tree condstoretemp;
>  /* The core routine of conditional store replacement and normal
>     phi optimizations.  Both share much of the infrastructure in how
>     to match applicable basic block patterns.  DO_STORE_ELIM is true
> -   when we want to do conditional store replacement, false otherwise.  */
> +   when we want to do conditional store replacement, false otherwise.
> +   DO_HOIST_LOADS is true when we want to hoist adjacent loads out
> +   of diamond control flow patterns, false otherwise.  */
>  static unsigned int
> -tree_ssa_phiopt_worker (bool do_store_elim)
> +tree_ssa_phiopt_worker (bool do_store_elim, bool do_hoist_loads)
>  {
>    basic_block bb;
>    basic_block *bb_order;
> @@ -312,6 +369,20 @@ static unsigned int
>      cfgchanged = true;
>    continue;
>   }
> +      else if (do_hoist_loads
> + && EDGE_SUCC (bb1, 0)->dest == EDGE_SUCC (bb2, 0)->dest)
> + {
> +  basic_block bb3 = EDGE_SUCC (bb1, 0)->dest;
> +
> +  if (single_succ_p (bb1)
> +      && single_succ_p (bb2)
> +      && single_pred_p (bb1)
> +      && single_pred_p (bb2)
> +      && EDGE_COUNT (bb->succs) == 2
> +      && EDGE_COUNT (bb3->preds) == 2)
> +    hoist_adjacent_loads (bb, bb1, bb2, bb3);
> +  continue;
> + }
>        else
>   continue;      
>  
> @@ -1707,6 +1778,206 @@ cond_if_else_store_replacement (basic_block then_b
>    return ok;
>  }
>  
> +/* Return TRUE if STMT has a VUSE whose corresponding VDEF is in BB.  */
> +
> +static bool
> +local_mem_dependence (gimple stmt, basic_block bb)
> +{
> +  tree vuse = gimple_vuse (stmt);
> +  gimple def;
> +
> +  if (!vuse)
> +    return false;
> +
> +  def = SSA_NAME_DEF_STMT (vuse);
> +  return (def && gimple_bb (def) == bb);
> +}
> +
> +/* Given a "diamond" control-flow pattern where BB0 tests a condition,
> +   BB1 and BB2 are "then" and "else" blocks dependent on this test,
> +   and BB3 rejoins control flow following BB1 and BB2, look for
> +   opportunities to hoist loads as follows.  If BB3 contains a PHI of
> +   two loads, one each occurring in BB1 and BB2, and the loads are
> +   provably of adjacent fields in the same structure, then move both
> +   loads into BB0.  Of course this can only be done if there are no
> +   dependencies preventing such motion.
> +
> +   One of the hoisted loads will always be speculative, so the
> +   transformation is currently conservative:
> +
> +    - The fields must be strictly adjacent.
> +    - The two fields must occupy a single memory block that is
> +      guaranteed to not cross a page boundary.
> +
> +    The last is difficult to prove, as such memory blocks should be
> +    aligned on the minimum of the stack alignment boundary and the
> +    alignment guaranteed by heap allocation interfaces.  Thus we rely
> +    on a parameter for the alignment value.
> +
> +    Provided a good value is used for the last case, the first
> +    restriction could possibly be relaxed.  */
> +
> +static void
> +hoist_adjacent_loads (basic_block bb0, basic_block bb1,
> +      basic_block bb2, basic_block bb3)
> +{
> +  int param_align = PARAM_VALUE (PARAM_MIN_CMOVE_STRUCT_ALIGN);
> +  gimple_stmt_iterator gsi;
> +
> +  /* Walk the phis in bb3 looking for an opportunity.  We are looking
> +     for phis of two SSA names, one each of which is defined in bb1 and
> +     bb2.  */
> +  for (gsi = gsi_start_phis (bb3); !gsi_end_p (gsi); gsi_next (&gsi))
> +    {
> +      gimple phi_stmt = gsi_stmt (gsi);
> +      gimple def1, def2, defswap;
> +      tree arg1, arg2, ref1, ref2, field1, field2, fieldswap;
> +      tree tree_offset1, tree_offset2, tree_size2, next;
> +      int offset1, offset2, size2, block_offset;
> +      unsigned align1;
> +      gimple_stmt_iterator gsi2;
> +
> +      if (gimple_phi_num_args (phi_stmt) != 2)
> + continue;
> +
> +      arg1 = gimple_phi_arg_def (phi_stmt, 0);
> +      arg2 = gimple_phi_arg_def (phi_stmt, 1);
> +      
> +      if (TREE_CODE (arg1) != SSA_NAME
> +  || TREE_CODE (arg2) != SSA_NAME
> +  || SSA_NAME_IS_DEFAULT_DEF (arg1)
> +  || SSA_NAME_IS_DEFAULT_DEF (arg2))
> + continue;
> +
> +      def1 = SSA_NAME_DEF_STMT (arg1);
> +      def2 = SSA_NAME_DEF_STMT (arg2);
> +
> +      if ((gimple_bb (def1) != bb1 || gimple_bb (def2) != bb2)
> +  && (gimple_bb (def2) != bb1 || gimple_bb (def1) != bb2))
> + continue;
> +
> +      /* Unless -fhoist-adjacent-loads was specified, check the mode
> + of the arguments to be sure a conditional move can be generated
> + for it.  */
> +      if (flag_hoist_adjacent_loads != 1
> +  && !optab_handler (cmov_optab, TYPE_MODE (TREE_TYPE (arg1))))
> + continue;
> +
> +      /* Both statements must be assignments whose RHS is a COMPONENT_REF.  */
> +      if (!gimple_assign_single_p (def1)
> +  || !gimple_assign_single_p (def2))
> + continue;
> +
> +      ref1 = gimple_assign_rhs1 (def1);
> +      ref2 = gimple_assign_rhs1 (def2);
> +
> +      if (TREE_CODE (ref1) != COMPONENT_REF
> +  || TREE_CODE (ref2) != COMPONENT_REF)
> + continue;
> +
> +      /* The zeroth operand of the two component references must be
> + identical.  It is not sufficient to compare get_base_address of
> + the two references, because this could allow for different
> + elements of the same array in the two trees.  It is not safe to
> + assume that the existence of one array element implies the
> + existence of a different one.  */
> +      if (!operand_equal_p (TREE_OPERAND (ref1, 0), TREE_OPERAND (ref2, 0), 0))
> + continue;
> +
> +      field1 = TREE_OPERAND (ref1, 1);
> +      field2 = TREE_OPERAND (ref2, 1);
> +
> +      /* Check for field adjacency, and ensure field1 comes first.  */
> +      for (next = DECL_CHAIN (field1);
> +   next && TREE_CODE (next) != FIELD_DECL;
> +   next = DECL_CHAIN (next))
> + ;
> +
> +      if (!next || !operand_equal_p (next, field2, 0))
You can compare FIELD_DECLS by pointer equality, thus next != field2 here.

> + {
> +  for (next = DECL_CHAIN (field2);
> +       next && TREE_CODE (next) != FIELD_DECL;
> +       next = DECL_CHAIN (next))
> +    ;
> +
> +  if (!next || !operand_equal_p (next, field1, 0))

Likewise.

> +    continue;
> +
> +  fieldswap = field1;
> +  field1 = field2;
> +  field2 = fieldswap;
> +  defswap = def1;
> +  def1 = def2;
> +  def2 = defswap;
> + }
> +
> +      /* Check for proper alignment of the first field.  */
> +      tree_offset1 = bit_position (field1);
> +      tree_offset2 = bit_position (field2);
> +      tree_size2 = DECL_SIZE_UNIT (field2);
> +
> +      if (TREE_CODE (tree_size2) != INTEGER_CST
host_integerp already checks for the TREE_CODE.

> +  || !host_integerp (tree_offset1, 0)
> +  || !host_integerp (tree_offset2, 0)
> +  || !host_integerp (tree_size2, 0))
> + continue;

And I think you want , 1 in all cases, positive offsets/sizes.

> +      offset1 = TREE_INT_CST_LOW (tree_offset1);
> +      offset2 = TREE_INT_CST_LOW (tree_offset2);
> +      size2 = TREE_INT_CST_LOW (tree_size2);
> +      align1 = DECL_ALIGN (field1);
> +
> +      if (offset1 % BITS_PER_UNIT != 0 || offset1 % align1 != 0)
> + continue;

Why offset1 % align1 != 0?  offset1 is relative to the containing
record start while DECL_ALIGN is relative to the ultimate containing
object.  Either it's redundant to check that or overly conservative.
I think what you want to check is that align1 is properly large
so you can guarantee that both fields are inside a single cache
line, thus, (offset2 - offset1 + size2) <= align1

> +      offset1 /= BITS_PER_UNIT;
> +      offset2 /= BITS_PER_UNIT;
> +
> +      /* The two field references must fit within a block of memory that
> + is guaranteed to be on the same page.  */
> +      block_offset = offset1 % param_align;

See above - offset1 has no interesting meaning.

> +
> +      if (block_offset + offset2 - offset1 + size2 > param_align)
> + continue;
> +      /* The two expressions cannot be dependent upon vdefs defined
> + in bb1/bb2.  */
> +      if (local_mem_dependence (def1, bb1) || local_mem_dependence (def2, bb2))
> + continue;
> +
> +      /* The conditions are satisfied; hoist the loads from bb1 and bb2 into
> + bb0.  We hoist the first one first so that a cache miss is handled
> +         efficiently regardless of hardware cache-fill policy.  */
> +      gsi2 = gsi_for_stmt (def1);
> +      gsi_move_to_bb_end (&gsi2, bb0);
> +      gsi2 = gsi_for_stmt (def2);
> +      gsi_move_to_bb_end (&gsi2, bb0);
> +
> +      if (dump_file && (dump_flags & TDF_DETAILS))
> + {
> +  fprintf (dump_file,
> +   "\nHoisting adjacent loads from %d and %d into %d: \n",
> +   bb1->index, bb2->index, bb0->index);
> +  print_gimple_stmt (dump_file, def1, 0, TDF_VOPS|TDF_MEMSYMS);
> +  print_gimple_stmt (dump_file, def2, 0, TDF_VOPS|TDF_MEMSYMS);
> + }
> +    }
> +}
> +
> +/* Determine whether we should attempt to hoist adjacent loads out of
> +   diamond patterns in pass_phiopt.  Always hoist loads if
> +   -fhoist-adjacent-loads is specified, or if the target machine has
> +   a conditional move instruction and -fno-hoist-adjacent-loads is
> +   not specified.  */
> +
> +static bool
> +gate_hoist_loads (void)
> +{
> +  return ((optimize > 1)
> +  && (flag_hoist_adjacent_loads == 1
> +      || (HAVE_conditional_move && flag_hoist_adjacent_loads != 0)));
Rather than check optimize > 1 here properly initialize it with 1 and
put it into the default_options_table.

Thanks,
Richard.

> +}
> +
>  /* Always do these optimizations if we have SSA
>     trees to work on.  */
>  static bool
> Index: gcc/common.opt
> ===================================================================
> --- gcc/common.opt (revision 187728)
> +++ gcc/common.opt (working copy)
> @@ -1186,6 +1186,11 @@ fgraphite-identity
>  Common Report Var(flag_graphite_identity) Optimization
>  Enable Graphite Identity transformation
>  
> +fhoist-adjacent-loads
> +Common Report Var(flag_hoist_adjacent_loads) Init(-1) Optimization
> +Enable hoisting adjacent loads to encourage generating conditional move
> +instructions
> +
>  floop-parallelize-all
>  Common Report Var(flag_loop_parallelize_all) Optimization
>  Mark all loops as parallel
> Index: gcc/Makefile.in
> ===================================================================
> --- gcc/Makefile.in (revision 187728)
> +++ gcc/Makefile.in (working copy)
> @@ -2355,7 +2355,8 @@ tree-ssa-phiopt.o : tree-ssa-phiopt.c $(CONFIG_H)
>     $(TM_H) $(GGC_H) $(TREE_H) $(TM_P_H) $(BASIC_BLOCK_H) \
>     $(TREE_FLOW_H) $(TREE_PASS_H) $(TREE_DUMP_H) langhooks.h $(FLAGS_H) \
>     $(DIAGNOSTIC_H) $(TIMEVAR_H) pointer-set.h domwalk.h $(CFGLOOP_H) \
> -   $(TREE_DATA_REF_H) tree-pretty-print.h
> +   $(TREE_DATA_REF_H) tree-pretty-print.h gimple-pretty-print.h \
> +   insn-config.h $(EXPR_H) $(OPTABS_H)
>  tree-nrv.o : tree-nrv.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \
>     $(TM_H) $(TREE_H) $(FUNCTION_H) $(BASIC_BLOCK_H) $(FLAGS_H) \
>     $(DIAGNOSTIC_H) $(TREE_FLOW_H) $(TIMEVAR_H) $(TREE_DUMP_H) $(TREE_PASS_H) \
> Index: gcc/params.def
> ===================================================================
> --- gcc/params.def (revision 187728)
> +++ gcc/params.def (working copy)
> @@ -979,6 +979,13 @@ DEFPARAM (PARAM_MAX_TRACKED_STRLENS,
>    "track string lengths",
>    1000, 0, 0)
>  
> +/* For adjacent load hoisting transformation in tree-ssa-phiopt.c.  */
> +DEFPARAM (PARAM_MIN_CMOVE_STRUCT_ALIGN,
> +  "min-cmove-struct-align",
> +  "Minimum byte alignment to assume for structures in the stack "
> +  "or heap when considering load hoisting for conditional moves",
> +  16, 8, 256)
> +
>  /*
>  Local variables:
>  mode:c
>
>
>
--
Richard Guenther <rguenther@...>
SUSE / SUSE Labs
SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746
GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer

Re: [PATCH] Hoist adjacent pointer loads

by William J. Schmidt :: Rate this Message:

| View Threaded | Show Only this Message

On Wed, 2012-05-23 at 13:25 +0200, Richard Guenther wrote:

> On Tue, 22 May 2012, William J. Schmidt wrote:
>
> > Here's a revision of the hoist-adjacent-loads patch.  Besides hopefully
> > addressing all your comments, I added a gate of at least -O2 for this
> > transformation.  Let me know if you prefer a different minimum opt
> > level.
> >
> > I'm still running SPEC tests to make sure there are no regressions when
> > opening this up to non-pointer arguments.  The code bootstraps on
> > powerpc64-unknown-linux-gnu with no regressions.  Assuming the SPEC
> > numbers come out as expected, is this ok?
> >
> > Thanks,
> > Bill
> >
> >
> > 2012-05-22  Bill Schmidt  <wschmidt@...>
> >
> > * tree-ssa-phiopt.c (tree_ssa_phiopt_worker): Add argument to forward
> > declaration.
> > (hoist_adjacent_loads, gate_hoist_loads): New forward declarations.
> > (tree_ssa_phiopt): Call gate_hoist_loads.
> > (tree_ssa_cs_elim): Add parm to tree_ssa_phiopt_worker call.
> > (tree_ssa_phiopt_worker): Add do_hoist_loads to formal arg list; call
> > hoist_adjacent_loads.
> > (local_mem_dependence): New function.
> > (hoist_adjacent_loads): Likewise.
> > (gate_hoist_loads): Likewise.
> > * common.opt (fhoist-adjacent-loads): New switch.
> > * Makefile.in (tree-ssa-phiopt.o): Added dependencies.
> > * params.def (PARAM_MIN_CMOVE_STRUCT_ALIGN): New param.
> >
> >
> > Index: gcc/tree-ssa-phiopt.c
> > ===================================================================
> > --- gcc/tree-ssa-phiopt.c (revision 187728)
> > +++ gcc/tree-ssa-phiopt.c (working copy)
> > @@ -37,9 +37,17 @@ along with GCC; see the file COPYING3.  If not see
> >  #include "cfgloop.h"
> >  #include "tree-data-ref.h"
> >  #include "tree-pretty-print.h"
> > +#include "gimple-pretty-print.h"
> > +#include "insn-config.h"
> > +#include "expr.h"
> > +#include "optabs.h"
> >  
> > +#ifndef HAVE_conditional_move
> > +#define HAVE_conditional_move (0)
> > +#endif
> > +
> >  static unsigned int tree_ssa_phiopt (void);
> > -static unsigned int tree_ssa_phiopt_worker (bool);
> > +static unsigned int tree_ssa_phiopt_worker (bool, bool);
> >  static bool conditional_replacement (basic_block, basic_block,
> >       edge, edge, gimple, tree, tree);
> >  static int value_replacement (basic_block, basic_block,
> > @@ -53,6 +61,9 @@ static bool cond_store_replacement (basic_block, b
> >  static bool cond_if_else_store_replacement (basic_block, basic_block, basic_block);
> >  static struct pointer_set_t * get_non_trapping (void);
> >  static void replace_phi_edge_with_variable (basic_block, edge, gimple, tree);
> > +static void hoist_adjacent_loads (basic_block, basic_block,
> > +  basic_block, basic_block);
> > +static bool gate_hoist_loads (void);
> >  
> >  /* This pass tries to replaces an if-then-else block with an
> >     assignment.  We have four kinds of transformations.  Some of these
> > @@ -138,12 +149,56 @@ static void replace_phi_edge_with_variable (basic_
> >       bb2:
> >         x = PHI <x' (bb0), ...>;
> >  
> > -   A similar transformation is done for MAX_EXPR.  */
> > +   A similar transformation is done for MAX_EXPR.
> >  
> > +
> > +   This pass also performs a fifth transformation of a slightly different
> > +   flavor.
> > +
> > +   Adjacent Load Hoisting
> > +   ----------------------
> > +  
> > +   This transformation replaces
> > +
> > +     bb0:
> > +       if (...) goto bb2; else goto bb1;
> > +     bb1:
> > +       x1 = (<expr>).field1;
> > +       goto bb3;
> > +     bb2:
> > +       x2 = (<expr>).field2;
> > +     bb3:
> > +       # x = PHI <x1, x2>;
> > +
> > +   with
> > +
> > +     bb0:
> > +       x1 = (<expr>).field1;
> > +       x2 = (<expr>).field2;
> > +       if (...) goto bb2; else goto bb1;
> > +     bb1:
> > +       goto bb3;
> > +     bb2:
> > +     bb3:
> > +       # x = PHI <x1, x2>;
> > +
> > +   The purpose of this transformation is to enable generation of conditional
> > +   move instructions such as Intel CMOVE or PowerPC ISEL.  Because one of
> > +   the loads is speculative, the transformation is restricted to very
> > +   specific cases to avoid introducing a page fault.  We are looking for
> > +   the common idiom:
> > +
> > +     if (...)
> > +       x = y->left;
> > +     else
> > +       x = y->right;
> > +
> > +   where left and right are typically adjacent pointers in a tree structure.  */
> > +
> >  static unsigned int
> >  tree_ssa_phiopt (void)
> >  {
> > -  return tree_ssa_phiopt_worker (false);
> > +  return tree_ssa_phiopt_worker (false, gate_hoist_loads ());
> >  }
> >  
> >  /* This pass tries to transform conditional stores into unconditional
> > @@ -190,7 +245,7 @@ tree_ssa_phiopt (void)
> >  static unsigned int
> >  tree_ssa_cs_elim (void)
> >  {
> > -  return tree_ssa_phiopt_worker (true);
> > +  return tree_ssa_phiopt_worker (true, false);
> >  }
> >  
> >  /* Return the singleton PHI in the SEQ of PHIs for edges E0 and E1. */
> > @@ -227,9 +282,11 @@ static tree condstoretemp;
> >  /* The core routine of conditional store replacement and normal
> >     phi optimizations.  Both share much of the infrastructure in how
> >     to match applicable basic block patterns.  DO_STORE_ELIM is true
> > -   when we want to do conditional store replacement, false otherwise.  */
> > +   when we want to do conditional store replacement, false otherwise.
> > +   DO_HOIST_LOADS is true when we want to hoist adjacent loads out
> > +   of diamond control flow patterns, false otherwise.  */
> >  static unsigned int
> > -tree_ssa_phiopt_worker (bool do_store_elim)
> > +tree_ssa_phiopt_worker (bool do_store_elim, bool do_hoist_loads)
> >  {
> >    basic_block bb;
> >    basic_block *bb_order;
> > @@ -312,6 +369,20 @@ static unsigned int
> >      cfgchanged = true;
> >    continue;
> >   }
> > +      else if (do_hoist_loads
> > + && EDGE_SUCC (bb1, 0)->dest == EDGE_SUCC (bb2, 0)->dest)
> > + {
> > +  basic_block bb3 = EDGE_SUCC (bb1, 0)->dest;
> > +
> > +  if (single_succ_p (bb1)
> > +      && single_succ_p (bb2)
> > +      && single_pred_p (bb1)
> > +      && single_pred_p (bb2)
> > +      && EDGE_COUNT (bb->succs) == 2
> > +      && EDGE_COUNT (bb3->preds) == 2)
> > +    hoist_adjacent_loads (bb, bb1, bb2, bb3);
> > +  continue;
> > + }
> >        else
> >   continue;      
> >  
> > @@ -1707,6 +1778,206 @@ cond_if_else_store_replacement (basic_block then_b
> >    return ok;
> >  }
> >  
> > +/* Return TRUE if STMT has a VUSE whose corresponding VDEF is in BB.  */
> > +
> > +static bool
> > +local_mem_dependence (gimple stmt, basic_block bb)
> > +{
> > +  tree vuse = gimple_vuse (stmt);
> > +  gimple def;
> > +
> > +  if (!vuse)
> > +    return false;
> > +
> > +  def = SSA_NAME_DEF_STMT (vuse);
> > +  return (def && gimple_bb (def) == bb);
> > +}
> > +
> > +/* Given a "diamond" control-flow pattern where BB0 tests a condition,
> > +   BB1 and BB2 are "then" and "else" blocks dependent on this test,
> > +   and BB3 rejoins control flow following BB1 and BB2, look for
> > +   opportunities to hoist loads as follows.  If BB3 contains a PHI of
> > +   two loads, one each occurring in BB1 and BB2, and the loads are
> > +   provably of adjacent fields in the same structure, then move both
> > +   loads into BB0.  Of course this can only be done if there are no
> > +   dependencies preventing such motion.
> > +
> > +   One of the hoisted loads will always be speculative, so the
> > +   transformation is currently conservative:
> > +
> > +    - The fields must be strictly adjacent.
> > +    - The two fields must occupy a single memory block that is
> > +      guaranteed to not cross a page boundary.
> > +
> > +    The last is difficult to prove, as such memory blocks should be
> > +    aligned on the minimum of the stack alignment boundary and the
> > +    alignment guaranteed by heap allocation interfaces.  Thus we rely
> > +    on a parameter for the alignment value.
> > +
> > +    Provided a good value is used for the last case, the first
> > +    restriction could possibly be relaxed.  */
> > +
> > +static void
> > +hoist_adjacent_loads (basic_block bb0, basic_block bb1,
> > +      basic_block bb2, basic_block bb3)
> > +{
> > +  int param_align = PARAM_VALUE (PARAM_MIN_CMOVE_STRUCT_ALIGN);
> > +  gimple_stmt_iterator gsi;
> > +
> > +  /* Walk the phis in bb3 looking for an opportunity.  We are looking
> > +     for phis of two SSA names, one each of which is defined in bb1 and
> > +     bb2.  */
> > +  for (gsi = gsi_start_phis (bb3); !gsi_end_p (gsi); gsi_next (&gsi))
> > +    {
> > +      gimple phi_stmt = gsi_stmt (gsi);
> > +      gimple def1, def2, defswap;
> > +      tree arg1, arg2, ref1, ref2, field1, field2, fieldswap;
> > +      tree tree_offset1, tree_offset2, tree_size2, next;
> > +      int offset1, offset2, size2, block_offset;
> > +      unsigned align1;
> > +      gimple_stmt_iterator gsi2;
> > +
> > +      if (gimple_phi_num_args (phi_stmt) != 2)
> > + continue;
> > +
> > +      arg1 = gimple_phi_arg_def (phi_stmt, 0);
> > +      arg2 = gimple_phi_arg_def (phi_stmt, 1);
> > +      
> > +      if (TREE_CODE (arg1) != SSA_NAME
> > +  || TREE_CODE (arg2) != SSA_NAME
> > +  || SSA_NAME_IS_DEFAULT_DEF (arg1)
> > +  || SSA_NAME_IS_DEFAULT_DEF (arg2))
> > + continue;
> > +
> > +      def1 = SSA_NAME_DEF_STMT (arg1);
> > +      def2 = SSA_NAME_DEF_STMT (arg2);
> > +
> > +      if ((gimple_bb (def1) != bb1 || gimple_bb (def2) != bb2)
> > +  && (gimple_bb (def2) != bb1 || gimple_bb (def1) != bb2))
> > + continue;
> > +
> > +      /* Unless -fhoist-adjacent-loads was specified, check the mode
> > + of the arguments to be sure a conditional move can be generated
> > + for it.  */
> > +      if (flag_hoist_adjacent_loads != 1
> > +  && !optab_handler (cmov_optab, TYPE_MODE (TREE_TYPE (arg1))))
> > + continue;
> > +
> > +      /* Both statements must be assignments whose RHS is a COMPONENT_REF.  */
> > +      if (!gimple_assign_single_p (def1)
> > +  || !gimple_assign_single_p (def2))
> > + continue;
> > +
> > +      ref1 = gimple_assign_rhs1 (def1);
> > +      ref2 = gimple_assign_rhs1 (def2);
> > +
> > +      if (TREE_CODE (ref1) != COMPONENT_REF
> > +  || TREE_CODE (ref2) != COMPONENT_REF)
> > + continue;
> > +
> > +      /* The zeroth operand of the two component references must be
> > + identical.  It is not sufficient to compare get_base_address of
> > + the two references, because this could allow for different
> > + elements of the same array in the two trees.  It is not safe to
> > + assume that the existence of one array element implies the
> > + existence of a different one.  */
> > +      if (!operand_equal_p (TREE_OPERAND (ref1, 0), TREE_OPERAND (ref2, 0), 0))
> > + continue;
> > +
> > +      field1 = TREE_OPERAND (ref1, 1);
> > +      field2 = TREE_OPERAND (ref2, 1);
> > +
> > +      /* Check for field adjacency, and ensure field1 comes first.  */
> > +      for (next = DECL_CHAIN (field1);
> > +   next && TREE_CODE (next) != FIELD_DECL;
> > +   next = DECL_CHAIN (next))
> > + ;
> > +
> > +      if (!next || !operand_equal_p (next, field2, 0))
>
> You can compare FIELD_DECLS by pointer equality, thus next != field2 here.
>
> > + {
> > +  for (next = DECL_CHAIN (field2);
> > +       next && TREE_CODE (next) != FIELD_DECL;
> > +       next = DECL_CHAIN (next))
> > +    ;
> > +
> > +  if (!next || !operand_equal_p (next, field1, 0))
>
> Likewise.
>
> > +    continue;
> > +
> > +  fieldswap = field1;
> > +  field1 = field2;
> > +  field2 = fieldswap;
> > +  defswap = def1;
> > +  def1 = def2;
> > +  def2 = defswap;
> > + }
> > +
> > +      /* Check for proper alignment of the first field.  */
> > +      tree_offset1 = bit_position (field1);
> > +      tree_offset2 = bit_position (field2);
> > +      tree_size2 = DECL_SIZE_UNIT (field2);
> > +
> > +      if (TREE_CODE (tree_size2) != INTEGER_CST
>
> host_integerp already checks for the TREE_CODE.
>
> > +  || !host_integerp (tree_offset1, 0)
> > +  || !host_integerp (tree_offset2, 0)
> > +  || !host_integerp (tree_size2, 0))
> > + continue;
>
> And I think you want , 1 in all cases, positive offsets/sizes.
>
> > +      offset1 = TREE_INT_CST_LOW (tree_offset1);
> > +      offset2 = TREE_INT_CST_LOW (tree_offset2);
> > +      size2 = TREE_INT_CST_LOW (tree_size2);
> > +      align1 = DECL_ALIGN (field1);
> > +
> > +      if (offset1 % BITS_PER_UNIT != 0 || offset1 % align1 != 0)
> > + continue;
>
> Why offset1 % align1 != 0?  offset1 is relative to the containing
> record start while DECL_ALIGN is relative to the ultimate containing
> object.  Either it's redundant to check that or overly conservative.
> I think what you want to check is that align1 is properly large
> so you can guarantee that both fields are inside a single cache
> line, thus, (offset2 - offset1 + size2) <= align1

Hm, I think really what we want is to just look within the block size
established by param_align...something like:

      offset1 = TREE_INT_CST_LOW (tree_offset1);
      offset2 = TREE_INT_CST_LOW (tree_offset2);
      size2 = TREE_INT_CST_LOW (tree_size2);
      align1 = DECL_ALIGN (field1) % (param_align * BITS_PER_UNIT);

      if (offset1 % BITS_PER_UNIT != 0)
        continue;

      /* The two field references must fit within a block of memory that
         is guaranteed to be on the same page.  */
      if (align1 + offset2 - offset1 + size2 > param_align * BITS_PER_UNIT)
        continue;

I.e., use the DECL_ALIGN to find where within the block we are probably
starting, then make sure the extent of the two objects doesn't run past
the end of the block.  Sound right?

>
> > +      offset1 /= BITS_PER_UNIT;
> > +      offset2 /= BITS_PER_UNIT;
> > +
> > +      /* The two field references must fit within a block of memory that
> > + is guaranteed to be on the same page.  */
> > +      block_offset = offset1 % param_align;
>
> See above - offset1 has no interesting meaning.
>
> > +
> > +      if (block_offset + offset2 - offset1 + size2 > param_align)
> > + continue;
> > +      /* The two expressions cannot be dependent upon vdefs defined
> > + in bb1/bb2.  */
> > +      if (local_mem_dependence (def1, bb1) || local_mem_dependence (def2, bb2))
> > + continue;
> > +
> > +      /* The conditions are satisfied; hoist the loads from bb1 and bb2 into
> > + bb0.  We hoist the first one first so that a cache miss is handled
> > +         efficiently regardless of hardware cache-fill policy.  */
> > +      gsi2 = gsi_for_stmt (def1);
> > +      gsi_move_to_bb_end (&gsi2, bb0);
> > +      gsi2 = gsi_for_stmt (def2);
> > +      gsi_move_to_bb_end (&gsi2, bb0);
> > +
> > +      if (dump_file && (dump_flags & TDF_DETAILS))
> > + {
> > +  fprintf (dump_file,
> > +   "\nHoisting adjacent loads from %d and %d into %d: \n",
> > +   bb1->index, bb2->index, bb0->index);
> > +  print_gimple_stmt (dump_file, def1, 0, TDF_VOPS|TDF_MEMSYMS);
> > +  print_gimple_stmt (dump_file, def2, 0, TDF_VOPS|TDF_MEMSYMS);
> > + }
> > +    }
> > +}
> > +
> > +/* Determine whether we should attempt to hoist adjacent loads out of
> > +   diamond patterns in pass_phiopt.  Always hoist loads if
> > +   -fhoist-adjacent-loads is specified, or if the target machine has
> > +   a conditional move instruction and -fno-hoist-adjacent-loads is
> > +   not specified.  */
> > +
> > +static bool
> > +gate_hoist_loads (void)
> > +{
> > +  return ((optimize > 1)
> > +  && (flag_hoist_adjacent_loads == 1
> > +      || (HAVE_conditional_move && flag_hoist_adjacent_loads != 0)));
>
> Rather than check optimize > 1 here properly initialize it with 1 and
> put it into the default_options_table.
>
> Thanks,
> Richard.
>
> > +}
> > +
> >  /* Always do these optimizations if we have SSA
> >     trees to work on.  */
> >  static bool
> > Index: gcc/common.opt
> > ===================================================================
> > --- gcc/common.opt (revision 187728)
> > +++ gcc/common.opt (working copy)
> > @@ -1186,6 +1186,11 @@ fgraphite-identity
> >  Common Report Var(flag_graphite_identity) Optimization
> >  Enable Graphite Identity transformation
> >  
> > +fhoist-adjacent-loads
> > +Common Report Var(flag_hoist_adjacent_loads) Init(-1) Optimization
> > +Enable hoisting adjacent loads to encourage generating conditional move
> > +instructions
> > +
> >  floop-parallelize-all
> >  Common Report Var(flag_loop_parallelize_all) Optimization
> >  Mark all loops as parallel
> > Index: gcc/Makefile.in
> > ===================================================================
> > --- gcc/Makefile.in (revision 187728)
> > +++ gcc/Makefile.in (working copy)
> > @@ -2355,7 +2355,8 @@ tree-ssa-phiopt.o : tree-ssa-phiopt.c $(CONFIG_H)
> >     $(TM_H) $(GGC_H) $(TREE_H) $(TM_P_H) $(BASIC_BLOCK_H) \
> >     $(TREE_FLOW_H) $(TREE_PASS_H) $(TREE_DUMP_H) langhooks.h $(FLAGS_H) \
> >     $(DIAGNOSTIC_H) $(TIMEVAR_H) pointer-set.h domwalk.h $(CFGLOOP_H) \
> > -   $(TREE_DATA_REF_H) tree-pretty-print.h
> > +   $(TREE_DATA_REF_H) tree-pretty-print.h gimple-pretty-print.h \
> > +   insn-config.h $(EXPR_H) $(OPTABS_H)
> >  tree-nrv.o : tree-nrv.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \
> >     $(TM_H) $(TREE_H) $(FUNCTION_H) $(BASIC_BLOCK_H) $(FLAGS_H) \
> >     $(DIAGNOSTIC_H) $(TREE_FLOW_H) $(TIMEVAR_H) $(TREE_DUMP_H) $(TREE_PASS_H) \
> > Index: gcc/params.def
> > ===================================================================
> > --- gcc/params.def (revision 187728)
> > +++ gcc/params.def (working copy)
> > @@ -979,6 +979,13 @@ DEFPARAM (PARAM_MAX_TRACKED_STRLENS,
> >    "track string lengths",
> >    1000, 0, 0)
> >  
> > +/* For adjacent load hoisting transformation in tree-ssa-phiopt.c.  */
> > +DEFPARAM (PARAM_MIN_CMOVE_STRUCT_ALIGN,
> > +  "min-cmove-struct-align",
> > +  "Minimum byte alignment to assume for structures in the stack "
> > +  "or heap when considering load hoisting for conditional moves",
> > +  16, 8, 256)
> > +
> >  /*
> >  Local variables:
> >  mode:c
> >
> >
> >
>


Re: [PATCH] Hoist adjacent pointer loads

by William J. Schmidt :: Rate this Message:

| View Threaded | Show Only this Message

Hi Richard,

Here's a revision of the hoist-adjacent-loads patch.  I'm sorry for the
delay since the last revision, but my performance testing has been
blocked waiting for a fix to PR53487.  I ended up applying a test
version of the patch to 4.7 and ran performance numbers with that
instead, with no degradations.

In addition to addressing your comments, this patch contains one bug fix
where local_mem_dependence was called on the wrong blocks after swapping
def1 and def2.

Bootstrapped with no regressions on powerpc64-unknown-linux-gnu.  Is
this version ok for trunk?  I won't commit it until I can do final
testing on trunk in conjunction with a fix for PR53487.

Thanks,
Bill


2012-06-04  Bill Schmidt  <wschmidt@...>

        * opts.c: Add -fhoist_adjacent_loads to -O2 and above.
        * tree-ssa-phiopt.c (tree_ssa_phiopt_worker): Add argument to forward
        declaration.
        (hoist_adjacent_loads, gate_hoist_loads): New forward declarations.
        (tree_ssa_phiopt): Call gate_hoist_loads.
        (tree_ssa_cs_elim): Add parm to tree_ssa_phiopt_worker call.
        (tree_ssa_phiopt_worker): Add do_hoist_loads to formal arg list; call
        hoist_adjacent_loads.
        (local_mem_dependence): New function.
        (hoist_adjacent_loads): Likewise.
        (gate_hoist_loads): Likewise.
        * common.opt (fhoist-adjacent-loads): New switch.
        * Makefile.in (tree-ssa-phiopt.o): Added dependencies.
        * params.def (PARAM_MIN_CMOVE_STRUCT_ALIGN): New param.


Index: gcc/opts.c
===================================================================
--- gcc/opts.c (revision 187805)
+++ gcc/opts.c (working copy)
@@ -489,6 +489,7 @@ static const struct default_options default_option
     { OPT_LEVELS_2_PLUS, OPT_falign_functions, NULL, 1 },
     { OPT_LEVELS_2_PLUS, OPT_ftree_tail_merge, NULL, 1 },
     { OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_foptimize_strlen, NULL, 1 },
+    { OPT_LEVELS_2_PLUS, OPT_fhoist_adjacent_loads, NULL, 1 },
 
     /* -O3 optimizations.  */
     { OPT_LEVELS_3_PLUS, OPT_ftree_loop_distribute_patterns, NULL, 1 },
Index: gcc/tree-ssa-phiopt.c
===================================================================
--- gcc/tree-ssa-phiopt.c (revision 187805)
+++ gcc/tree-ssa-phiopt.c (working copy)
@@ -37,9 +37,17 @@ along with GCC; see the file COPYING3.  If not see
 #include "cfgloop.h"
 #include "tree-data-ref.h"
 #include "tree-pretty-print.h"
+#include "gimple-pretty-print.h"
+#include "insn-config.h"
+#include "expr.h"
+#include "optabs.h"
 
+#ifndef HAVE_conditional_move
+#define HAVE_conditional_move (0)
+#endif
+
 static unsigned int tree_ssa_phiopt (void);
-static unsigned int tree_ssa_phiopt_worker (bool);
+static unsigned int tree_ssa_phiopt_worker (bool, bool);
 static bool conditional_replacement (basic_block, basic_block,
      edge, edge, gimple, tree, tree);
 static int value_replacement (basic_block, basic_block,
@@ -53,6 +61,9 @@ static bool cond_store_replacement (basic_block, b
 static bool cond_if_else_store_replacement (basic_block, basic_block, basic_block);
 static struct pointer_set_t * get_non_trapping (void);
 static void replace_phi_edge_with_variable (basic_block, edge, gimple, tree);
+static void hoist_adjacent_loads (basic_block, basic_block,
+  basic_block, basic_block);
+static bool gate_hoist_loads (void);
 
 /* This pass tries to replaces an if-then-else block with an
    assignment.  We have four kinds of transformations.  Some of these
@@ -138,12 +149,56 @@ static void replace_phi_edge_with_variable (basic_
      bb2:
        x = PHI <x' (bb0), ...>;
 
-   A similar transformation is done for MAX_EXPR.  */
+   A similar transformation is done for MAX_EXPR.
 
+
+   This pass also performs a fifth transformation of a slightly different
+   flavor.
+
+   Adjacent Load Hoisting
+   ----------------------
+  
+   This transformation replaces
+
+     bb0:
+       if (...) goto bb2; else goto bb1;
+     bb1:
+       x1 = (<expr>).field1;
+       goto bb3;
+     bb2:
+       x2 = (<expr>).field2;
+     bb3:
+       # x = PHI <x1, x2>;
+
+   with
+
+     bb0:
+       x1 = (<expr>).field1;
+       x2 = (<expr>).field2;
+       if (...) goto bb2; else goto bb1;
+     bb1:
+       goto bb3;
+     bb2:
+     bb3:
+       # x = PHI <x1, x2>;
+
+   The purpose of this transformation is to enable generation of conditional
+   move instructions such as Intel CMOVE or PowerPC ISEL.  Because one of
+   the loads is speculative, the transformation is restricted to very
+   specific cases to avoid introducing a page fault.  We are looking for
+   the common idiom:
+
+     if (...)
+       x = y->left;
+     else
+       x = y->right;
+
+   where left and right are typically adjacent pointers in a tree structure.  */
+
 static unsigned int
 tree_ssa_phiopt (void)
 {
-  return tree_ssa_phiopt_worker (false);
+  return tree_ssa_phiopt_worker (false, gate_hoist_loads ());
 }
 
 /* This pass tries to transform conditional stores into unconditional
@@ -190,7 +245,7 @@ tree_ssa_phiopt (void)
 static unsigned int
 tree_ssa_cs_elim (void)
 {
-  return tree_ssa_phiopt_worker (true);
+  return tree_ssa_phiopt_worker (true, false);
 }
 
 /* Return the singleton PHI in the SEQ of PHIs for edges E0 and E1. */
@@ -227,9 +282,11 @@ static tree condstoretemp;
 /* The core routine of conditional store replacement and normal
    phi optimizations.  Both share much of the infrastructure in how
    to match applicable basic block patterns.  DO_STORE_ELIM is true
-   when we want to do conditional store replacement, false otherwise.  */
+   when we want to do conditional store replacement, false otherwise.
+   DO_HOIST_LOADS is true when we want to hoist adjacent loads out
+   of diamond control flow patterns, false otherwise.  */
 static unsigned int
-tree_ssa_phiopt_worker (bool do_store_elim)
+tree_ssa_phiopt_worker (bool do_store_elim, bool do_hoist_loads)
 {
   basic_block bb;
   basic_block *bb_order;
@@ -312,6 +369,21 @@ static unsigned int
     cfgchanged = true;
   continue;
  }
+      else if (do_hoist_loads
+ && EDGE_SUCC (bb1, 0)->dest == EDGE_SUCC (bb2, 0)->dest)
+ {
+  basic_block bb3 = EDGE_SUCC (bb1, 0)->dest;
+
+  if (!FLOAT_TYPE_P (TREE_TYPE (gimple_cond_lhs (cond_stmt)))
+      && single_succ_p (bb1)
+      && single_succ_p (bb2)
+      && single_pred_p (bb1)
+      && single_pred_p (bb2)
+      && EDGE_COUNT (bb->succs) == 2
+      && EDGE_COUNT (bb3->preds) == 2)
+    hoist_adjacent_loads (bb, bb1, bb2, bb3);
+  continue;
+ }
       else
  continue;      
 
@@ -1707,6 +1779,207 @@ cond_if_else_store_replacement (basic_block then_b
   return ok;
 }
 
+/* Return TRUE if STMT has a VUSE whose corresponding VDEF is in BB.  */
+
+static bool
+local_mem_dependence (gimple stmt, basic_block bb)
+{
+  tree vuse = gimple_vuse (stmt);
+  gimple def;
+
+  if (!vuse)
+    return false;
+
+  def = SSA_NAME_DEF_STMT (vuse);
+  return (def && gimple_bb (def) == bb);
+}
+
+/* Given a "diamond" control-flow pattern where BB0 tests a condition,
+   BB1 and BB2 are "then" and "else" blocks dependent on this test,
+   and BB3 rejoins control flow following BB1 and BB2, look for
+   opportunities to hoist loads as follows.  If BB3 contains a PHI of
+   two loads, one each occurring in BB1 and BB2, and the loads are
+   provably of adjacent fields in the same structure, then move both
+   loads into BB0.  Of course this can only be done if there are no
+   dependencies preventing such motion.
+
+   One of the hoisted loads will always be speculative, so the
+   transformation is currently conservative:
+
+    - The fields must be strictly adjacent.
+    - The two fields must occupy a single memory block that is
+      guaranteed to not cross a page boundary.
+
+    The last is difficult to prove, as such memory blocks should be
+    aligned on the minimum of the stack alignment boundary and the
+    alignment guaranteed by heap allocation interfaces.  Thus we rely
+    on a parameter for the alignment value.
+
+    Provided a good value is used for the last case, the first
+    restriction could possibly be relaxed.  */
+
+static void
+hoist_adjacent_loads (basic_block bb0, basic_block bb1,
+      basic_block bb2, basic_block bb3)
+{
+  int param_align = PARAM_VALUE (PARAM_MIN_CMOVE_STRUCT_ALIGN);
+  unsigned param_align_bits = (unsigned) (param_align * BITS_PER_UNIT);
+  gimple_stmt_iterator gsi;
+
+  /* Walk the phis in bb3 looking for an opportunity.  We are looking
+     for phis of two SSA names, one each of which is defined in bb1 and
+     bb2.  */
+  for (gsi = gsi_start_phis (bb3); !gsi_end_p (gsi); gsi_next (&gsi))
+    {
+      gimple phi_stmt = gsi_stmt (gsi);
+      gimple def1, def2, defswap;
+      tree arg1, arg2, ref1, ref2, field1, field2, fieldswap;
+      tree tree_offset1, tree_offset2, tree_size2, next;
+      int offset1, offset2, size2;
+      unsigned align1;
+      gimple_stmt_iterator gsi2;
+      basic_block bb_for_def1, bb_for_def2;
+
+      if (gimple_phi_num_args (phi_stmt) != 2)
+ continue;
+
+      arg1 = gimple_phi_arg_def (phi_stmt, 0);
+      arg2 = gimple_phi_arg_def (phi_stmt, 1);
+      
+      if (TREE_CODE (arg1) != SSA_NAME
+  || TREE_CODE (arg2) != SSA_NAME
+  || SSA_NAME_IS_DEFAULT_DEF (arg1)
+  || SSA_NAME_IS_DEFAULT_DEF (arg2))
+ continue;
+
+      def1 = SSA_NAME_DEF_STMT (arg1);
+      def2 = SSA_NAME_DEF_STMT (arg2);
+
+      if ((gimple_bb (def1) != bb1 || gimple_bb (def2) != bb2)
+  && (gimple_bb (def2) != bb1 || gimple_bb (def1) != bb2))
+ continue;
+
+      /* Check the mode of the arguments to be sure a conditional move
+ can be generated for it.  */
+      if (!optab_handler (cmov_optab, TYPE_MODE (TREE_TYPE (arg1))))
+ continue;
+
+      /* Both statements must be assignments whose RHS is a COMPONENT_REF.  */
+      if (!gimple_assign_single_p (def1)
+  || !gimple_assign_single_p (def2))
+ continue;
+
+      ref1 = gimple_assign_rhs1 (def1);
+      ref2 = gimple_assign_rhs1 (def2);
+
+      if (TREE_CODE (ref1) != COMPONENT_REF
+  || TREE_CODE (ref2) != COMPONENT_REF)
+ continue;
+
+      /* The zeroth operand of the two component references must be
+ identical.  It is not sufficient to compare get_base_address of
+ the two references, because this could allow for different
+ elements of the same array in the two trees.  It is not safe to
+ assume that the existence of one array element implies the
+ existence of a different one.  */
+      if (!operand_equal_p (TREE_OPERAND (ref1, 0), TREE_OPERAND (ref2, 0), 0))
+ continue;
+
+      field1 = TREE_OPERAND (ref1, 1);
+      field2 = TREE_OPERAND (ref2, 1);
+
+      /* Check for field adjacency, and ensure field1 comes first.  */
+      for (next = DECL_CHAIN (field1);
+   next && TREE_CODE (next) != FIELD_DECL;
+   next = DECL_CHAIN (next))
+ ;
+
+      if (next != field2)
+ {
+  for (next = DECL_CHAIN (field2);
+       next && TREE_CODE (next) != FIELD_DECL;
+       next = DECL_CHAIN (next))
+    ;
+
+  if (next != field1)
+    continue;
+
+  fieldswap = field1;
+  field1 = field2;
+  field2 = fieldswap;
+  defswap = def1;
+  def1 = def2;
+  def2 = defswap;
+  /* Don't swap bb1 and bb2 as we may have more than one
+     phi to process successfully.  */
+  bb_for_def1 = bb2;
+  bb_for_def2 = bb1;
+ }
+      else
+ {
+  bb_for_def1 = bb1;
+  bb_for_def2 = bb2;
+ }
+
+      /* Check for proper alignment of the first field.  */
+      tree_offset1 = bit_position (field1);
+      tree_offset2 = bit_position (field2);
+      tree_size2 = DECL_SIZE (field2);
+
+      if (!host_integerp (tree_offset1, 1)
+  || !host_integerp (tree_offset2, 1)
+  || !host_integerp (tree_size2, 1))
+ continue;
+
+      offset1 = TREE_INT_CST_LOW (tree_offset1);
+      offset2 = TREE_INT_CST_LOW (tree_offset2);
+      size2 = TREE_INT_CST_LOW (tree_size2);
+      align1 = DECL_ALIGN (field1) % param_align_bits;
+
+      if (offset1 % BITS_PER_UNIT != 0)
+ continue;
+
+      /* The two field references must fit within a block of memory that
+ is guaranteed to be on the same page.  */
+      if (align1 + offset2 - offset1 + size2 > param_align_bits)
+ continue;
+
+      /* The two expressions cannot be dependent upon vdefs defined
+ in bb1/bb2.  */
+      if (local_mem_dependence (def1, bb_for_def1)
+  || local_mem_dependence (def2, bb_for_def2))
+ continue;
+
+      /* The conditions are satisfied; hoist the loads from bb1 and bb2 into
+ bb0.  We hoist the first one first so that a cache miss is handled
+         efficiently regardless of hardware cache-fill policy.  */
+      gsi2 = gsi_for_stmt (def1);
+      gsi_move_to_bb_end (&gsi2, bb0);
+      gsi2 = gsi_for_stmt (def2);
+      gsi_move_to_bb_end (&gsi2, bb0);
+
+      if (dump_file && (dump_flags & TDF_DETAILS))
+ {
+  fprintf (dump_file,
+   "\nHoisting adjacent loads from %d and %d into %d: \n",
+   bb_for_def1->index, bb_for_def2->index, bb0->index);
+  print_gimple_stmt (dump_file, def1, 0, TDF_VOPS|TDF_MEMSYMS);
+  print_gimple_stmt (dump_file, def2, 0, TDF_VOPS|TDF_MEMSYMS);
+ }
+    }
+}
+
+/* Determine whether we should attempt to hoist adjacent loads out of
+   diamond patterns in pass_phiopt.  Always hoist loads if
+   -fhoist-adjacent-loads is specified and the target machine has
+   a conditional move instruction.  */
+
+static bool
+gate_hoist_loads (void)
+{
+  return (flag_hoist_adjacent_loads == 1 && HAVE_conditional_move);
+}
+
 /* Always do these optimizations if we have SSA
    trees to work on.  */
 static bool
Index: gcc/common.opt
===================================================================
--- gcc/common.opt (revision 187805)
+++ gcc/common.opt (working copy)
@@ -1186,6 +1186,11 @@ fgraphite-identity
 Common Report Var(flag_graphite_identity) Optimization
 Enable Graphite Identity transformation
 
+fhoist-adjacent-loads
+Common Report Var(flag_hoist_adjacent_loads) Optimization
+Enable hoisting adjacent loads to encourage generating conditional move
+instructions
+
 floop-parallelize-all
 Common Report Var(flag_loop_parallelize_all) Optimization
 Mark all loops as parallel
Index: gcc/Makefile.in
===================================================================
--- gcc/Makefile.in (revision 187805)
+++ gcc/Makefile.in (working copy)
@@ -2355,7 +2355,8 @@ tree-ssa-phiopt.o : tree-ssa-phiopt.c $(CONFIG_H)
    $(TM_H) $(GGC_H) $(TREE_H) $(TM_P_H) $(BASIC_BLOCK_H) \
    $(TREE_FLOW_H) $(TREE_PASS_H) $(TREE_DUMP_H) langhooks.h $(FLAGS_H) \
    $(DIAGNOSTIC_H) $(TIMEVAR_H) pointer-set.h domwalk.h $(CFGLOOP_H) \
-   $(TREE_DATA_REF_H) tree-pretty-print.h
+   $(TREE_DATA_REF_H) tree-pretty-print.h gimple-pretty-print.h \
+   insn-config.h $(EXPR_H) $(OPTABS_H)
 tree-nrv.o : tree-nrv.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \
    $(TM_H) $(TREE_H) $(FUNCTION_H) $(BASIC_BLOCK_H) $(FLAGS_H) \
    $(DIAGNOSTIC_H) $(TREE_FLOW_H) $(TIMEVAR_H) $(TREE_DUMP_H) $(TREE_PASS_H) \
Index: gcc/params.def
===================================================================
--- gcc/params.def (revision 187805)
+++ gcc/params.def (working copy)
@@ -979,6 +979,13 @@ DEFPARAM (PARAM_MAX_TRACKED_STRLENS,
   "track string lengths",
   1000, 0, 0)
 
+/* For adjacent load hoisting transformation in tree-ssa-phiopt.c.  */
+DEFPARAM (PARAM_MIN_CMOVE_STRUCT_ALIGN,
+  "min-cmove-struct-align",
+  "Minimum byte alignment to assume for structures in the stack "
+  "or heap when considering load hoisting for conditional moves",
+  32, 8, 256)
+
 /*
 Local variables:
 mode:c



Re: [PATCH] Hoist adjacent pointer loads

by Jakub Jelinek :: Rate this Message:

| View Threaded | Show Only this Message

On Mon, Jun 04, 2012 at 08:45:20AM -0500, William J. Schmidt wrote:
> 2012-06-04  Bill Schmidt  <wschmidt@...>
>
> * opts.c: Add -fhoist_adjacent_loads to -O2 and above.

The option name has hyphens instead of underscores in it, so the above is
wrong.

> --- gcc/common.opt (revision 187805)
> +++ gcc/common.opt (working copy)
> @@ -1186,6 +1186,11 @@ fgraphite-identity
>  Common Report Var(flag_graphite_identity) Optimization
>  Enable Graphite Identity transformation
>  
> +fhoist-adjacent-loads
> +Common Report Var(flag_hoist_adjacent_loads) Optimization
> +Enable hoisting adjacent loads to encourage generating conditional move
> +instructions
> +

        Jakub

Re: [PATCH] Hoist adjacent pointer loads

by William J. Schmidt :: Rate this Message:

| View Threaded | Show Only this Message

On Mon, 2012-06-04 at 08:45 -0500, William J. Schmidt wrote:

> Hi Richard,
>
> Here's a revision of the hoist-adjacent-loads patch.  I'm sorry for the
> delay since the last revision, but my performance testing has been
> blocked waiting for a fix to PR53487.  I ended up applying a test
> version of the patch to 4.7 and ran performance numbers with that
> instead, with no degradations.
>
> In addition to addressing your comments, this patch contains one bug fix
> where local_mem_dependence was called on the wrong blocks after swapping
> def1 and def2.
>
> Bootstrapped with no regressions on powerpc64-unknown-linux-gnu.  Is
> this version ok for trunk?  I won't commit it until I can do final
> testing on trunk in conjunction with a fix for PR53487.

Final performance tests are complete and show no degradations on SPEC
cpu2006 on powerpc64-unknown-linux-gnu.

Is the patch ok for trunk?

Thanks!
Bill

>
> Thanks,
> Bill
>
>
> 2012-06-04  Bill Schmidt  <wschmidt@...>
>
> * opts.c: Add -fhoist_adjacent_loads to -O2 and above.
> * tree-ssa-phiopt.c (tree_ssa_phiopt_worker): Add argument to forward
> declaration.
> (hoist_adjacent_loads, gate_hoist_loads): New forward declarations.
> (tree_ssa_phiopt): Call gate_hoist_loads.
> (tree_ssa_cs_elim): Add parm to tree_ssa_phiopt_worker call.
> (tree_ssa_phiopt_worker): Add do_hoist_loads to formal arg list; call
> hoist_adjacent_loads.
> (local_mem_dependence): New function.
> (hoist_adjacent_loads): Likewise.
> (gate_hoist_loads): Likewise.
> * common.opt (fhoist-adjacent-loads): New switch.
> * Makefile.in (tree-ssa-phiopt.o): Added dependencies.
> * params.def (PARAM_MIN_CMOVE_STRUCT_ALIGN): New param.
>
>
> Index: gcc/opts.c
> ===================================================================
> --- gcc/opts.c (revision 187805)
> +++ gcc/opts.c (working copy)
> @@ -489,6 +489,7 @@ static const struct default_options default_option
>      { OPT_LEVELS_2_PLUS, OPT_falign_functions, NULL, 1 },
>      { OPT_LEVELS_2_PLUS, OPT_ftree_tail_merge, NULL, 1 },
>      { OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_foptimize_strlen, NULL, 1 },
> +    { OPT_LEVELS_2_PLUS, OPT_fhoist_adjacent_loads, NULL, 1 },
>  
>      /* -O3 optimizations.  */
>      { OPT_LEVELS_3_PLUS, OPT_ftree_loop_distribute_patterns, NULL, 1 },
> Index: gcc/tree-ssa-phiopt.c
> ===================================================================
> --- gcc/tree-ssa-phiopt.c (revision 187805)
> +++ gcc/tree-ssa-phiopt.c (working copy)
> @@ -37,9 +37,17 @@ along with GCC; see the file COPYING3.  If not see
>  #include "cfgloop.h"
>  #include "tree-data-ref.h"
>  #include "tree-pretty-print.h"
> +#include "gimple-pretty-print.h"
> +#include "insn-config.h"
> +#include "expr.h"
> +#include "optabs.h"
>  
> +#ifndef HAVE_conditional_move
> +#define HAVE_conditional_move (0)
> +#endif
> +
>  static unsigned int tree_ssa_phiopt (void);
> -static unsigned int tree_ssa_phiopt_worker (bool);
> +static unsigned int tree_ssa_phiopt_worker (bool, bool);
>  static bool conditional_replacement (basic_block, basic_block,
>       edge, edge, gimple, tree, tree);
>  static int value_replacement (basic_block, basic_block,
> @@ -53,6 +61,9 @@ static bool cond_store_replacement (basic_block, b
>  static bool cond_if_else_store_replacement (basic_block, basic_block, basic_block);
>  static struct pointer_set_t * get_non_trapping (void);
>  static void replace_phi_edge_with_variable (basic_block, edge, gimple, tree);
> +static void hoist_adjacent_loads (basic_block, basic_block,
> +  basic_block, basic_block);
> +static bool gate_hoist_loads (void);
>  
>  /* This pass tries to replaces an if-then-else block with an
>     assignment.  We have four kinds of transformations.  Some of these
> @@ -138,12 +149,56 @@ static void replace_phi_edge_with_variable (basic_
>       bb2:
>         x = PHI <x' (bb0), ...>;
>  
> -   A similar transformation is done for MAX_EXPR.  */
> +   A similar transformation is done for MAX_EXPR.
>  
> +
> +   This pass also performs a fifth transformation of a slightly different
> +   flavor.
> +
> +   Adjacent Load Hoisting
> +   ----------------------
> +  
> +   This transformation replaces
> +
> +     bb0:
> +       if (...) goto bb2; else goto bb1;
> +     bb1:
> +       x1 = (<expr>).field1;
> +       goto bb3;
> +     bb2:
> +       x2 = (<expr>).field2;
> +     bb3:
> +       # x = PHI <x1, x2>;
> +
> +   with
> +
> +     bb0:
> +       x1 = (<expr>).field1;
> +       x2 = (<expr>).field2;
> +       if (...) goto bb2; else goto bb1;
> +     bb1:
> +       goto bb3;
> +     bb2:
> +     bb3:
> +       # x = PHI <x1, x2>;
> +
> +   The purpose of this transformation is to enable generation of conditional
> +   move instructions such as Intel CMOVE or PowerPC ISEL.  Because one of
> +   the loads is speculative, the transformation is restricted to very
> +   specific cases to avoid introducing a page fault.  We are looking for
> +   the common idiom:
> +
> +     if (...)
> +       x = y->left;
> +     else
> +       x = y->right;
> +
> +   where left and right are typically adjacent pointers in a tree structure.  */
> +
>  static unsigned int
>  tree_ssa_phiopt (void)
>  {
> -  return tree_ssa_phiopt_worker (false);
> +  return tree_ssa_phiopt_worker (false, gate_hoist_loads ());
>  }
>  
>  /* This pass tries to transform conditional stores into unconditional
> @@ -190,7 +245,7 @@ tree_ssa_phiopt (void)
>  static unsigned int
>  tree_ssa_cs_elim (void)
>  {
> -  return tree_ssa_phiopt_worker (true);
> +  return tree_ssa_phiopt_worker (true, false);
>  }
>  
>  /* Return the singleton PHI in the SEQ of PHIs for edges E0 and E1. */
> @@ -227,9 +282,11 @@ static tree condstoretemp;
>  /* The core routine of conditional store replacement and normal
>     phi optimizations.  Both share much of the infrastructure in how
>     to match applicable basic block patterns.  DO_STORE_ELIM is true
> -   when we want to do conditional store replacement, false otherwise.  */
> +   when we want to do conditional store replacement, false otherwise.
> +   DO_HOIST_LOADS is true when we want to hoist adjacent loads out
> +   of diamond control flow patterns, false otherwise.  */
>  static unsigned int
> -tree_ssa_phiopt_worker (bool do_store_elim)
> +tree_ssa_phiopt_worker (bool do_store_elim, bool do_hoist_loads)
>  {
>    basic_block bb;
>    basic_block *bb_order;
> @@ -312,6 +369,21 @@ static unsigned int
>      cfgchanged = true;
>    continue;
>   }
> +      else if (do_hoist_loads
> + && EDGE_SUCC (bb1, 0)->dest == EDGE_SUCC (bb2, 0)->dest)
> + {
> +  basic_block bb3 = EDGE_SUCC (bb1, 0)->dest;
> +
> +  if (!FLOAT_TYPE_P (TREE_TYPE (gimple_cond_lhs (cond_stmt)))
> +      && single_succ_p (bb1)
> +      && single_succ_p (bb2)
> +      && single_pred_p (bb1)
> +      && single_pred_p (bb2)
> +      && EDGE_COUNT (bb->succs) == 2
> +      && EDGE_COUNT (bb3->preds) == 2)
> +    hoist_adjacent_loads (bb, bb1, bb2, bb3);
> +  continue;
> + }
>        else
>   continue;      
>  
> @@ -1707,6 +1779,207 @@ cond_if_else_store_replacement (basic_block then_b
>    return ok;
>  }
>  
> +/* Return TRUE if STMT has a VUSE whose corresponding VDEF is in BB.  */
> +
> +static bool
> +local_mem_dependence (gimple stmt, basic_block bb)
> +{
> +  tree vuse = gimple_vuse (stmt);
> +  gimple def;
> +
> +  if (!vuse)
> +    return false;
> +
> +  def = SSA_NAME_DEF_STMT (vuse);
> +  return (def && gimple_bb (def) == bb);
> +}
> +
> +/* Given a "diamond" control-flow pattern where BB0 tests a condition,
> +   BB1 and BB2 are "then" and "else" blocks dependent on this test,
> +   and BB3 rejoins control flow following BB1 and BB2, look for
> +   opportunities to hoist loads as follows.  If BB3 contains a PHI of
> +   two loads, one each occurring in BB1 and BB2, and the loads are
> +   provably of adjacent fields in the same structure, then move both
> +   loads into BB0.  Of course this can only be done if there are no
> +   dependencies preventing such motion.
> +
> +   One of the hoisted loads will always be speculative, so the
> +   transformation is currently conservative:
> +
> +    - The fields must be strictly adjacent.
> +    - The two fields must occupy a single memory block that is
> +      guaranteed to not cross a page boundary.
> +
> +    The last is difficult to prove, as such memory blocks should be
> +    aligned on the minimum of the stack alignment boundary and the
> +    alignment guaranteed by heap allocation interfaces.  Thus we rely
> +    on a parameter for the alignment value.
> +
> +    Provided a good value is used for the last case, the first
> +    restriction could possibly be relaxed.  */
> +
> +static void
> +hoist_adjacent_loads (basic_block bb0, basic_block bb1,
> +      basic_block bb2, basic_block bb3)
> +{
> +  int param_align = PARAM_VALUE (PARAM_MIN_CMOVE_STRUCT_ALIGN);
> +  unsigned param_align_bits = (unsigned) (param_align * BITS_PER_UNIT);
> +  gimple_stmt_iterator gsi;
> +
> +  /* Walk the phis in bb3 looking for an opportunity.  We are looking
> +     for phis of two SSA names, one each of which is defined in bb1 and
> +     bb2.  */
> +  for (gsi = gsi_start_phis (bb3); !gsi_end_p (gsi); gsi_next (&gsi))
> +    {
> +      gimple phi_stmt = gsi_stmt (gsi);
> +      gimple def1, def2, defswap;
> +      tree arg1, arg2, ref1, ref2, field1, field2, fieldswap;
> +      tree tree_offset1, tree_offset2, tree_size2, next;
> +      int offset1, offset2, size2;
> +      unsigned align1;
> +      gimple_stmt_iterator gsi2;
> +      basic_block bb_for_def1, bb_for_def2;
> +
> +      if (gimple_phi_num_args (phi_stmt) != 2)
> + continue;
> +
> +      arg1 = gimple_phi_arg_def (phi_stmt, 0);
> +      arg2 = gimple_phi_arg_def (phi_stmt, 1);
> +      
> +      if (TREE_CODE (arg1) != SSA_NAME
> +  || TREE_CODE (arg2) != SSA_NAME
> +  || SSA_NAME_IS_DEFAULT_DEF (arg1)
> +  || SSA_NAME_IS_DEFAULT_DEF (arg2))
> + continue;
> +
> +      def1 = SSA_NAME_DEF_STMT (arg1);
> +      def2 = SSA_NAME_DEF_STMT (arg2);
> +
> +      if ((gimple_bb (def1) != bb1 || gimple_bb (def2) != bb2)
> +  && (gimple_bb (def2) != bb1 || gimple_bb (def1) != bb2))
> + continue;
> +
> +      /* Check the mode of the arguments to be sure a conditional move
> + can be generated for it.  */
> +      if (!optab_handler (cmov_optab, TYPE_MODE (TREE_TYPE (arg1))))
> + continue;
> +
> +      /* Both statements must be assignments whose RHS is a COMPONENT_REF.  */
> +      if (!gimple_assign_single_p (def1)
> +  || !gimple_assign_single_p (def2))
> + continue;
> +
> +      ref1 = gimple_assign_rhs1 (def1);
> +      ref2 = gimple_assign_rhs1 (def2);
> +
> +      if (TREE_CODE (ref1) != COMPONENT_REF
> +  || TREE_CODE (ref2) != COMPONENT_REF)
> + continue;
> +
> +      /* The zeroth operand of the two component references must be
> + identical.  It is not sufficient to compare get_base_address of
> + the two references, because this could allow for different
> + elements of the same array in the two trees.  It is not safe to
> + assume that the existence of one array element implies the
> + existence of a different one.  */
> +      if (!operand_equal_p (TREE_OPERAND (ref1, 0), TREE_OPERAND (ref2, 0), 0))
> + continue;
> +
> +      field1 = TREE_OPERAND (ref1, 1);
> +      field2 = TREE_OPERAND (ref2, 1);
> +
> +      /* Check for field adjacency, and ensure field1 comes first.  */
> +      for (next = DECL_CHAIN (field1);
> +   next && TREE_CODE (next) != FIELD_DECL;
> +   next = DECL_CHAIN (next))
> + ;
> +
> +      if (next != field2)
> + {
> +  for (next = DECL_CHAIN (field2);
> +       next && TREE_CODE (next) != FIELD_DECL;
> +       next = DECL_CHAIN (next))
> +    ;
> +
> +  if (next != field1)
> +    continue;
> +
> +  fieldswap = field1;
> +  field1 = field2;
> +  field2 = fieldswap;
> +  defswap = def1;
> +  def1 = def2;
> +  def2 = defswap;
> +  /* Don't swap bb1 and bb2 as we may have more than one
> +     phi to process successfully.  */
> +  bb_for_def1 = bb2;
> +  bb_for_def2 = bb1;
> + }
> +      else
> + {
> +  bb_for_def1 = bb1;
> +  bb_for_def2 = bb2;
> + }
> +
> +      /* Check for proper alignment of the first field.  */
> +      tree_offset1 = bit_position (field1);
> +      tree_offset2 = bit_position (field2);
> +      tree_size2 = DECL_SIZE (field2);
> +
> +      if (!host_integerp (tree_offset1, 1)
> +  || !host_integerp (tree_offset2, 1)
> +  || !host_integerp (tree_size2, 1))
> + continue;
> +
> +      offset1 = TREE_INT_CST_LOW (tree_offset1);
> +      offset2 = TREE_INT_CST_LOW (tree_offset2);
> +      size2 = TREE_INT_CST_LOW (tree_size2);
> +      align1 = DECL_ALIGN (field1) % param_align_bits;
> +
> +      if (offset1 % BITS_PER_UNIT != 0)
> + continue;
> +
> +      /* The two field references must fit within a block of memory that
> + is guaranteed to be on the same page.  */
> +      if (align1 + offset2 - offset1 + size2 > param_align_bits)
> + continue;
> +
> +      /* The two expressions cannot be dependent upon vdefs defined
> + in bb1/bb2.  */
> +      if (local_mem_dependence (def1, bb_for_def1)
> +  || local_mem_dependence (def2, bb_for_def2))
> + continue;
> +
> +      /* The conditions are satisfied; hoist the loads from bb1 and bb2 into
> + bb0.  We hoist the first one first so that a cache miss is handled
> +         efficiently regardless of hardware cache-fill policy.  */
> +      gsi2 = gsi_for_stmt (def1);
> +      gsi_move_to_bb_end (&gsi2, bb0);
> +      gsi2 = gsi_for_stmt (def2);
> +      gsi_move_to_bb_end (&gsi2, bb0);
> +
> +      if (dump_file && (dump_flags & TDF_DETAILS))
> + {
> +  fprintf (dump_file,
> +   "\nHoisting adjacent loads from %d and %d into %d: \n",
> +   bb_for_def1->index, bb_for_def2->index, bb0->index);
> +  print_gimple_stmt (dump_file, def1, 0, TDF_VOPS|TDF_MEMSYMS);
> +  print_gimple_stmt (dump_file, def2, 0, TDF_VOPS|TDF_MEMSYMS);
> + }
> +    }
> +}
> +
> +/* Determine whether we should attempt to hoist adjacent loads out of
> +   diamond patterns in pass_phiopt.  Always hoist loads if
> +   -fhoist-adjacent-loads is specified and the target machine has
> +   a conditional move instruction.  */
> +
> +static bool
> +gate_hoist_loads (void)
> +{
> +  return (flag_hoist_adjacent_loads == 1 && HAVE_conditional_move);
> +}
> +
>  /* Always do these optimizations if we have SSA
>     trees to work on.  */
>  static bool
> Index: gcc/common.opt
> ===================================================================
> --- gcc/common.opt (revision 187805)
> +++ gcc/common.opt (working copy)
> @@ -1186,6 +1186,11 @@ fgraphite-identity
>  Common Report Var(flag_graphite_identity) Optimization
>  Enable Graphite Identity transformation
>  
> +fhoist-adjacent-loads
> +Common Report Var(flag_hoist_adjacent_loads) Optimization
> +Enable hoisting adjacent loads to encourage generating conditional move
> +instructions
> +
>  floop-parallelize-all
>  Common Report Var(flag_loop_parallelize_all) Optimization
>  Mark all loops as parallel
> Index: gcc/Makefile.in
> ===================================================================
> --- gcc/Makefile.in (revision 187805)
> +++ gcc/Makefile.in (working copy)
> @@ -2355,7 +2355,8 @@ tree-ssa-phiopt.o : tree-ssa-phiopt.c $(CONFIG_H)
>     $(TM_H) $(GGC_H) $(TREE_H) $(TM_P_H) $(BASIC_BLOCK_H) \
>     $(TREE_FLOW_H) $(TREE_PASS_H) $(TREE_DUMP_H) langhooks.h $(FLAGS_H) \
>     $(DIAGNOSTIC_H) $(TIMEVAR_H) pointer-set.h domwalk.h $(CFGLOOP_H) \
> -   $(TREE_DATA_REF_H) tree-pretty-print.h
> +   $(TREE_DATA_REF_H) tree-pretty-print.h gimple-pretty-print.h \
> +   insn-config.h $(EXPR_H) $(OPTABS_H)
>  tree-nrv.o : tree-nrv.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \
>     $(TM_H) $(TREE_H) $(FUNCTION_H) $(BASIC_BLOCK_H) $(FLAGS_H) \
>     $(DIAGNOSTIC_H) $(TREE_FLOW_H) $(TIMEVAR_H) $(TREE_DUMP_H) $(TREE_PASS_H) \
> Index: gcc/params.def
> ===================================================================
> --- gcc/params.def (revision 187805)
> +++ gcc/params.def (working copy)
> @@ -979,6 +979,13 @@ DEFPARAM (PARAM_MAX_TRACKED_STRLENS,
>    "track string lengths",
>    1000, 0, 0)
>  
> +/* For adjacent load hoisting transformation in tree-ssa-phiopt.c.  */
> +DEFPARAM (PARAM_MIN_CMOVE_STRUCT_ALIGN,
> +  "min-cmove-struct-align",
> +  "Minimum byte alignment to assume for structures in the stack "
> +  "or heap when considering load hoisting for conditional moves",
> +  32, 8, 256)
> +
>  /*
>  Local variables:
>  mode:c
>


Re: [PATCH] Hoist adjacent pointer loads

by Richard Guenther-2 :: Rate this Message:

| View Threaded | Show Only this Message

On Mon, Jun 4, 2012 at 3:45 PM, William J. Schmidt
<wschmidt@...> wrote:

> Hi Richard,
>
> Here's a revision of the hoist-adjacent-loads patch.  I'm sorry for the
> delay since the last revision, but my performance testing has been
> blocked waiting for a fix to PR53487.  I ended up applying a test
> version of the patch to 4.7 and ran performance numbers with that
> instead, with no degradations.
>
> In addition to addressing your comments, this patch contains one bug fix
> where local_mem_dependence was called on the wrong blocks after swapping
> def1 and def2.
>
> Bootstrapped with no regressions on powerpc64-unknown-linux-gnu.  Is
> this version ok for trunk?  I won't commit it until I can do final
> testing on trunk in conjunction with a fix for PR53487.
>
> Thanks,
> Bill
>
>
> 2012-06-04  Bill Schmidt  <wschmidt@...>
>
>        * opts.c: Add -fhoist_adjacent_loads to -O2 and above.
>        * tree-ssa-phiopt.c (tree_ssa_phiopt_worker): Add argument to forward
>        declaration.
>        (hoist_adjacent_loads, gate_hoist_loads): New forward declarations.
>        (tree_ssa_phiopt): Call gate_hoist_loads.
>        (tree_ssa_cs_elim): Add parm to tree_ssa_phiopt_worker call.
>        (tree_ssa_phiopt_worker): Add do_hoist_loads to formal arg list; call
>        hoist_adjacent_loads.
>        (local_mem_dependence): New function.
>        (hoist_adjacent_loads): Likewise.
>        (gate_hoist_loads): Likewise.
>        * common.opt (fhoist-adjacent-loads): New switch.
>        * Makefile.in (tree-ssa-phiopt.o): Added dependencies.
>        * params.def (PARAM_MIN_CMOVE_STRUCT_ALIGN): New param.
>
>
> Index: gcc/opts.c
> ===================================================================
> --- gcc/opts.c  (revision 187805)
> +++ gcc/opts.c  (working copy)
> @@ -489,6 +489,7 @@ static const struct default_options default_option
>     { OPT_LEVELS_2_PLUS, OPT_falign_functions, NULL, 1 },
>     { OPT_LEVELS_2_PLUS, OPT_ftree_tail_merge, NULL, 1 },
>     { OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_foptimize_strlen, NULL, 1 },
> +    { OPT_LEVELS_2_PLUS, OPT_fhoist_adjacent_loads, NULL, 1 },
>
>     /* -O3 optimizations.  */
>     { OPT_LEVELS_3_PLUS, OPT_ftree_loop_distribute_patterns, NULL, 1 },
> Index: gcc/tree-ssa-phiopt.c
> ===================================================================
> --- gcc/tree-ssa-phiopt.c       (revision 187805)
> +++ gcc/tree-ssa-phiopt.c       (working copy)
> @@ -37,9 +37,17 @@ along with GCC; see the file COPYING3.  If not see
>  #include "cfgloop.h"
>  #include "tree-data-ref.h"
>  #include "tree-pretty-print.h"
> +#include "gimple-pretty-print.h"
> +#include "insn-config.h"
> +#include "expr.h"
> +#include "optabs.h"
>
> +#ifndef HAVE_conditional_move
> +#define HAVE_conditional_move (0)
> +#endif
> +
>  static unsigned int tree_ssa_phiopt (void);
> -static unsigned int tree_ssa_phiopt_worker (bool);
> +static unsigned int tree_ssa_phiopt_worker (bool, bool);
>  static bool conditional_replacement (basic_block, basic_block,
>                                     edge, edge, gimple, tree, tree);
>  static int value_replacement (basic_block, basic_block,
> @@ -53,6 +61,9 @@ static bool cond_store_replacement (basic_block, b
>  static bool cond_if_else_store_replacement (basic_block, basic_block, basic_block);
>  static struct pointer_set_t * get_non_trapping (void);
>  static void replace_phi_edge_with_variable (basic_block, edge, gimple, tree);
> +static void hoist_adjacent_loads (basic_block, basic_block,
> +                                 basic_block, basic_block);
> +static bool gate_hoist_loads (void);
>
>  /* This pass tries to replaces an if-then-else block with an
>    assignment.  We have four kinds of transformations.  Some of these
> @@ -138,12 +149,56 @@ static void replace_phi_edge_with_variable (basic_
>      bb2:
>        x = PHI <x' (bb0), ...>;
>
> -   A similar transformation is done for MAX_EXPR.  */
> +   A similar transformation is done for MAX_EXPR.
>
> +
> +   This pass also performs a fifth transformation of a slightly different
> +   flavor.
> +
> +   Adjacent Load Hoisting
> +   ----------------------
> +
> +   This transformation replaces
> +
> +     bb0:
> +       if (...) goto bb2; else goto bb1;
> +     bb1:
> +       x1 = (<expr>).field1;
> +       goto bb3;
> +     bb2:
> +       x2 = (<expr>).field2;
> +     bb3:
> +       # x = PHI <x1, x2>;
> +
> +   with
> +
> +     bb0:
> +       x1 = (<expr>).field1;
> +       x2 = (<expr>).field2;
> +       if (...) goto bb2; else goto bb1;
> +     bb1:
> +       goto bb3;
> +     bb2:
> +     bb3:
> +       # x = PHI <x1, x2>;
> +
> +   The purpose of this transformation is to enable generation of conditional
> +   move instructions such as Intel CMOVE or PowerPC ISEL.  Because one of
> +   the loads is speculative, the transformation is restricted to very
> +   specific cases to avoid introducing a page fault.  We are looking for
> +   the common idiom:
> +
> +     if (...)
> +       x = y->left;
> +     else
> +       x = y->right;
> +
> +   where left and right are typically adjacent pointers in a tree structure.  */
> +
>  static unsigned int
>  tree_ssa_phiopt (void)
>  {
> -  return tree_ssa_phiopt_worker (false);
> +  return tree_ssa_phiopt_worker (false, gate_hoist_loads ());
>  }
>
>  /* This pass tries to transform conditional stores into unconditional
> @@ -190,7 +245,7 @@ tree_ssa_phiopt (void)
>  static unsigned int
>  tree_ssa_cs_elim (void)
>  {
> -  return tree_ssa_phiopt_worker (true);
> +  return tree_ssa_phiopt_worker (true, false);
>  }
>
>  /* Return the singleton PHI in the SEQ of PHIs for edges E0 and E1. */
> @@ -227,9 +282,11 @@ static tree condstoretemp;
>  /* The core routine of conditional store replacement and normal
>    phi optimizations.  Both share much of the infrastructure in how
>    to match applicable basic block patterns.  DO_STORE_ELIM is true
> -   when we want to do conditional store replacement, false otherwise.  */
> +   when we want to do conditional store replacement, false otherwise.
> +   DO_HOIST_LOADS is true when we want to hoist adjacent loads out
> +   of diamond control flow patterns, false otherwise.  */
>  static unsigned int
> -tree_ssa_phiopt_worker (bool do_store_elim)
> +tree_ssa_phiopt_worker (bool do_store_elim, bool do_hoist_loads)
>  {
>   basic_block bb;
>   basic_block *bb_order;
> @@ -312,6 +369,21 @@ static unsigned int
>            cfgchanged = true;
>          continue;
>        }
> +      else if (do_hoist_loads
> +                && EDGE_SUCC (bb1, 0)->dest == EDGE_SUCC (bb2, 0)->dest)
> +       {
> +         basic_block bb3 = EDGE_SUCC (bb1, 0)->dest;
> +
> +         if (!FLOAT_TYPE_P (TREE_TYPE (gimple_cond_lhs (cond_stmt)))
> +             && single_succ_p (bb1)
> +             && single_succ_p (bb2)
> +             && single_pred_p (bb1)
> +             && single_pred_p (bb2)
> +             && EDGE_COUNT (bb->succs) == 2
> +             && EDGE_COUNT (bb3->preds) == 2)
> +           hoist_adjacent_loads (bb, bb1, bb2, bb3);
> +         continue;
> +       }
>       else
>        continue;
>
> @@ -1707,6 +1779,207 @@ cond_if_else_store_replacement (basic_block then_b
>   return ok;
>  }
>
> +/* Return TRUE if STMT has a VUSE whose corresponding VDEF is in BB.  */
> +
> +static bool
> +local_mem_dependence (gimple stmt, basic_block bb)
> +{
> +  tree vuse = gimple_vuse (stmt);
> +  gimple def;
> +
> +  if (!vuse)
> +    return false;
> +
> +  def = SSA_NAME_DEF_STMT (vuse);
> +  return (def && gimple_bb (def) == bb);
> +}
> +
> +/* Given a "diamond" control-flow pattern where BB0 tests a condition,
> +   BB1 and BB2 are "then" and "else" blocks dependent on this test,
> +   and BB3 rejoins control flow following BB1 and BB2, look for
> +   opportunities to hoist loads as follows.  If BB3 contains a PHI of
> +   two loads, one each occurring in BB1 and BB2, and the loads are
> +   provably of adjacent fields in the same structure, then move both
> +   loads into BB0.  Of course this can only be done if there are no
> +   dependencies preventing such motion.
> +
> +   One of the hoisted loads will always be speculative, so the
> +   transformation is currently conservative:
> +
> +    - The fields must be strictly adjacent.
> +    - The two fields must occupy a single memory block that is
> +      guaranteed to not cross a page boundary.
> +
> +    The last is difficult to prove, as such memory blocks should be
> +    aligned on the minimum of the stack alignment boundary and the
> +    alignment guaranteed by heap allocation interfaces.  Thus we rely
> +    on a parameter for the alignment value.
> +
> +    Provided a good value is used for the last case, the first
> +    restriction could possibly be relaxed.  */
> +
> +static void
> +hoist_adjacent_loads (basic_block bb0, basic_block bb1,
> +                     basic_block bb2, basic_block bb3)
> +{
> +  int param_align = PARAM_VALUE (PARAM_MIN_CMOVE_STRUCT_ALIGN);
> +  unsigned param_align_bits = (unsigned) (param_align * BITS_PER_UNIT);
> +  gimple_stmt_iterator gsi;
> +
> +  /* Walk the phis in bb3 looking for an opportunity.  We are looking
> +     for phis of two SSA names, one each of which is defined in bb1 and
> +     bb2.  */
> +  for (gsi = gsi_start_phis (bb3); !gsi_end_p (gsi); gsi_next (&gsi))
> +    {
> +      gimple phi_stmt = gsi_stmt (gsi);
> +      gimple def1, def2, defswap;
> +      tree arg1, arg2, ref1, ref2, field1, field2, fieldswap;
> +      tree tree_offset1, tree_offset2, tree_size2, next;
> +      int offset1, offset2, size2;
> +      unsigned align1;
> +      gimple_stmt_iterator gsi2;
> +      basic_block bb_for_def1, bb_for_def2;
> +
> +      if (gimple_phi_num_args (phi_stmt) != 2)
> +       continue;
> +
> +      arg1 = gimple_phi_arg_def (phi_stmt, 0);
> +      arg2 = gimple_phi_arg_def (phi_stmt, 1);
> +
> +      if (TREE_CODE (arg1) != SSA_NAME
> +         || TREE_CODE (arg2) != SSA_NAME
> +         || SSA_NAME_IS_DEFAULT_DEF (arg1)
> +         || SSA_NAME_IS_DEFAULT_DEF (arg2))
> +       continue;
> +
> +      def1 = SSA_NAME_DEF_STMT (arg1);
> +      def2 = SSA_NAME_DEF_STMT (arg2);
> +
> +      if ((gimple_bb (def1) != bb1 || gimple_bb (def2) != bb2)
> +         && (gimple_bb (def2) != bb1 || gimple_bb (def1) != bb2))
> +       continue;
> +
> +      /* Check the mode of the arguments to be sure a conditional move
> +        can be generated for it.  */
> +      if (!optab_handler (cmov_optab, TYPE_MODE (TREE_TYPE (arg1))))
> +       continue;
> +
> +      /* Both statements must be assignments whose RHS is a COMPONENT_REF.  */
> +      if (!gimple_assign_single_p (def1)
> +         || !gimple_assign_single_p (def2))
> +       continue;
> +
> +      ref1 = gimple_assign_rhs1 (def1);
> +      ref2 = gimple_assign_rhs1 (def2);
> +
> +      if (TREE_CODE (ref1) != COMPONENT_REF
> +         || TREE_CODE (ref2) != COMPONENT_REF)
> +       continue;
> +
> +      /* The zeroth operand of the two component references must be
> +        identical.  It is not sufficient to compare get_base_address of
> +        the two references, because this could allow for different
> +        elements of the same array in the two trees.  It is not safe to
> +        assume that the existence of one array element implies the
> +        existence of a different one.  */
> +      if (!operand_equal_p (TREE_OPERAND (ref1, 0), TREE_OPERAND (ref2, 0), 0))
> +       continue;
> +
> +      field1 = TREE_OPERAND (ref1, 1);
> +      field2 = TREE_OPERAND (ref2, 1);
> +
> +      /* Check for field adjacency, and ensure field1 comes first.  */
> +      for (next = DECL_CHAIN (field1);
> +          next && TREE_CODE (next) != FIELD_DECL;
> +          next = DECL_CHAIN (next))
> +       ;
> +
> +      if (next != field2)
> +       {
> +         for (next = DECL_CHAIN (field2);
> +              next && TREE_CODE (next) != FIELD_DECL;
> +              next = DECL_CHAIN (next))
> +           ;
> +
> +         if (next != field1)
> +           continue;
> +
> +         fieldswap = field1;
> +         field1 = field2;
> +         field2 = fieldswap;
> +         defswap = def1;
> +         def1 = def2;
> +         def2 = defswap;
> +         /* Don't swap bb1 and bb2 as we may have more than one
> +            phi to process successfully.  */
> +         bb_for_def1 = bb2;
> +         bb_for_def2 = bb1;
> +       }
> +      else
> +       {
> +         bb_for_def1 = bb1;
> +         bb_for_def2 = bb2;
> +       }
> +
> +      /* Check for proper alignment of the first field.  */
> +      tree_offset1 = bit_position (field1);
> +      tree_offset2 = bit_position (field2);
> +      tree_size2 = DECL_SIZE (field2);
> +
> +      if (!host_integerp (tree_offset1, 1)
> +         || !host_integerp (tree_offset2, 1)
> +         || !host_integerp (tree_size2, 1))
> +       continue;
> +
> +      offset1 = TREE_INT_CST_LOW (tree_offset1);
> +      offset2 = TREE_INT_CST_LOW (tree_offset2);
> +      size2 = TREE_INT_CST_LOW (tree_size2);
> +      align1 = DECL_ALIGN (field1) % param_align_bits;
> +
> +      if (offset1 % BITS_PER_UNIT != 0)
> +       continue;
> +
> +      /* The two field references must fit within a block of memory that
> +        is guaranteed to be on the same page.  */
> +      if (align1 + offset2 - offset1 + size2 > param_align_bits)
> +       continue;

If param_align_bits were the cache-line size then this would be appropriate
for a check if the transform is profitable.  But the comment suggests it
is a check for correctness?  We do already have a l1-cache-line-size
param btw.

Did you check packed nested structs?  Like

struct A {
  int i;
  int j;
};
struct {
  char c;
  struct A a;
} __attribute__((packed)) x;

and loads of x.a.i and x.a.j?  I _think_ that DECL_ALIGN of the i and j
FIELD_DECL may still be 4 in this case.  Which means that if you
are performing the check for correctness then it has to be a lot more
conservative.

I _think_ we have concluded that if you see loads of p->a and p->b
then either both trap or both do not trap, so the check for correctness
is that you see two COMPONENT_REFs of the same base for
adjacent fields.

Btw, you do not restrict the hoisting to the case where the the/else
block will be empty afterwards?  It seems to be the transform is
not profitable if we end up doing the if() anyways - it might even
be not profitable (similar to the case where either path is very likely
and the other not - conditional moves are mostly profitable when
the possibility of the branches is equal).

Thanks,
Richard.

> +
> +      /* The two expressions cannot be dependent upon vdefs defined
> +        in bb1/bb2.  */
> +      if (local_mem_dependence (def1, bb_for_def1)
> +         || local_mem_dependence (def2, bb_for_def2))
> +       continue;
> +
> +      /* The conditions are satisfied; hoist the loads from bb1 and bb2 into
> +        bb0.  We hoist the first one first so that a cache miss is handled
> +         efficiently regardless of hardware cache-fill policy.  */
> +      gsi2 = gsi_for_stmt (def1);
> +      gsi_move_to_bb_end (&gsi2, bb0);
> +      gsi2 = gsi_for_stmt (def2);
> +      gsi_move_to_bb_end (&gsi2, bb0);
> +
> +      if (dump_file && (dump_flags & TDF_DETAILS))
> +       {
> +         fprintf (dump_file,
> +                  "\nHoisting adjacent loads from %d and %d into %d: \n",
> +                  bb_for_def1->index, bb_for_def2->index, bb0->index);
> +         print_gimple_stmt (dump_file, def1, 0, TDF_VOPS|TDF_MEMSYMS);
> +         print_gimple_stmt (dump_file, def2, 0, TDF_VOPS|TDF_MEMSYMS);
> +       }
> +    }
> +}
> +
> +/* Determine whether we should attempt to hoist adjacent loads out of
> +   diamond patterns in pass_phiopt.  Always hoist loads if
> +   -fhoist-adjacent-loads is specified and the target machine has
> +   a conditional move instruction.  */
> +
> +static bool
> +gate_hoist_loads (void)
> +{
> +  return (flag_hoist_adjacent_loads == 1 && HAVE_conditional_move);
> +}
> +
>  /* Always do these optimizations if we have SSA
>    trees to work on.  */
>  static bool
> Index: gcc/common.opt
> ===================================================================
> --- gcc/common.opt      (revision 187805)
> +++ gcc/common.opt      (working copy)
> @@ -1186,6 +1186,11 @@ fgraphite-identity
>  Common Report Var(flag_graphite_identity) Optimization
>  Enable Graphite Identity transformation
>
> +fhoist-adjacent-loads
> +Common Report Var(flag_hoist_adjacent_loads) Optimization
> +Enable hoisting adjacent loads to encourage generating conditional move
> +instructions
> +
>  floop-parallelize-all
>  Common Report Var(flag_loop_parallelize_all) Optimization
>  Mark all loops as parallel
> Index: gcc/Makefile.in
> ===================================================================
> --- gcc/Makefile.in     (revision 187805)
> +++ gcc/Makefile.in     (working copy)
> @@ -2355,7 +2355,8 @@ tree-ssa-phiopt.o : tree-ssa-phiopt.c $(CONFIG_H)
>    $(TM_H) $(GGC_H) $(TREE_H) $(TM_P_H) $(BASIC_BLOCK_H) \
>    $(TREE_FLOW_H) $(TREE_PASS_H) $(TREE_DUMP_H) langhooks.h $(FLAGS_H) \
>    $(DIAGNOSTIC_H) $(TIMEVAR_H) pointer-set.h domwalk.h $(CFGLOOP_H) \
> -   $(TREE_DATA_REF_H) tree-pretty-print.h
> +   $(TREE_DATA_REF_H) tree-pretty-print.h gimple-pretty-print.h \
> +   insn-config.h $(EXPR_H) $(OPTABS_H)
>  tree-nrv.o : tree-nrv.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \
>    $(TM_H) $(TREE_H) $(FUNCTION_H) $(BASIC_BLOCK_H) $(FLAGS_H) \
>    $(DIAGNOSTIC_H) $(TREE_FLOW_H) $(TIMEVAR_H) $(TREE_DUMP_H) $(TREE_PASS_H) \
> Index: gcc/params.def
> ===================================================================
> --- gcc/params.def      (revision 187805)
> +++ gcc/params.def      (working copy)
> @@ -979,6 +979,13 @@ DEFPARAM (PARAM_MAX_TRACKED_STRLENS,
>          "track string lengths",
>          1000, 0, 0)
>
> +/* For adjacent load hoisting transformation in tree-ssa-phiopt.c.  */
> +DEFPARAM (PARAM_MIN_CMOVE_STRUCT_ALIGN,
> +         "min-cmove-struct-align",
> +         "Minimum byte alignment to assume for structures in the stack "
> +         "or heap when considering load hoisting for conditional moves",
> +         32, 8, 256)
> +
>  /*
>  Local variables:
>  mode:c
>
>

Re: [PATCH] Hoist adjacent pointer loads

by William J. Schmidt :: Rate this Message:

| View Threaded | Show Only this Message

On Mon, 2012-06-11 at 13:28 +0200, Richard Guenther wrote:

> On Mon, Jun 4, 2012 at 3:45 PM, William J. Schmidt
> <wschmidt@...> wrote:
> > Hi Richard,
> >
> > Here's a revision of the hoist-adjacent-loads patch.  I'm sorry for the
> > delay since the last revision, but my performance testing has been
> > blocked waiting for a fix to PR53487.  I ended up applying a test
> > version of the patch to 4.7 and ran performance numbers with that
> > instead, with no degradations.
> >
> > In addition to addressing your comments, this patch contains one bug fix
> > where local_mem_dependence was called on the wrong blocks after swapping
> > def1 and def2.
> >
> > Bootstrapped with no regressions on powerpc64-unknown-linux-gnu.  Is
> > this version ok for trunk?  I won't commit it until I can do final
> > testing on trunk in conjunction with a fix for PR53487.
> >
> > Thanks,
> > Bill
> >
> >
> > 2012-06-04  Bill Schmidt  <wschmidt@...>
> >
> >        * opts.c: Add -fhoist_adjacent_loads to -O2 and above.
> >        * tree-ssa-phiopt.c (tree_ssa_phiopt_worker): Add argument to forward
> >        declaration.
> >        (hoist_adjacent_loads, gate_hoist_loads): New forward declarations.
> >        (tree_ssa_phiopt): Call gate_hoist_loads.
> >        (tree_ssa_cs_elim): Add parm to tree_ssa_phiopt_worker call.
> >        (tree_ssa_phiopt_worker): Add do_hoist_loads to formal arg list; call
> >        hoist_adjacent_loads.
> >        (local_mem_dependence): New function.
> >        (hoist_adjacent_loads): Likewise.
> >        (gate_hoist_loads): Likewise.
> >        * common.opt (fhoist-adjacent-loads): New switch.
> >        * Makefile.in (tree-ssa-phiopt.o): Added dependencies.
> >        * params.def (PARAM_MIN_CMOVE_STRUCT_ALIGN): New param.
> >
> >
> > Index: gcc/opts.c
> > ===================================================================
> > --- gcc/opts.c  (revision 187805)
> > +++ gcc/opts.c  (working copy)
> > @@ -489,6 +489,7 @@ static const struct default_options default_option
> >     { OPT_LEVELS_2_PLUS, OPT_falign_functions, NULL, 1 },
> >     { OPT_LEVELS_2_PLUS, OPT_ftree_tail_merge, NULL, 1 },
> >     { OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_foptimize_strlen, NULL, 1 },
> > +    { OPT_LEVELS_2_PLUS, OPT_fhoist_adjacent_loads, NULL, 1 },
> >
> >     /* -O3 optimizations.  */
> >     { OPT_LEVELS_3_PLUS, OPT_ftree_loop_distribute_patterns, NULL, 1 },
> > Index: gcc/tree-ssa-phiopt.c
> > ===================================================================
> > --- gcc/tree-ssa-phiopt.c       (revision 187805)
> > +++ gcc/tree-ssa-phiopt.c       (working copy)
> > @@ -37,9 +37,17 @@ along with GCC; see the file COPYING3.  If not see
> >  #include "cfgloop.h"
> >  #include "tree-data-ref.h"
> >  #include "tree-pretty-print.h"
> > +#include "gimple-pretty-print.h"
> > +#include "insn-config.h"
> > +#include "expr.h"
> > +#include "optabs.h"
> >
> > +#ifndef HAVE_conditional_move
> > +#define HAVE_conditional_move (0)
> > +#endif
> > +
> >  static unsigned int tree_ssa_phiopt (void);
> > -static unsigned int tree_ssa_phiopt_worker (bool);
> > +static unsigned int tree_ssa_phiopt_worker (bool, bool);
> >  static bool conditional_replacement (basic_block, basic_block,
> >                                     edge, edge, gimple, tree, tree);
> >  static int value_replacement (basic_block, basic_block,
> > @@ -53,6 +61,9 @@ static bool cond_store_replacement (basic_block, b
> >  static bool cond_if_else_store_replacement (basic_block, basic_block, basic_block);
> >  static struct pointer_set_t * get_non_trapping (void);
> >  static void replace_phi_edge_with_variable (basic_block, edge, gimple, tree);
> > +static void hoist_adjacent_loads (basic_block, basic_block,
> > +                                 basic_block, basic_block);
> > +static bool gate_hoist_loads (void);
> >
> >  /* This pass tries to replaces an if-then-else block with an
> >    assignment.  We have four kinds of transformations.  Some of these
> > @@ -138,12 +149,56 @@ static void replace_phi_edge_with_variable (basic_
> >      bb2:
> >        x = PHI <x' (bb0), ...>;
> >
> > -   A similar transformation is done for MAX_EXPR.  */
> > +   A similar transformation is done for MAX_EXPR.
> >
> > +
> > +   This pass also performs a fifth transformation of a slightly different
> > +   flavor.
> > +
> > +   Adjacent Load Hoisting
> > +   ----------------------
> > +
> > +   This transformation replaces
> > +
> > +     bb0:
> > +       if (...) goto bb2; else goto bb1;
> > +     bb1:
> > +       x1 = (<expr>).field1;
> > +       goto bb3;
> > +     bb2:
> > +       x2 = (<expr>).field2;
> > +     bb3:
> > +       # x = PHI <x1, x2>;
> > +
> > +   with
> > +
> > +     bb0:
> > +       x1 = (<expr>).field1;
> > +       x2 = (<expr>).field2;
> > +       if (...) goto bb2; else goto bb1;
> > +     bb1:
> > +       goto bb3;
> > +     bb2:
> > +     bb3:
> > +       # x = PHI <x1, x2>;
> > +
> > +   The purpose of this transformation is to enable generation of conditional
> > +   move instructions such as Intel CMOVE or PowerPC ISEL.  Because one of
> > +   the loads is speculative, the transformation is restricted to very
> > +   specific cases to avoid introducing a page fault.  We are looking for
> > +   the common idiom:
> > +
> > +     if (...)
> > +       x = y->left;
> > +     else
> > +       x = y->right;
> > +
> > +   where left and right are typically adjacent pointers in a tree structure.  */
> > +
> >  static unsigned int
> >  tree_ssa_phiopt (void)
> >  {
> > -  return tree_ssa_phiopt_worker (false);
> > +  return tree_ssa_phiopt_worker (false, gate_hoist_loads ());
> >  }
> >
> >  /* This pass tries to transform conditional stores into unconditional
> > @@ -190,7 +245,7 @@ tree_ssa_phiopt (void)
> >  static unsigned int
> >  tree_ssa_cs_elim (void)
> >  {
> > -  return tree_ssa_phiopt_worker (true);
> > +  return tree_ssa_phiopt_worker (true, false);
> >  }
> >
> >  /* Return the singleton PHI in the SEQ of PHIs for edges E0 and E1. */
> > @@ -227,9 +282,11 @@ static tree condstoretemp;
> >  /* The core routine of conditional store replacement and normal
> >    phi optimizations.  Both share much of the infrastructure in how
> >    to match applicable basic block patterns.  DO_STORE_ELIM is true
> > -   when we want to do conditional store replacement, false otherwise.  */
> > +   when we want to do conditional store replacement, false otherwise.
> > +   DO_HOIST_LOADS is true when we want to hoist adjacent loads out
> > +   of diamond control flow patterns, false otherwise.  */
> >  static unsigned int
> > -tree_ssa_phiopt_worker (bool do_store_elim)
> > +tree_ssa_phiopt_worker (bool do_store_elim, bool do_hoist_loads)
> >  {
> >   basic_block bb;
> >   basic_block *bb_order;
> > @@ -312,6 +369,21 @@ static unsigned int
> >            cfgchanged = true;
> >          continue;
> >        }
> > +      else if (do_hoist_loads
> > +                && EDGE_SUCC (bb1, 0)->dest == EDGE_SUCC (bb2, 0)->dest)
> > +       {
> > +         basic_block bb3 = EDGE_SUCC (bb1, 0)->dest;
> > +
> > +         if (!FLOAT_TYPE_P (TREE_TYPE (gimple_cond_lhs (cond_stmt)))
> > +             && single_succ_p (bb1)
> > +             && single_succ_p (bb2)
> > +             && single_pred_p (bb1)
> > +             && single_pred_p (bb2)
> > +             && EDGE_COUNT (bb->succs) == 2
> > +             && EDGE_COUNT (bb3->preds) == 2)
> > +           hoist_adjacent_loads (bb, bb1, bb2, bb3);
> > +         continue;
> > +       }
> >       else
> >        continue;
> >
> > @@ -1707,6 +1779,207 @@ cond_if_else_store_replacement (basic_block then_b
> >   return ok;
> >  }
> >
> > +/* Return TRUE if STMT has a VUSE whose corresponding VDEF is in BB.  */
> > +
> > +static bool
> > +local_mem_dependence (gimple stmt, basic_block bb)
> > +{
> > +  tree vuse = gimple_vuse (stmt);
> > +  gimple def;
> > +
> > +  if (!vuse)
> > +    return false;
> > +
> > +  def = SSA_NAME_DEF_STMT (vuse);
> > +  return (def && gimple_bb (def) == bb);
> > +}
> > +
> > +/* Given a "diamond" control-flow pattern where BB0 tests a condition,
> > +   BB1 and BB2 are "then" and "else" blocks dependent on this test,
> > +   and BB3 rejoins control flow following BB1 and BB2, look for
> > +   opportunities to hoist loads as follows.  If BB3 contains a PHI of
> > +   two loads, one each occurring in BB1 and BB2, and the loads are
> > +   provably of adjacent fields in the same structure, then move both
> > +   loads into BB0.  Of course this can only be done if there are no
> > +   dependencies preventing such motion.
> > +
> > +   One of the hoisted loads will always be speculative, so the
> > +   transformation is currently conservative:
> > +
> > +    - The fields must be strictly adjacent.
> > +    - The two fields must occupy a single memory block that is
> > +      guaranteed to not cross a page boundary.
> > +
> > +    The last is difficult to prove, as such memory blocks should be
> > +    aligned on the minimum of the stack alignment boundary and the
> > +    alignment guaranteed by heap allocation interfaces.  Thus we rely
> > +    on a parameter for the alignment value.
> > +
> > +    Provided a good value is used for the last case, the first
> > +    restriction could possibly be relaxed.  */
> > +
> > +static void
> > +hoist_adjacent_loads (basic_block bb0, basic_block bb1,
> > +                     basic_block bb2, basic_block bb3)
> > +{
> > +  int param_align = PARAM_VALUE (PARAM_MIN_CMOVE_STRUCT_ALIGN);
> > +  unsigned param_align_bits = (unsigned) (param_align * BITS_PER_UNIT);
> > +  gimple_stmt_iterator gsi;
> > +
> > +  /* Walk the phis in bb3 looking for an opportunity.  We are looking
> > +     for phis of two SSA names, one each of which is defined in bb1 and
> > +     bb2.  */
> > +  for (gsi = gsi_start_phis (bb3); !gsi_end_p (gsi); gsi_next (&gsi))
> > +    {
> > +      gimple phi_stmt = gsi_stmt (gsi);
> > +      gimple def1, def2, defswap;
> > +      tree arg1, arg2, ref1, ref2, field1, field2, fieldswap;
> > +      tree tree_offset1, tree_offset2, tree_size2, next;
> > +      int offset1, offset2, size2;
> > +      unsigned align1;
> > +      gimple_stmt_iterator gsi2;
> > +      basic_block bb_for_def1, bb_for_def2;
> > +
> > +      if (gimple_phi_num_args (phi_stmt) != 2)
> > +       continue;
> > +
> > +      arg1 = gimple_phi_arg_def (phi_stmt, 0);
> > +      arg2 = gimple_phi_arg_def (phi_stmt, 1);
> > +
> > +      if (TREE_CODE (arg1) != SSA_NAME
> > +         || TREE_CODE (arg2) != SSA_NAME
> > +         || SSA_NAME_IS_DEFAULT_DEF (arg1)
> > +         || SSA_NAME_IS_DEFAULT_DEF (arg2))
> > +       continue;
> > +
> > +      def1 = SSA_NAME_DEF_STMT (arg1);
> > +      def2 = SSA_NAME_DEF_STMT (arg2);
> > +
> > +      if ((gimple_bb (def1) != bb1 || gimple_bb (def2) != bb2)
> > +         && (gimple_bb (def2) != bb1 || gimple_bb (def1) != bb2))
> > +       continue;
> > +
> > +      /* Check the mode of the arguments to be sure a conditional move
> > +        can be generated for it.  */
> > +      if (!optab_handler (cmov_optab, TYPE_MODE (TREE_TYPE (arg1))))
> > +       continue;
> > +
> > +      /* Both statements must be assignments whose RHS is a COMPONENT_REF.  */
> > +      if (!gimple_assign_single_p (def1)
> > +         || !gimple_assign_single_p (def2))
> > +       continue;
> > +
> > +      ref1 = gimple_assign_rhs1 (def1);
> > +      ref2 = gimple_assign_rhs1 (def2);
> > +
> > +      if (TREE_CODE (ref1) != COMPONENT_REF
> > +         || TREE_CODE (ref2) != COMPONENT_REF)
> > +       continue;
> > +
> > +      /* The zeroth operand of the two component references must be
> > +        identical.  It is not sufficient to compare get_base_address of
> > +        the two references, because this could allow for different
> > +        elements of the same array in the two trees.  It is not safe to
> > +        assume that the existence of one array element implies the
> > +        existence of a different one.  */
> > +      if (!operand_equal_p (TREE_OPERAND (ref1, 0), TREE_OPERAND (ref2, 0), 0))
> > +       continue;
> > +
> > +      field1 = TREE_OPERAND (ref1, 1);
> > +      field2 = TREE_OPERAND (ref2, 1);
> > +
> > +      /* Check for field adjacency, and ensure field1 comes first.  */
> > +      for (next = DECL_CHAIN (field1);
> > +          next && TREE_CODE (next) != FIELD_DECL;
> > +          next = DECL_CHAIN (next))
> > +       ;
> > +
> > +      if (next != field2)
> > +       {
> > +         for (next = DECL_CHAIN (field2);
> > +              next && TREE_CODE (next) != FIELD_DECL;
> > +              next = DECL_CHAIN (next))
> > +           ;
> > +
> > +         if (next != field1)
> > +           continue;
> > +
> > +         fieldswap = field1;
> > +         field1 = field2;
> > +         field2 = fieldswap;
> > +         defswap = def1;
> > +         def1 = def2;
> > +         def2 = defswap;
> > +         /* Don't swap bb1 and bb2 as we may have more than one
> > +            phi to process successfully.  */
> > +         bb_for_def1 = bb2;
> > +         bb_for_def2 = bb1;
> > +       }
> > +      else
> > +       {
> > +         bb_for_def1 = bb1;
> > +         bb_for_def2 = bb2;
> > +       }
> > +
> > +      /* Check for proper alignment of the first field.  */
> > +      tree_offset1 = bit_position (field1);
> > +      tree_offset2 = bit_position (field2);
> > +      tree_size2 = DECL_SIZE (field2);
> > +
> > +      if (!host_integerp (tree_offset1, 1)
> > +         || !host_integerp (tree_offset2, 1)
> > +         || !host_integerp (tree_size2, 1))
> > +       continue;
> > +
> > +      offset1 = TREE_INT_CST_LOW (tree_offset1);
> > +      offset2 = TREE_INT_CST_LOW (tree_offset2);
> > +      size2 = TREE_INT_CST_LOW (tree_size2);
> > +      align1 = DECL_ALIGN (field1) % param_align_bits;
> > +
> > +      if (offset1 % BITS_PER_UNIT != 0)
> > +       continue;
> > +
> > +      /* The two field references must fit within a block of memory that
> > +        is guaranteed to be on the same page.  */
> > +      if (align1 + offset2 - offset1 + size2 > param_align_bits)
> > +       continue;
>
> If param_align_bits were the cache-line size then this would be appropriate
> for a check if the transform is profitable.  But the comment suggests it
> is a check for correctness?  We do already have a l1-cache-line-size
> param btw.
>
> Did you check packed nested structs?  Like
>
> struct A {
>   int i;
>   int j;
> };
> struct {
>   char c;
>   struct A a;
> } __attribute__((packed)) x;
>
> and loads of x.a.i and x.a.j?  I _think_ that DECL_ALIGN of the i and j
> FIELD_DECL may still be 4 in this case.  Which means that if you
> are performing the check for correctness then it has to be a lot more
> conservative.
>
> I _think_ we have concluded that if you see loads of p->a and p->b
> then either both trap or both do not trap, so the check for correctness
> is that you see two COMPONENT_REFs of the same base for
> adjacent fields.

The above is largely a commentary problem.  This isn't a test for
correctness, but a heuristic for profitability.  We don't want to cross
page boundaries with this optimization, at least not often.  As you
point out, for correctness we avoid extra traps by checking for a common
base.  What we're trying to avoid here is two adjacent fields on two
different memory pages so we don't introduce a page fault.

Maybe this is overkill and should just be removed.  It is difficult to
be accurate and effective with such a test.  The idea was to
parameterize on a size that corresponds to the alignment guarantee of
the stack and heap, but if the parameter is too small, it disables the
optimization too frequently when it is useful.

So I could go with fixing the comment, removing the test, or updating it
to something that tries to avoid page faults in a better way, if you
have ideas for that.  Please let me know which seems best.

>
> Btw, you do not restrict the hoisting to the case where the the/else
> block will be empty afterwards?  It seems to be the transform is
> not profitable if we end up doing the if() anyways - it might even
> be not profitable (similar to the case where either path is very likely
> and the other not - conditional moves are mostly profitable when
> the possibility of the branches is equal).
>

In general it is difficult to make such a decision accurately.  I have
seen numerous cases where the then/else blocks contain two or more
opportunities for collapsing into a conditional move.  Looking at just
one of them, the opportunity would look bad if we restricted it to only
cases where the blocks became empty.  So that introduces more complexity
to check that all non-control stmts will be removed by some instance of
the transformation.

Also, even without removing the if, we sometimes see profitable
situations like:

if (...)
  {
    a = p.x;
    b = p.y;
    ...
  }
else
  {
    a = p.y;
    b = p.x;
    ...
  }

Here the loads are not speculative at all, so hoisting them is always
good.  (But perhaps PRE and later processing would pick up the cmove
possibilities, not sure.)

In any case I haven't been able to find benchmarks where the heuristic
as written is bad on PowerPC, and our conditional move instruction isn't
always the most efficient in the universe. ;-/  In general, the extra
load is "free" (from a dcache point of view) and the potential extra
fetch cost, etc., tends to wash out due to fetch-ahead inefficiencies
and so on.

Thanks,
Bill

> Thanks,
> Richard.
>
> > +
> > +      /* The two expressions cannot be dependent upon vdefs defined
> > +        in bb1/bb2.  */
> > +      if (local_mem_dependence (def1, bb_for_def1)
> > +         || local_mem_dependence (def2, bb_for_def2))
> > +       continue;
> > +
> > +      /* The conditions are satisfied; hoist the loads from bb1 and bb2 into
> > +        bb0.  We hoist the first one first so that a cache miss is handled
> > +         efficiently regardless of hardware cache-fill policy.  */
> > +      gsi2 = gsi_for_stmt (def1);
> > +      gsi_move_to_bb_end (&gsi2, bb0);
> > +      gsi2 = gsi_for_stmt (def2);
> > +      gsi_move_to_bb_end (&gsi2, bb0);
> > +
> > +      if (dump_file && (dump_flags & TDF_DETAILS))
> > +       {
> > +         fprintf (dump_file,
> > +                  "\nHoisting adjacent loads from %d and %d into %d: \n",
> > +                  bb_for_def1->index, bb_for_def2->index, bb0->index);
> > +         print_gimple_stmt (dump_file, def1, 0, TDF_VOPS|TDF_MEMSYMS);
> > +         print_gimple_stmt (dump_file, def2, 0, TDF_VOPS|TDF_MEMSYMS);
> > +       }
> > +    }
> > +}
> > +
> > +/* Determine whether we should attempt to hoist adjacent loads out of
> > +   diamond patterns in pass_phiopt.  Always hoist loads if
> > +   -fhoist-adjacent-loads is specified and the target machine has
> > +   a conditional move instruction.  */
> > +
> > +static bool
> > +gate_hoist_loads (void)
> > +{
> > +  return (flag_hoist_adjacent_loads == 1 && HAVE_conditional_move);
> > +}
> > +
> >  /* Always do these optimizations if we have SSA
> >    trees to work on.  */
> >  static bool
> > Index: gcc/common.opt
> > ===================================================================
> > --- gcc/common.opt      (revision 187805)
> > +++ gcc/common.opt      (working copy)
> > @@ -1186,6 +1186,11 @@ fgraphite-identity
> >  Common Report Var(flag_graphite_identity) Optimization
> >  Enable Graphite Identity transformation
> >
> > +fhoist-adjacent-loads
> > +Common Report Var(flag_hoist_adjacent_loads) Optimization
> > +Enable hoisting adjacent loads to encourage generating conditional move
> > +instructions
> > +
> >  floop-parallelize-all
> >  Common Report Var(flag_loop_parallelize_all) Optimization
> >  Mark all loops as parallel
> > Index: gcc/Makefile.in
> > ===================================================================
> > --- gcc/Makefile.in     (revision 187805)
> > +++ gcc/Makefile.in     (working copy)
> > @@ -2355,7 +2355,8 @@ tree-ssa-phiopt.o : tree-ssa-phiopt.c $(CONFIG_H)
> >    $(TM_H) $(GGC_H) $(TREE_H) $(TM_P_H) $(BASIC_BLOCK_H) \
> >    $(TREE_FLOW_H) $(TREE_PASS_H) $(TREE_DUMP_H) langhooks.h $(FLAGS_H) \
> >    $(DIAGNOSTIC_H) $(TIMEVAR_H) pointer-set.h domwalk.h $(CFGLOOP_H) \
> > -   $(TREE_DATA_REF_H) tree-pretty-print.h
> > +   $(TREE_DATA_REF_H) tree-pretty-print.h gimple-pretty-print.h \
> > +   insn-config.h $(EXPR_H) $(OPTABS_H)
> >  tree-nrv.o : tree-nrv.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \
> >    $(TM_H) $(TREE_H) $(FUNCTION_H) $(BASIC_BLOCK_H) $(FLAGS_H) \
> >    $(DIAGNOSTIC_H) $(TREE_FLOW_H) $(TIMEVAR_H) $(TREE_DUMP_H) $(TREE_PASS_H) \
> > Index: gcc/params.def
> > ===================================================================
> > --- gcc/params.def      (revision 187805)
> > +++ gcc/params.def      (working copy)
> > @@ -979,6 +979,13 @@ DEFPARAM (PARAM_MAX_TRACKED_STRLENS,
> >          "track string lengths",
> >          1000, 0, 0)
> >
> > +/* For adjacent load hoisting transformation in tree-ssa-phiopt.c.  */
> > +DEFPARAM (PARAM_MIN_CMOVE_STRUCT_ALIGN,
> > +         "min-cmove-struct-align",
> > +         "Minimum byte alignment to assume for structures in the stack "
> > +         "or heap when considering load hoisting for conditional moves",
> > +         32, 8, 256)
> > +
> >  /*
> >  Local variables:
> >  mode:c
> >
> >
>


Re: [PATCH] Hoist adjacent pointer loads

by Richard Biener :: Rate this Message:

| View Threaded | Show Only this Message

On Mon, 11 Jun 2012, William J. Schmidt wrote:

> On Mon, 2012-06-11 at 13:28 +0200, Richard Guenther wrote:
> > On Mon, Jun 4, 2012 at 3:45 PM, William J. Schmidt
> > <wschmidt@...> wrote:
> > > Hi Richard,
> > >
> > > Here's a revision of the hoist-adjacent-loads patch.  I'm sorry for the
> > > delay since the last revision, but my performance testing has been
> > > blocked waiting for a fix to PR53487.  I ended up applying a test
> > > version of the patch to 4.7 and ran performance numbers with that
> > > instead, with no degradations.
> > >
> > > In addition to addressing your comments, this patch contains one bug fix
> > > where local_mem_dependence was called on the wrong blocks after swapping
> > > def1 and def2.
> > >
> > > Bootstrapped with no regressions on powerpc64-unknown-linux-gnu.  Is
> > > this version ok for trunk?  I won't commit it until I can do final
> > > testing on trunk in conjunction with a fix for PR53487.
> > >
> > > Thanks,
> > > Bill
> > >
> > >
> > > 2012-06-04  Bill Schmidt  <wschmidt@...>
> > >
> > >        * opts.c: Add -fhoist_adjacent_loads to -O2 and above.
> > >        * tree-ssa-phiopt.c (tree_ssa_phiopt_worker): Add argument to forward
> > >        declaration.
> > >        (hoist_adjacent_loads, gate_hoist_loads): New forward declarations.
> > >        (tree_ssa_phiopt): Call gate_hoist_loads.
> > >        (tree_ssa_cs_elim): Add parm to tree_ssa_phiopt_worker call.
> > >        (tree_ssa_phiopt_worker): Add do_hoist_loads to formal arg list; call
> > >        hoist_adjacent_loads.
> > >        (local_mem_dependence): New function.
> > >        (hoist_adjacent_loads): Likewise.
> > >        (gate_hoist_loads): Likewise.
> > >        * common.opt (fhoist-adjacent-loads): New switch.
> > >        * Makefile.in (tree-ssa-phiopt.o): Added dependencies.
> > >        * params.def (PARAM_MIN_CMOVE_STRUCT_ALIGN): New param.
> > >
> > >
> > > Index: gcc/opts.c
> > > ===================================================================
> > > --- gcc/opts.c  (revision 187805)
> > > +++ gcc/opts.c  (working copy)
> > > @@ -489,6 +489,7 @@ static const struct default_options default_option
> > >     { OPT_LEVELS_2_PLUS, OPT_falign_functions, NULL, 1 },
> > >     { OPT_LEVELS_2_PLUS, OPT_ftree_tail_merge, NULL, 1 },
> > >     { OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_foptimize_strlen, NULL, 1 },
> > > +    { OPT_LEVELS_2_PLUS, OPT_fhoist_adjacent_loads, NULL, 1 },
> > >
> > >     /* -O3 optimizations.  */
> > >     { OPT_LEVELS_3_PLUS, OPT_ftree_loop_distribute_patterns, NULL, 1 },
> > > Index: gcc/tree-ssa-phiopt.c
> > > ===================================================================
> > > --- gcc/tree-ssa-phiopt.c       (revision 187805)
> > > +++ gcc/tree-ssa-phiopt.c       (working copy)
> > > @@ -37,9 +37,17 @@ along with GCC; see the file COPYING3.  If not see
> > >  #include "cfgloop.h"
> > >  #include "tree-data-ref.h"
> > >  #include "tree-pretty-print.h"
> > > +#include "gimple-pretty-print.h"
> > > +#include "insn-config.h"
> > > +#include "expr.h"
> > > +#include "optabs.h"
> > >
> > > +#ifndef HAVE_conditional_move
> > > +#define HAVE_conditional_move (0)
> > > +#endif
> > > +
> > >  static unsigned int tree_ssa_phiopt (void);
> > > -static unsigned int tree_ssa_phiopt_worker (bool);
> > > +static unsigned int tree_ssa_phiopt_worker (bool, bool);
> > >  static bool conditional_replacement (basic_block, basic_block,
> > >                                     edge, edge, gimple, tree, tree);
> > >  static int value_replacement (basic_block, basic_block,
> > > @@ -53,6 +61,9 @@ static bool cond_store_replacement (basic_block, b
> > >  static bool cond_if_else_store_replacement (basic_block, basic_block, basic_block);
> > >  static struct pointer_set_t * get_non_trapping (void);
> > >  static void replace_phi_edge_with_variable (basic_block, edge, gimple, tree);
> > > +static void hoist_adjacent_loads (basic_block, basic_block,
> > > +                                 basic_block, basic_block);
> > > +static bool gate_hoist_loads (void);
> > >
> > >  /* This pass tries to replaces an if-then-else block with an
> > >    assignment.  We have four kinds of transformations.  Some of these
> > > @@ -138,12 +149,56 @@ static void replace_phi_edge_with_variable (basic_
> > >      bb2:
> > >        x = PHI <x' (bb0), ...>;
> > >
> > > -   A similar transformation is done for MAX_EXPR.  */
> > > +   A similar transformation is done for MAX_EXPR.
> > >
> > > +
> > > +   This pass also performs a fifth transformation of a slightly different
> > > +   flavor.
> > > +
> > > +   Adjacent Load Hoisting
> > > +   ----------------------
> > > +
> > > +   This transformation replaces
> > > +
> > > +     bb0:
> > > +       if (...) goto bb2; else goto bb1;
> > > +     bb1:
> > > +       x1 = (<expr>).field1;
> > > +       goto bb3;
> > > +     bb2:
> > > +       x2 = (<expr>).field2;
> > > +     bb3:
> > > +       # x = PHI <x1, x2>;
> > > +
> > > +   with
> > > +
> > > +     bb0:
> > > +       x1 = (<expr>).field1;
> > > +       x2 = (<expr>).field2;
> > > +       if (...) goto bb2; else goto bb1;
> > > +     bb1:
> > > +       goto bb3;
> > > +     bb2:
> > > +     bb3:
> > > +       # x = PHI <x1, x2>;
> > > +
> > > +   The purpose of this transformation is to enable generation of conditional
> > > +   move instructions such as Intel CMOVE or PowerPC ISEL.  Because one of
> > > +   the loads is speculative, the transformation is restricted to very
> > > +   specific cases to avoid introducing a page fault.  We are looking for
> > > +   the common idiom:
> > > +
> > > +     if (...)
> > > +       x = y->left;
> > > +     else
> > > +       x = y->right;
> > > +
> > > +   where left and right are typically adjacent pointers in a tree structure.  */
> > > +
> > >  static unsigned int
> > >  tree_ssa_phiopt (void)
> > >  {
> > > -  return tree_ssa_phiopt_worker (false);
> > > +  return tree_ssa_phiopt_worker (false, gate_hoist_loads ());
> > >  }
> > >
> > >  /* This pass tries to transform conditional stores into unconditional
> > > @@ -190,7 +245,7 @@ tree_ssa_phiopt (void)
> > >  static unsigned int
> > >  tree_ssa_cs_elim (void)
> > >  {
> > > -  return tree_ssa_phiopt_worker (true);
> > > +  return tree_ssa_phiopt_worker (true, false);
> > >  }
> > >
> > >  /* Return the singleton PHI in the SEQ of PHIs for edges E0 and E1. */
> > > @@ -227,9 +282,11 @@ static tree condstoretemp;
> > >  /* The core routine of conditional store replacement and normal
> > >    phi optimizations.  Both share much of the infrastructure in how
> > >    to match applicable basic block patterns.  DO_STORE_ELIM is true
> > > -   when we want to do conditional store replacement, false otherwise.  */
> > > +   when we want to do conditional store replacement, false otherwise.
> > > +   DO_HOIST_LOADS is true when we want to hoist adjacent loads out
> > > +   of diamond control flow patterns, false otherwise.  */
> > >  static unsigned int
> > > -tree_ssa_phiopt_worker (bool do_store_elim)
> > > +tree_ssa_phiopt_worker (bool do_store_elim, bool do_hoist_loads)
> > >  {
> > >   basic_block bb;
> > >   basic_block *bb_order;
> > > @@ -312,6 +369,21 @@ static unsigned int
> > >            cfgchanged = true;
> > >          continue;
> > >        }
> > > +      else if (do_hoist_loads
> > > +                && EDGE_SUCC (bb1, 0)->dest == EDGE_SUCC (bb2, 0)->dest)
> > > +       {
> > > +         basic_block bb3 = EDGE_SUCC (bb1, 0)->dest;
> > > +
> > > +         if (!FLOAT_TYPE_P (TREE_TYPE (gimple_cond_lhs (cond_stmt)))
> > > +             && single_succ_p (bb1)
> > > +             && single_succ_p (bb2)
> > > +             && single_pred_p (bb1)
> > > +             && single_pred_p (bb2)
> > > +             && EDGE_COUNT (bb->succs) == 2
> > > +             && EDGE_COUNT (bb3->preds) == 2)
> > > +           hoist_adjacent_loads (bb, bb1, bb2, bb3);
> > > +         continue;
> > > +       }
> > >       else
> > >        continue;
> > >
> > > @@ -1707,6 +1779,207 @@ cond_if_else_store_replacement (basic_block then_b
> > >   return ok;
> > >  }
> > >
> > > +/* Return TRUE if STMT has a VUSE whose corresponding VDEF is in BB.  */
> > > +
> > > +static bool
> > > +local_mem_dependence (gimple stmt, basic_block bb)
> > > +{
> > > +  tree vuse = gimple_vuse (stmt);
> > > +  gimple def;
> > > +
> > > +  if (!vuse)
> > > +    return false;
> > > +
> > > +  def = SSA_NAME_DEF_STMT (vuse);
> > > +  return (def && gimple_bb (def) == bb);
> > > +}
> > > +
> > > +/* Given a "diamond" control-flow pattern where BB0 tests a condition,
> > > +   BB1 and BB2 are "then" and "else" blocks dependent on this test,
> > > +   and BB3 rejoins control flow following BB1 and BB2, look for
> > > +   opportunities to hoist loads as follows.  If BB3 contains a PHI of
> > > +   two loads, one each occurring in BB1 and BB2, and the loads are
> > > +   provably of adjacent fields in the same structure, then move both
> > > +   loads into BB0.  Of course this can only be done if there are no
> > > +   dependencies preventing such motion.
> > > +
> > > +   One of the hoisted loads will always be speculative, so the
> > > +   transformation is currently conservative:
> > > +
> > > +    - The fields must be strictly adjacent.
> > > +    - The two fields must occupy a single memory block that is
> > > +      guaranteed to not cross a page boundary.
> > > +
> > > +    The last is difficult to prove, as such memory blocks should be
> > > +    aligned on the minimum of the stack alignment boundary and the
> > > +    alignment guaranteed by heap allocation interfaces.  Thus we rely
> > > +    on a parameter for the alignment value.
> > > +
> > > +    Provided a good value is used for the last case, the first
> > > +    restriction could possibly be relaxed.  */
> > > +
> > > +static void
> > > +hoist_adjacent_loads (basic_block bb0, basic_block bb1,
> > > +                     basic_block bb2, basic_block bb3)
> > > +{
> > > +  int param_align = PARAM_VALUE (PARAM_MIN_CMOVE_STRUCT_ALIGN);
> > > +  unsigned param_align_bits = (unsigned) (param_align * BITS_PER_UNIT);
> > > +  gimple_stmt_iterator gsi;
> > > +
> > > +  /* Walk the phis in bb3 looking for an opportunity.  We are looking
> > > +     for phis of two SSA names, one each of which is defined in bb1 and
> > > +     bb2.  */
> > > +  for (gsi = gsi_start_phis (bb3); !gsi_end_p (gsi); gsi_next (&gsi))
> > > +    {
> > > +      gimple phi_stmt = gsi_stmt (gsi);
> > > +      gimple def1, def2, defswap;
> > > +      tree arg1, arg2, ref1, ref2, field1, field2, fieldswap;
> > > +      tree tree_offset1, tree_offset2, tree_size2, next;
> > > +      int offset1, offset2, size2;
> > > +      unsigned align1;
> > > +      gimple_stmt_iterator gsi2;
> > > +      basic_block bb_for_def1, bb_for_def2;
> > > +
> > > +      if (gimple_phi_num_args (phi_stmt) != 2)
> > > +       continue;
> > > +
> > > +      arg1 = gimple_phi_arg_def (phi_stmt, 0);
> > > +      arg2 = gimple_phi_arg_def (phi_stmt, 1);
> > > +
> > > +      if (TREE_CODE (arg1) != SSA_NAME
> > > +         || TREE_CODE (arg2) != SSA_NAME
> > > +         || SSA_NAME_IS_DEFAULT_DEF (arg1)
> > > +         || SSA_NAME_IS_DEFAULT_DEF (arg2))
> > > +       continue;
> > > +
> > > +      def1 = SSA_NAME_DEF_STMT (arg1);
> > > +      def2 = SSA_NAME_DEF_STMT (arg2);
> > > +
> > > +      if ((gimple_bb (def1) != bb1 || gimple_bb (def2) != bb2)
> > > +         && (gimple_bb (def2) != bb1 || gimple_bb (def1) != bb2))
> > > +       continue;
> > > +
> > > +      /* Check the mode of the arguments to be sure a conditional move
> > > +        can be generated for it.  */
> > > +      if (!optab_handler (cmov_optab, TYPE_MODE (TREE_TYPE (arg1))))
> > > +       continue;
> > > +
> > > +      /* Both statements must be assignments whose RHS is a COMPONENT_REF.  */
> > > +      if (!gimple_assign_single_p (def1)
> > > +         || !gimple_assign_single_p (def2))
> > > +       continue;
> > > +
> > > +      ref1 = gimple_assign_rhs1 (def1);
> > > +      ref2 = gimple_assign_rhs1 (def2);
> > > +
> > > +      if (TREE_CODE (ref1) != COMPONENT_REF
> > > +         || TREE_CODE (ref2) != COMPONENT_REF)
> > > +       continue;
> > > +
> > > +      /* The zeroth operand of the two component references must be
> > > +        identical.  It is not sufficient to compare get_base_address of
> > > +        the two references, because this could allow for different
> > > +        elements of the same array in the two trees.  It is not safe to
> > > +        assume that the existence of one array element implies the
> > > +        existence of a different one.  */
> > > +      if (!operand_equal_p (TREE_OPERAND (ref1, 0), TREE_OPERAND (ref2, 0), 0))
> > > +       continue;
> > > +
> > > +      field1 = TREE_OPERAND (ref1, 1);
> > > +      field2 = TREE_OPERAND (ref2, 1);
> > > +
> > > +      /* Check for field adjacency, and ensure field1 comes first.  */
> > > +      for (next = DECL_CHAIN (field1);
> > > +          next && TREE_CODE (next) != FIELD_DECL;
> > > +          next = DECL_CHAIN (next))
> > > +       ;
> > > +
> > > +      if (next != field2)
> > > +       {
> > > +         for (next = DECL_CHAIN (field2);
> > > +              next && TREE_CODE (next) != FIELD_DECL;
> > > +              next = DECL_CHAIN (next))
> > > +           ;
> > > +
> > > +         if (next != field1)
> > > +           continue;
> > > +
> > > +         fieldswap = field1;
> > > +         field1 = field2;
> > > +         field2 = fieldswap;
> > > +         defswap = def1;
> > > +         def1 = def2;
> > > +         def2 = defswap;
> > > +         /* Don't swap bb1 and bb2 as we may have more than one
> > > +            phi to process successfully.  */
> > > +         bb_for_def1 = bb2;
> > > +         bb_for_def2 = bb1;
> > > +       }
> > > +      else
> > > +       {
> > > +         bb_for_def1 = bb1;
> > > +         bb_for_def2 = bb2;
> > > +       }
> > > +
> > > +      /* Check for proper alignment of the first field.  */
> > > +      tree_offset1 = bit_position (field1);
> > > +      tree_offset2 = bit_position (field2);
> > > +      tree_size2 = DECL_SIZE (field2);
> > > +
> > > +      if (!host_integerp (tree_offset1, 1)
> > > +         || !host_integerp (tree_offset2, 1)
> > > +         || !host_integerp (tree_size2, 1))
> > > +       continue;
> > > +
> > > +      offset1 = TREE_INT_CST_LOW (tree_offset1);
> > > +      offset2 = TREE_INT_CST_LOW (tree_offset2);
> > > +      size2 = TREE_INT_CST_LOW (tree_size2);
> > > +      align1 = DECL_ALIGN (field1) % param_align_bits;
> > > +
> > > +      if (offset1 % BITS_PER_UNIT != 0)
> > > +       continue;
> > > +
> > > +      /* The two field references must fit within a block of memory that
> > > +        is guaranteed to be on the same page.  */
> > > +      if (align1 + offset2 - offset1 + size2 > param_align_bits)
> > > +       continue;
> >
> > If param_align_bits were the cache-line size then this would be appropriate
> > for a check if the transform is profitable.  But the comment suggests it
> > is a check for correctness?  We do already have a l1-cache-line-size
> > param btw.
> >
> > Did you check packed nested structs?  Like
> >
> > struct A {
> >   int i;
> >   int j;
> > };
> > struct {
> >   char c;
> >   struct A a;
> > } __attribute__((packed)) x;
> >
> > and loads of x.a.i and x.a.j?  I _think_ that DECL_ALIGN of the i and j
> > FIELD_DECL may still be 4 in this case.  Which means that if you
> > are performing the check for correctness then it has to be a lot more
> > conservative.
> >
> > I _think_ we have concluded that if you see loads of p->a and p->b
> > then either both trap or both do not trap, so the check for correctness
> > is that you see two COMPONENT_REFs of the same base for
> > adjacent fields.
>
> The above is largely a commentary problem.  This isn't a test for
> correctness, but a heuristic for profitability.  We don't want to cross
> page boundaries with this optimization, at least not often.  As you
> point out, for correctness we avoid extra traps by checking for a common
> base.  What we're trying to avoid here is two adjacent fields on two
> different memory pages so we don't introduce a page fault.
>
> Maybe this is overkill and should just be removed.  It is difficult to
> be accurate and effective with such a test.  The idea was to
> parameterize on a size that corresponds to the alignment guarantee of
> the stack and heap, but if the parameter is too small, it disables the
> optimization too frequently when it is useful.
>
> So I could go with fixing the comment, removing the test, or updating it
> to something that tries to avoid page faults in a better way, if you
> have ideas for that.  Please let me know which seems best.
Well, fixup the comment and re-use l1-cache-line-size instead of adding
a new one.  If we unconditionally fetch a new memory lane that's the
thing to base profitability on.

> >
> > Btw, you do not restrict the hoisting to the case where the the/else
> > block will be empty afterwards?  It seems to be the transform is
> > not profitable if we end up doing the if() anyways - it might even
> > be not profitable (similar to the case where either path is very likely
> > and the other not - conditional moves are mostly profitable when
> > the possibility of the branches is equal).
> >
>
> In general it is difficult to make such a decision accurately.  I have
> seen numerous cases where the then/else blocks contain two or more
> opportunities for collapsing into a conditional move.  Looking at just
> one of them, the opportunity would look bad if we restricted it to only
> cases where the blocks became empty.  So that introduces more complexity
> to check that all non-control stmts will be removed by some instance of
> the transformation.
>
> Also, even without removing the if, we sometimes see profitable
> situations like:
>
> if (...)
>   {
>     a = p.x;
>     b = p.y;
>     ...
>   }
> else
>   {
>     a = p.y;
>     b = p.x;
>     ...
>   }
>
> Here the loads are not speculative at all, so hoisting them is always
> good.  (But perhaps PRE and later processing would pick up the cmove
> possibilities, not sure.)
>
> In any case I haven't been able to find benchmarks where the heuristic
> as written is bad on PowerPC, and our conditional move instruction isn't
> always the most efficient in the universe. ;-/  In general, the extra
> load is "free" (from a dcache point of view) and the potential extra
> fetch cost, etc., tends to wash out due to fetch-ahead inefficiencies
> and so on.
What I've seen in other PRs is that cmov is slow compared to a
well-prediced conditional jump on x86.  Thus, can you restrict
speculation to branches with probabilities that are at least not
hot/cold?  Honza, which generic interface would you use for this
(maybe add a new one)?  (see predict.[ch]).

Thanks,
Richard.

>
> Thanks,
> Bill
>
> > Thanks,
> > Richard.
> >
> > > +
> > > +      /* The two expressions cannot be dependent upon vdefs defined
> > > +        in bb1/bb2.  */
> > > +      if (local_mem_dependence (def1, bb_for_def1)
> > > +         || local_mem_dependence (def2, bb_for_def2))
> > > +       continue;
> > > +
> > > +      /* The conditions are satisfied; hoist the loads from bb1 and bb2 into
> > > +        bb0.  We hoist the first one first so that a cache miss is handled
> > > +         efficiently regardless of hardware cache-fill policy.  */
> > > +      gsi2 = gsi_for_stmt (def1);
> > > +      gsi_move_to_bb_end (&gsi2, bb0);
> > > +      gsi2 = gsi_for_stmt (def2);
> > > +      gsi_move_to_bb_end (&gsi2, bb0);
> > > +
> > > +      if (dump_file && (dump_flags & TDF_DETAILS))
> > > +       {
> > > +         fprintf (dump_file,
> > > +                  "\nHoisting adjacent loads from %d and %d into %d: \n",
> > > +                  bb_for_def1->index, bb_for_def2->index, bb0->index);
> > > +         print_gimple_stmt (dump_file, def1, 0, TDF_VOPS|TDF_MEMSYMS);
> > > +         print_gimple_stmt (dump_file, def2, 0, TDF_VOPS|TDF_MEMSYMS);
> > > +       }
> > > +    }
> > > +}
> > > +
> > > +/* Determine whether we should attempt to hoist adjacent loads out of
> > > +   diamond patterns in pass_phiopt.  Always hoist loads if
> > > +   -fhoist-adjacent-loads is specified and the target machine has
> > > +   a conditional move instruction.  */
> > > +
> > > +static bool
> > > +gate_hoist_loads (void)
> > > +{
> > > +  return (flag_hoist_adjacent_loads == 1 && HAVE_conditional_move);
> > > +}
> > > +
> > >  /* Always do these optimizations if we have SSA
> > >    trees to work on.  */
> > >  static bool
> > > Index: gcc/common.opt
> > > ===================================================================
> > > --- gcc/common.opt      (revision 187805)
> > > +++ gcc/common.opt      (working copy)
> > > @@ -1186,6 +1186,11 @@ fgraphite-identity
> > >  Common Report Var(flag_graphite_identity) Optimization
> > >  Enable Graphite Identity transformation
> > >
> > > +fhoist-adjacent-loads
> > > +Common Report Var(flag_hoist_adjacent_loads) Optimization
> > > +Enable hoisting adjacent loads to encourage generating conditional move
> > > +instructions
> > > +
> > >  floop-parallelize-all
> > >  Common Report Var(flag_loop_parallelize_all) Optimization
> > >  Mark all loops as parallel
> > > Index: gcc/Makefile.in
> > > ===================================================================
> > > --- gcc/Makefile.in     (revision 187805)
> > > +++ gcc/Makefile.in     (working copy)
> > > @@ -2355,7 +2355,8 @@ tree-ssa-phiopt.o : tree-ssa-phiopt.c $(CONFIG_H)
> > >    $(TM_H) $(GGC_H) $(TREE_H) $(TM_P_H) $(BASIC_BLOCK_H) \
> > >    $(TREE_FLOW_H) $(TREE_PASS_H) $(TREE_DUMP_H) langhooks.h $(FLAGS_H) \
> > >    $(DIAGNOSTIC_H) $(TIMEVAR_H) pointer-set.h domwalk.h $(CFGLOOP_H) \
> > > -   $(TREE_DATA_REF_H) tree-pretty-print.h
> > > +   $(TREE_DATA_REF_H) tree-pretty-print.h gimple-pretty-print.h \
> > > +   insn-config.h $(EXPR_H) $(OPTABS_H)
> > >  tree-nrv.o : tree-nrv.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \
> > >    $(TM_H) $(TREE_H) $(FUNCTION_H) $(BASIC_BLOCK_H) $(FLAGS_H) \
> > >    $(DIAGNOSTIC_H) $(TREE_FLOW_H) $(TIMEVAR_H) $(TREE_DUMP_H) $(TREE_PASS_H) \
> > > Index: gcc/params.def
> > > ===================================================================
> > > --- gcc/params.def      (revision 187805)
> > > +++ gcc/params.def      (working copy)
> > > @@ -979,6 +979,13 @@ DEFPARAM (PARAM_MAX_TRACKED_STRLENS,
> > >          "track string lengths",
> > >          1000, 0, 0)
> > >
> > > +/* For adjacent load hoisting transformation in tree-ssa-phiopt.c.  */
> > > +DEFPARAM (PARAM_MIN_CMOVE_STRUCT_ALIGN,
> > > +         "min-cmove-struct-align",
> > > +         "Minimum byte alignment to assume for structures in the stack "
> > > +         "or heap when considering load hoisting for conditional moves",
> > > +         32, 8, 256)
> > > +
> > >  /*
> > >  Local variables:
> > >  mode:c
> > >
> > >
> >
>
>
--
Richard Guenther <rguenther@...>
SUSE / SUSE Labs
SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746
GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer

Re: [PATCH] Hoist adjacent pointer loads

by William J. Schmidt :: Rate this Message:

| View Threaded | Show Only this Message

On Mon, 2012-06-11 at 14:59 +0200, Richard Guenther wrote:

> On Mon, 11 Jun 2012, William J. Schmidt wrote:
>
> > On Mon, 2012-06-11 at 13:28 +0200, Richard Guenther wrote:
> > > On Mon, Jun 4, 2012 at 3:45 PM, William J. Schmidt
> > > <wschmidt@...> wrote:
> > > > Hi Richard,
> > > >
> > > > Here's a revision of the hoist-adjacent-loads patch.  I'm sorry for the
> > > > delay since the last revision, but my performance testing has been
> > > > blocked waiting for a fix to PR53487.  I ended up applying a test
> > > > version of the patch to 4.7 and ran performance numbers with that
> > > > instead, with no degradations.
> > > >
> > > > In addition to addressing your comments, this patch contains one bug fix
> > > > where local_mem_dependence was called on the wrong blocks after swapping
> > > > def1 and def2.
> > > >
> > > > Bootstrapped with no regressions on powerpc64-unknown-linux-gnu.  Is
> > > > this version ok for trunk?  I won't commit it until I can do final
> > > > testing on trunk in conjunction with a fix for PR53487.
> > > >
> > > > Thanks,
> > > > Bill
> > > >
> > > >
> > > > 2012-06-04  Bill Schmidt  <wschmidt@...>
> > > >
> > > >        * opts.c: Add -fhoist_adjacent_loads to -O2 and above.
> > > >        * tree-ssa-phiopt.c (tree_ssa_phiopt_worker): Add argument to forward
> > > >        declaration.
> > > >        (hoist_adjacent_loads, gate_hoist_loads): New forward declarations.
> > > >        (tree_ssa_phiopt): Call gate_hoist_loads.
> > > >        (tree_ssa_cs_elim): Add parm to tree_ssa_phiopt_worker call.
> > > >        (tree_ssa_phiopt_worker): Add do_hoist_loads to formal arg list; call
> > > >        hoist_adjacent_loads.
> > > >        (local_mem_dependence): New function.
> > > >        (hoist_adjacent_loads): Likewise.
> > > >        (gate_hoist_loads): Likewise.
> > > >        * common.opt (fhoist-adjacent-loads): New switch.
> > > >        * Makefile.in (tree-ssa-phiopt.o): Added dependencies.
> > > >        * params.def (PARAM_MIN_CMOVE_STRUCT_ALIGN): New param.
> > > >
> > > >
> > > > Index: gcc/opts.c
> > > > ===================================================================
> > > > --- gcc/opts.c  (revision 187805)
> > > > +++ gcc/opts.c  (working copy)
> > > > @@ -489,6 +489,7 @@ static const struct default_options default_option
> > > >     { OPT_LEVELS_2_PLUS, OPT_falign_functions, NULL, 1 },
> > > >     { OPT_LEVELS_2_PLUS, OPT_ftree_tail_merge, NULL, 1 },
> > > >     { OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_foptimize_strlen, NULL, 1 },
> > > > +    { OPT_LEVELS_2_PLUS, OPT_fhoist_adjacent_loads, NULL, 1 },
> > > >
> > > >     /* -O3 optimizations.  */
> > > >     { OPT_LEVELS_3_PLUS, OPT_ftree_loop_distribute_patterns, NULL, 1 },
> > > > Index: gcc/tree-ssa-phiopt.c
> > > > ===================================================================
> > > > --- gcc/tree-ssa-phiopt.c       (revision 187805)
> > > > +++ gcc/tree-ssa-phiopt.c       (working copy)
> > > > @@ -37,9 +37,17 @@ along with GCC; see the file COPYING3.  If not see
> > > >  #include "cfgloop.h"
> > > >  #include "tree-data-ref.h"
> > > >  #include "tree-pretty-print.h"
> > > > +#include "gimple-pretty-print.h"
> > > > +#include "insn-config.h"
> > > > +#include "expr.h"
> > > > +#include "optabs.h"
> > > >
> > > > +#ifndef HAVE_conditional_move
> > > > +#define HAVE_conditional_move (0)
> > > > +#endif
> > > > +
> > > >  static unsigned int tree_ssa_phiopt (void);
> > > > -static unsigned int tree_ssa_phiopt_worker (bool);
> > > > +static unsigned int tree_ssa_phiopt_worker (bool, bool);
> > > >  static bool conditional_replacement (basic_block, basic_block,
> > > >                                     edge, edge, gimple, tree, tree);
> > > >  static int value_replacement (basic_block, basic_block,
> > > > @@ -53,6 +61,9 @@ static bool cond_store_replacement (basic_block, b
> > > >  static bool cond_if_else_store_replacement (basic_block, basic_block, basic_block);
> > > >  static struct pointer_set_t * get_non_trapping (void);
> > > >  static void replace_phi_edge_with_variable (basic_block, edge, gimple, tree);
> > > > +static void hoist_adjacent_loads (basic_block, basic_block,
> > > > +                                 basic_block, basic_block);
> > > > +static bool gate_hoist_loads (void);
> > > >
> > > >  /* This pass tries to replaces an if-then-else block with an
> > > >    assignment.  We have four kinds of transformations.  Some of these
> > > > @@ -138,12 +149,56 @@ static void replace_phi_edge_with_variable (basic_
> > > >      bb2:
> > > >        x = PHI <x' (bb0), ...>;
> > > >
> > > > -   A similar transformation is done for MAX_EXPR.  */
> > > > +   A similar transformation is done for MAX_EXPR.
> > > >
> > > > +
> > > > +   This pass also performs a fifth transformation of a slightly different
> > > > +   flavor.
> > > > +
> > > > +   Adjacent Load Hoisting
> > > > +   ----------------------
> > > > +
> > > > +   This transformation replaces
> > > > +
> > > > +     bb0:
> > > > +       if (...) goto bb2; else goto bb1;
> > > > +     bb1:
> > > > +       x1 = (<expr>).field1;
> > > > +       goto bb3;
> > > > +     bb2:
> > > > +       x2 = (<expr>).field2;
> > > > +     bb3:
> > > > +       # x = PHI <x1, x2>;
> > > > +
> > > > +   with
> > > > +
> > > > +     bb0:
> > > > +       x1 = (<expr>).field1;
> > > > +       x2 = (<expr>).field2;
> > > > +       if (...) goto bb2; else goto bb1;
> > > > +     bb1:
> > > > +       goto bb3;
> > > > +     bb2:
> > > > +     bb3:
> > > > +       # x = PHI <x1, x2>;
> > > > +
> > > > +   The purpose of this transformation is to enable generation of conditional
> > > > +   move instructions such as Intel CMOVE or PowerPC ISEL.  Because one of
> > > > +   the loads is speculative, the transformation is restricted to very
> > > > +   specific cases to avoid introducing a page fault.  We are looking for
> > > > +   the common idiom:
> > > > +
> > > > +     if (...)
> > > > +       x = y->left;
> > > > +     else
> > > > +       x = y->right;
> > > > +
> > > > +   where left and right are typically adjacent pointers in a tree structure.  */
> > > > +
> > > >  static unsigned int
> > > >  tree_ssa_phiopt (void)
> > > >  {
> > > > -  return tree_ssa_phiopt_worker (false);
> > > > +  return tree_ssa_phiopt_worker (false, gate_hoist_loads ());
> > > >  }
> > > >
> > > >  /* This pass tries to transform conditional stores into unconditional
> > > > @@ -190,7 +245,7 @@ tree_ssa_phiopt (void)
> > > >  static unsigned int
> > > >  tree_ssa_cs_elim (void)
> > > >  {
> > > > -  return tree_ssa_phiopt_worker (true);
> > > > +  return tree_ssa_phiopt_worker (true, false);
> > > >  }
> > > >
> > > >  /* Return the singleton PHI in the SEQ of PHIs for edges E0 and E1. */
> > > > @@ -227,9 +282,11 @@ static tree condstoretemp;
> > > >  /* The core routine of conditional store replacement and normal
> > > >    phi optimizations.  Both share much of the infrastructure in how
> > > >    to match applicable basic block patterns.  DO_STORE_ELIM is true
> > > > -   when we want to do conditional store replacement, false otherwise.  */
> > > > +   when we want to do conditional store replacement, false otherwise.
> > > > +   DO_HOIST_LOADS is true when we want to hoist adjacent loads out
> > > > +   of diamond control flow patterns, false otherwise.  */
> > > >  static unsigned int
> > > > -tree_ssa_phiopt_worker (bool do_store_elim)
> > > > +tree_ssa_phiopt_worker (bool do_store_elim, bool do_hoist_loads)
> > > >  {
> > > >   basic_block bb;
> > > >   basic_block *bb_order;
> > > > @@ -312,6 +369,21 @@ static unsigned int
> > > >            cfgchanged = true;
> > > >          continue;
> > > >        }
> > > > +      else if (do_hoist_loads
> > > > +                && EDGE_SUCC (bb1, 0)->dest == EDGE_SUCC (bb2, 0)->dest)
> > > > +       {
> > > > +         basic_block bb3 = EDGE_SUCC (bb1, 0)->dest;
> > > > +
> > > > +         if (!FLOAT_TYPE_P (TREE_TYPE (gimple_cond_lhs (cond_stmt)))
> > > > +             && single_succ_p (bb1)
> > > > +             && single_succ_p (bb2)
> > > > +             && single_pred_p (bb1)
> > > > +             && single_pred_p (bb2)
> > > > +             && EDGE_COUNT (bb->succs) == 2
> > > > +             && EDGE_COUNT (bb3->preds) == 2)
> > > > +           hoist_adjacent_loads (bb, bb1, bb2, bb3);
> > > > +         continue;
> > > > +       }
> > > >       else
> > > >        continue;
> > > >
> > > > @@ -1707,6 +1779,207 @@ cond_if_else_store_replacement (basic_block then_b
> > > >   return ok;
> > > >  }
> > > >
> > > > +/* Return TRUE if STMT has a VUSE whose corresponding VDEF is in BB.  */
> > > > +
> > > > +static bool
> > > > +local_mem_dependence (gimple stmt, basic_block bb)
> > > > +{
> > > > +  tree vuse = gimple_vuse (stmt);
> > > > +  gimple def;
> > > > +
> > > > +  if (!vuse)
> > > > +    return false;
> > > > +
> > > > +  def = SSA_NAME_DEF_STMT (vuse);
> > > > +  return (def && gimple_bb (def) == bb);
> > > > +}
> > > > +
> > > > +/* Given a "diamond" control-flow pattern where BB0 tests a condition,
> > > > +   BB1 and BB2 are "then" and "else" blocks dependent on this test,
> > > > +   and BB3 rejoins control flow following BB1 and BB2, look for
> > > > +   opportunities to hoist loads as follows.  If BB3 contains a PHI of
> > > > +   two loads, one each occurring in BB1 and BB2, and the loads are
> > > > +   provably of adjacent fields in the same structure, then move both
> > > > +   loads into BB0.  Of course this can only be done if there are no
> > > > +   dependencies preventing such motion.
> > > > +
> > > > +   One of the hoisted loads will always be speculative, so the
> > > > +   transformation is currently conservative:
> > > > +
> > > > +    - The fields must be strictly adjacent.
> > > > +    - The two fields must occupy a single memory block that is
> > > > +      guaranteed to not cross a page boundary.
> > > > +
> > > > +    The last is difficult to prove, as such memory blocks should be
> > > > +    aligned on the minimum of the stack alignment boundary and the
> > > > +    alignment guaranteed by heap allocation interfaces.  Thus we rely
> > > > +    on a parameter for the alignment value.
> > > > +
> > > > +    Provided a good value is used for the last case, the first
> > > > +    restriction could possibly be relaxed.  */
> > > > +
> > > > +static void
> > > > +hoist_adjacent_loads (basic_block bb0, basic_block bb1,
> > > > +                     basic_block bb2, basic_block bb3)
> > > > +{
> > > > +  int param_align = PARAM_VALUE (PARAM_MIN_CMOVE_STRUCT_ALIGN);
> > > > +  unsigned param_align_bits = (unsigned) (param_align * BITS_PER_UNIT);
> > > > +  gimple_stmt_iterator gsi;
> > > > +
> > > > +  /* Walk the phis in bb3 looking for an opportunity.  We are looking
> > > > +     for phis of two SSA names, one each of which is defined in bb1 and
> > > > +     bb2.  */
> > > > +  for (gsi = gsi_start_phis (bb3); !gsi_end_p (gsi); gsi_next (&gsi))
> > > > +    {
> > > > +      gimple phi_stmt = gsi_stmt (gsi);
> > > > +      gimple def1, def2, defswap;
> > > > +      tree arg1, arg2, ref1, ref2, field1, field2, fieldswap;
> > > > +      tree tree_offset1, tree_offset2, tree_size2, next;
> > > > +      int offset1, offset2, size2;
> > > > +      unsigned align1;
> > > > +      gimple_stmt_iterator gsi2;
> > > > +      basic_block bb_for_def1, bb_for_def2;
> > > > +
> > > > +      if (gimple_phi_num_args (phi_stmt) != 2)
> > > > +       continue;
> > > > +
> > > > +      arg1 = gimple_phi_arg_def (phi_stmt, 0);
> > > > +      arg2 = gimple_phi_arg_def (phi_stmt, 1);
> > > > +
> > > > +      if (TREE_CODE (arg1) != SSA_NAME
> > > > +         || TREE_CODE (arg2) != SSA_NAME
> > > > +         || SSA_NAME_IS_DEFAULT_DEF (arg1)
> > > > +         || SSA_NAME_IS_DEFAULT_DEF (arg2))
> > > > +       continue;
> > > > +
> > > > +      def1 = SSA_NAME_DEF_STMT (arg1);
> > > > +      def2 = SSA_NAME_DEF_STMT (arg2);
> > > > +
> > > > +      if ((gimple_bb (def1) != bb1 || gimple_bb (def2) != bb2)
> > > > +         && (gimple_bb (def2) != bb1 || gimple_bb (def1) != bb2))
> > > > +       continue;
> > > > +
> > > > +      /* Check the mode of the arguments to be sure a conditional move
> > > > +        can be generated for it.  */
> > > > +      if (!optab_handler (cmov_optab, TYPE_MODE (TREE_TYPE (arg1))))
> > > > +       continue;
> > > > +
> > > > +      /* Both statements must be assignments whose RHS is a COMPONENT_REF.  */
> > > > +      if (!gimple_assign_single_p (def1)
> > > > +         || !gimple_assign_single_p (def2))
> > > > +       continue;
> > > > +
> > > > +      ref1 = gimple_assign_rhs1 (def1);
> > > > +      ref2 = gimple_assign_rhs1 (def2);
> > > > +
> > > > +      if (TREE_CODE (ref1) != COMPONENT_REF
> > > > +         || TREE_CODE (ref2) != COMPONENT_REF)
> > > > +       continue;
> > > > +
> > > > +      /* The zeroth operand of the two component references must be
> > > > +        identical.  It is not sufficient to compare get_base_address of
> > > > +        the two references, because this could allow for different
> > > > +        elements of the same array in the two trees.  It is not safe to
> > > > +        assume that the existence of one array element implies the
> > > > +        existence of a different one.  */
> > > > +      if (!operand_equal_p (TREE_OPERAND (ref1, 0), TREE_OPERAND (ref2, 0), 0))
> > > > +       continue;
> > > > +
> > > > +      field1 = TREE_OPERAND (ref1, 1);
> > > > +      field2 = TREE_OPERAND (ref2, 1);
> > > > +
> > > > +      /* Check for field adjacency, and ensure field1 comes first.  */
> > > > +      for (next = DECL_CHAIN (field1);
> > > > +          next && TREE_CODE (next) != FIELD_DECL;
> > > > +          next = DECL_CHAIN (next))
> > > > +       ;
> > > > +
> > > > +      if (next != field2)
> > > > +       {
> > > > +         for (next = DECL_CHAIN (field2);
> > > > +              next && TREE_CODE (next) != FIELD_DECL;
> > > > +              next = DECL_CHAIN (next))
> > > > +           ;
> > > > +
> > > > +         if (next != field1)
> > > > +           continue;
> > > > +
> > > > +         fieldswap = field1;
> > > > +         field1 = field2;
> > > > +         field2 = fieldswap;
> > > > +         defswap = def1;
> > > > +         def1 = def2;
> > > > +         def2 = defswap;
> > > > +         /* Don't swap bb1 and bb2 as we may have more than one
> > > > +            phi to process successfully.  */
> > > > +         bb_for_def1 = bb2;
> > > > +         bb_for_def2 = bb1;
> > > > +       }
> > > > +      else
> > > > +       {
> > > > +         bb_for_def1 = bb1;
> > > > +         bb_for_def2 = bb2;
> > > > +       }
> > > > +
> > > > +      /* Check for proper alignment of the first field.  */
> > > > +      tree_offset1 = bit_position (field1);
> > > > +      tree_offset2 = bit_position (field2);
> > > > +      tree_size2 = DECL_SIZE (field2);
> > > > +
> > > > +      if (!host_integerp (tree_offset1, 1)
> > > > +         || !host_integerp (tree_offset2, 1)
> > > > +         || !host_integerp (tree_size2, 1))
> > > > +       continue;
> > > > +
> > > > +      offset1 = TREE_INT_CST_LOW (tree_offset1);
> > > > +      offset2 = TREE_INT_CST_LOW (tree_offset2);
> > > > +      size2 = TREE_INT_CST_LOW (tree_size2);
> > > > +      align1 = DECL_ALIGN (field1) % param_align_bits;
> > > > +
> > > > +      if (offset1 % BITS_PER_UNIT != 0)
> > > > +       continue;
> > > > +
> > > > +      /* The two field references must fit within a block of memory that
> > > > +        is guaranteed to be on the same page.  */
> > > > +      if (align1 + offset2 - offset1 + size2 > param_align_bits)
> > > > +       continue;
> > >
> > > If param_align_bits were the cache-line size then this would be appropriate
> > > for a check if the transform is profitable.  But the comment suggests it
> > > is a check for correctness?  We do already have a l1-cache-line-size
> > > param btw.
> > >
> > > Did you check packed nested structs?  Like
> > >
> > > struct A {
> > >   int i;
> > >   int j;
> > > };
> > > struct {
> > >   char c;
> > >   struct A a;
> > > } __attribute__((packed)) x;
> > >
> > > and loads of x.a.i and x.a.j?  I _think_ that DECL_ALIGN of the i and j
> > > FIELD_DECL may still be 4 in this case.  Which means that if you
> > > are performing the check for correctness then it has to be a lot more
> > > conservative.
> > >
> > > I _think_ we have concluded that if you see loads of p->a and p->b
> > > then either both trap or both do not trap, so the check for correctness
> > > is that you see two COMPONENT_REFs of the same base for
> > > adjacent fields.
> >
> > The above is largely a commentary problem.  This isn't a test for
> > correctness, but a heuristic for profitability.  We don't want to cross
> > page boundaries with this optimization, at least not often.  As you
> > point out, for correctness we avoid extra traps by checking for a common
> > base.  What we're trying to avoid here is two adjacent fields on two
> > different memory pages so we don't introduce a page fault.
> >
> > Maybe this is overkill and should just be removed.  It is difficult to
> > be accurate and effective with such a test.  The idea was to
> > parameterize on a size that corresponds to the alignment guarantee of
> > the stack and heap, but if the parameter is too small, it disables the
> > optimization too frequently when it is useful.
> >
> > So I could go with fixing the comment, removing the test, or updating it
> > to something that tries to avoid page faults in a better way, if you
> > have ideas for that.  Please let me know which seems best.
>
> Well, fixup the comment and re-use l1-cache-line-size instead of adding
> a new one.  If we unconditionally fetch a new memory lane that's the
> thing to base profitability on.

OK, sounds good.

>
> > >
> > > Btw, you do not restrict the hoisting to the case where the the/else
> > > block will be empty afterwards?  It seems to be the transform is
> > > not profitable if we end up doing the if() anyways - it might even
> > > be not profitable (similar to the case where either path is very likely
> > > and the other not - conditional moves are mostly profitable when
> > > the possibility of the branches is equal).
> > >
> >
> > In general it is difficult to make such a decision accurately.  I have
> > seen numerous cases where the then/else blocks contain two or more
> > opportunities for collapsing into a conditional move.  Looking at just
> > one of them, the opportunity would look bad if we restricted it to only
> > cases where the blocks became empty.  So that introduces more complexity
> > to check that all non-control stmts will be removed by some instance of
> > the transformation.
> >
> > Also, even without removing the if, we sometimes see profitable
> > situations like:
> >
> > if (...)
> >   {
> >     a = p.x;
> >     b = p.y;
> >     ...
> >   }
> > else
> >   {
> >     a = p.y;
> >     b = p.x;
> >     ...
> >   }
> >
> > Here the loads are not speculative at all, so hoisting them is always
> > good.  (But perhaps PRE and later processing would pick up the cmove
> > possibilities, not sure.)
> >
> > In any case I haven't been able to find benchmarks where the heuristic
> > as written is bad on PowerPC, and our conditional move instruction isn't
> > always the most efficient in the universe. ;-/  In general, the extra
> > load is "free" (from a dcache point of view) and the potential extra
> > fetch cost, etc., tends to wash out due to fetch-ahead inefficiencies
> > and so on.
>
> What I've seen in other PRs is that cmov is slow compared to a
> well-prediced conditional jump on x86.  Thus, can you restrict
> speculation to branches with probabilities that are at least not
> hot/cold?  Honza, which generic interface would you use for this
> (maybe add a new one)?  (see predict.[ch]).

I found this parameter that seems to correspond to well-predicted
conditional jumps:

/* When branch is predicted to be taken with probability lower than this
   threshold (in percent), then it is considered well predictable. */
DEFPARAM (PARAM_PREDICTABLE_BRANCH_OUTCOME,
          "predictable-branch-outcome",
          "Maximal estimated outcome of branch considered predictable",
          2, 0, 50)

I'll plan to use that unless I hear objections.

Thanks,
Bill

>
> Thanks,
> Richard.
>
> >
> > Thanks,
> > Bill
> >
> > > Thanks,
> > > Richard.
> > >
> > > > +
> > > > +      /* The two expressions cannot be dependent upon vdefs defined
> > > > +        in bb1/bb2.  */
> > > > +      if (local_mem_dependence (def1, bb_for_def1)
> > > > +         || local_mem_dependence (def2, bb_for_def2))
> > > > +       continue;
> > > > +
> > > > +      /* The conditions are satisfied; hoist the loads from bb1 and bb2 into
> > > > +        bb0.  We hoist the first one first so that a cache miss is handled
> > > > +         efficiently regardless of hardware cache-fill policy.  */
> > > > +      gsi2 = gsi_for_stmt (def1);
> > > > +      gsi_move_to_bb_end (&gsi2, bb0);
> > > > +      gsi2 = gsi_for_stmt (def2);
> > > > +      gsi_move_to_bb_end (&gsi2, bb0);
> > > > +
> > > > +      if (dump_file && (dump_flags & TDF_DETAILS))
> > > > +       {
> > > > +         fprintf (dump_file,
> > > > +                  "\nHoisting adjacent loads from %d and %d into %d: \n",
> > > > +                  bb_for_def1->index, bb_for_def2->index, bb0->index);
> > > > +         print_gimple_stmt (dump_file, def1, 0, TDF_VOPS|TDF_MEMSYMS);
> > > > +         print_gimple_stmt (dump_file, def2, 0, TDF_VOPS|TDF_MEMSYMS);
> > > > +       }
> > > > +    }
> > > > +}
> > > > +
> > > > +/* Determine whether we should attempt to hoist adjacent loads out of
> > > > +   diamond patterns in pass_phiopt.  Always hoist loads if
> > > > +   -fhoist-adjacent-loads is specified and the target machine has
> > > > +   a conditional move instruction.  */
> > > > +
> > > > +static bool
> > > > +gate_hoist_loads (void)
> > > > +{
> > > > +  return (flag_hoist_adjacent_loads == 1 && HAVE_conditional_move);
> > > > +}
> > > > +
> > > >  /* Always do these optimizations if we have SSA
> > > >    trees to work on.  */
> > > >  static bool
> > > > Index: gcc/common.opt
> > > > ===================================================================
> > > > --- gcc/common.opt      (revision 187805)
> > > > +++ gcc/common.opt      (working copy)
> > > > @@ -1186,6 +1186,11 @@ fgraphite-identity
> > > >  Common Report Var(flag_graphite_identity) Optimization
> > > >  Enable Graphite Identity transformation
> > > >
> > > > +fhoist-adjacent-loads
> > > > +Common Report Var(flag_hoist_adjacent_loads) Optimization
> > > > +Enable hoisting adjacent loads to encourage generating conditional move
> > > > +instructions
> > > > +
> > > >  floop-parallelize-all
> > > >  Common Report Var(flag_loop_parallelize_all) Optimization
> > > >  Mark all loops as parallel
> > > > Index: gcc/Makefile.in
> > > > ===================================================================
> > > > --- gcc/Makefile.in     (revision 187805)
> > > > +++ gcc/Makefile.in     (working copy)
> > > > @@ -2355,7 +2355,8 @@ tree-ssa-phiopt.o : tree-ssa-phiopt.c $(CONFIG_H)
> > > >    $(TM_H) $(GGC_H) $(TREE_H) $(TM_P_H) $(BASIC_BLOCK_H) \
> > > >    $(TREE_FLOW_H) $(TREE_PASS_H) $(TREE_DUMP_H) langhooks.h $(FLAGS_H) \
> > > >    $(DIAGNOSTIC_H) $(TIMEVAR_H) pointer-set.h domwalk.h $(CFGLOOP_H) \
> > > > -   $(TREE_DATA_REF_H) tree-pretty-print.h
> > > > +   $(TREE_DATA_REF_H) tree-pretty-print.h gimple-pretty-print.h \
> > > > +   insn-config.h $(EXPR_H) $(OPTABS_H)
> > > >  tree-nrv.o : tree-nrv.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \
> > > >    $(TM_H) $(TREE_H) $(FUNCTION_H) $(BASIC_BLOCK_H) $(FLAGS_H) \
> > > >    $(DIAGNOSTIC_H) $(TREE_FLOW_H) $(TIMEVAR_H) $(TREE_DUMP_H) $(TREE_PASS_H) \
> > > > Index: gcc/params.def
> > > > ===================================================================
> > > > --- gcc/params.def      (revision 187805)
> > > > +++ gcc/params.def      (working copy)
> > > > @@ -979,6 +979,13 @@ DEFPARAM (PARAM_MAX_TRACKED_STRLENS,
> > > >          "track string lengths",
> > > >          1000, 0, 0)
> > > >
> > > > +/* For adjacent load hoisting transformation in tree-ssa-phiopt.c.  */
> > > > +DEFPARAM (PARAM_MIN_CMOVE_STRUCT_ALIGN,
> > > > +         "min-cmove-struct-align",
> > > > +         "Minimum byte alignment to assume for structures in the stack "
> > > > +         "or heap when considering load hoisting for conditional moves",
> > > > +         32, 8, 256)
> > > > +
> > > >  /*
> > > >  Local variables:
> > > >  mode:c
> > > >
> > > >
> > >
> >
> >
>


Re: [PATCH] Hoist adjacent pointer loads

by William J. Schmidt :: Rate this Message:

| View Threaded | Show Only this Message

On Mon, 2012-06-11 at 12:11 -0500, William J. Schmidt wrote:

> I found this parameter that seems to correspond to well-predicted
> conditional jumps:
>
> /* When branch is predicted to be taken with probability lower than this
>    threshold (in percent), then it is considered well predictable. */
> DEFPARAM (PARAM_PREDICTABLE_BRANCH_OUTCOME,
>  "predictable-branch-outcome",
>  "Maximal estimated outcome of branch considered predictable",
>  2, 0, 50)
>
...which has an interface predictable_edge_p () in predict.c, so that's
what I'll use.

Thanks,
Bill


< Prev | 1 - 2 | Next >