当前位置：首页 > news >正文

544 eff.c:1761处loop vect 分析

news 来源：原创 2025/9/20 6:32:49

2.6 带有mask的向量数学函数

gcc 支持的svml向量数学函数

32652 GCC currently emits calls to @code{vmldExp2},
32653 @code{vmldLn2}, @code{vmldLog102}, @code{vmldPow2},
32654 @code{vmldTanh2}, @code{vmldTan2}, @code{vmldAtan2}, @code{vmldAtanh2},
32655 @code{vmldCbrt2}, @code{vmldSinh2}, @code{vmldSin2}, @code{vmldAsinh2},
32656 @code{vmldAsin2}, @code{vmldCosh2}, @code{vmldCos2}, @code{vmldAcosh2},
32657 @code{vmldAcos2}, @code{vmlsExp4}, @code{vmlsLn4},
32658 @code{vmlsLog104}, @code{vmlsPow4}, @code{vmlsTanh4}, @code{vmlsTan4},
32659 @code{vmlsAtan4}, @code{vmlsAtanh4}, @code{vmlsCbrt4}, @code{vmlsSinh4},
32660 @code{vmlsSin4}, @code{vmlsAsinh4}, @code{vmlsAsin4}, @code{vmlsCosh4},
32661 @code{vmlsCos4}, @code{vmlsAcosh4} and @code{vmlsAcos4} for corresponding
32662 function type when @option{-mveclibabi=svml} is used

oneapi的IR:%3970 = call fast cc104 <4 x double> @__svml_log4_mask(<4 x double> %3968, <4 x i64> %3969)

gcc的IR : _799 = _ZGVdN4v_logD.6143 (_800);

<__svml_log4_mask_e9> 是汇编代码中该函数的原名。

从如何调用不带mask的svml向量数学函数的流程出发，找出调用带有mask的方法。

设计方案：

vect__ifc__1252.1526_717 = VEC_COND_EXPR <mask__1460.1449_910, vect__1761.1465_870, { 0.0, 0.0 }>; 找到一个VEC_COND_EXPR，在同一个基本块中，根据第二个或者第三个参数所涉及到的运算（建立一个栈暂存每次找到的结果），顺着运算的关系一步步往上找，直到找到了需要进行mask的数学函数。如果在第二个参数中找到，VEC_COND_EXPR中的第一个参数mask就是数学函数需要进行mask的值。如果在第三个参数的关系链中找到，其所需的mask就是VEC_COND_EXPR中的mask的取反。将数学函数和mask一起生成带有mask的数学函数的IR，替换掉原来的不带mask的。（在生成cond_expr之后做还是在loop vect pass之后另外新建一个pass做。)

     #include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "tree.h"
#include "gimple.h"
#include "predict.h"
#include "tree-pass.h"
#include "ssa.h"
#include "cgraph.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "gimple-walk.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop-niter.h"
#include "tree-cfg.h"
#include "cfgloop.h"
#include "tree-vectorizer.h"
#include "tree-ssa-propagate.h"
#include "dbgcnt.h"
#include "tree-scalar-evolution.h"
#include "stringpool.h"
#include "attribs.h"
#include "gimple-pretty-print.h"
#include "opt-problem.h"
#include "internal-fn.h"
#include "tree-ssa-sccvn.h"
#include "gimple-expr.h"
#include <cstdio>

namespace
{

/* Name of the vector-math routine this experiment looks for and re-emits.
   The same identifier is used both to recognise the call and to name the
   replacement declaration.  (An earlier variant of the experiment matched
   "vmldLn2" and emitted "vmldLn2Mask" instead.)  */
const char *const masked_vecmath_name = "__svml_log4_mask_e9";

const pass_data pass_data_test = {
  GIMPLE_PASS,			  /* type */
  "mask_vecmath_func",		  /* name */
  OPTGROUP_NONE,		  /* optinfo_flags */
  TV_TREE_VECT_MASK_VECMATH_FUNC, /* tv_id */
  (PROP_cfg | PROP_ssa),	  /* properties_required */
  0,				  /* properties_provided */
  0,				  /* properties_destroyed */
  0,				  /* todo_flags_start */
  0,				  /* todo_flags_finish */
};

/* GIMPLE pass that appends the mask of a VEC_COND_EXPR as an extra
   argument to the vector-math call feeding that expression's "true"
   operand.  Enabled by flag_tree_mask_vecmath_func.  */
class pass_mask_vecmath_func : public gimple_opt_pass
{
public:
  pass_mask_vecmath_func (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_test, ctxt)
  {}

  /* The pass runs only when the command-line flag is set.  */
  virtual bool
  gate (function *)
  {
    return flag_tree_mask_vecmath_func;
  }

  virtual unsigned int execute (function *);
};

/* Walk backwards from OPERAND, whose defining statement is STMT, through
   the right-hand-side SSA operands of assignments, looking for a call to
   the routine named by masked_vecmath_name.

   Returns the first matching call statement found (depth-first, operands
   visited left to right), or NULL when STMT is NULL, OPERAND is not an
   SSA name, or no matching call is reachable.  Only GIMPLE assignments
   are traversed, so the search stops at PHI nodes, at calls to other
   functions and at any other statement kind.  */
static gimple *
find_relate_operand (tree operand, gimple *stmt)
{
  if (!stmt || TREE_CODE (operand) != SSA_NAME)
    return NULL;

  if (is_gimple_call (stmt))
    {
      /* gimple_call_fndecl yields NULL_TREE for indirect and internal
	 calls; a declaration without a name cannot match either.  */
      tree fndecl = gimple_call_fndecl (stmt);
      if (fndecl
	  && DECL_NAME (fndecl)
	  && strcmp (IDENTIFIER_POINTER (DECL_NAME (fndecl)),
		     masked_vecmath_name) == 0)
	return stmt;
      return NULL;
    }

  if (!is_gimple_assign (stmt))
    return NULL;

  /* Operand 0 of an assignment is the left-hand side; start at 1.  */
  for (unsigned i = 1; i < gimple_num_ops (stmt); ++i)
    {
      tree op = gimple_op (stmt, i);
      if (!op || TREE_CODE (op) != SSA_NAME)
	continue;

      gimple *found = find_relate_operand (op, SSA_NAME_DEF_STMT (op));
      if (found)
	return found;
    }

  return NULL;
}

/* Replace the call STMT by a call to masked_vecmath_name that takes the
   original arguments followed by NEW_ARG, keeping the original left-hand
   side.  Does nothing when STMT is not a call or has no known callee
   declaration.

   NOTE(review): the replacement declaration reuses the function type of
   the original callee, so its prototype does not describe the appended
   mask argument -- confirm this is acceptable for the target ABI.
   NOTE(review): if the same call feeds several VEC_COND_EXPRs it is
   rewritten once per consumer and accumulates one mask per rewrite;
   confirm that callers only reach single-use calls.  */
static void
add_mask_to_call (gimple *stmt, tree new_arg)
{
  if (!stmt || !is_gimple_call (stmt))
    return;

  /* Without a callee declaration there is no function type to copy.  */
  tree call_fn = gimple_call_fndecl (stmt);
  if (!call_fn)
    return;

  /* Build an external, public declaration for the masked routine.  */
  tree new_func_id = get_identifier (masked_vecmath_name);
  tree fntype = TREE_TYPE (call_fn);
  tree new_fndecl
    = build_decl (BUILTINS_LOCATION, FUNCTION_DECL, new_func_id, fntype);
  TREE_PUBLIC (new_fndecl) = 1;
  DECL_EXTERNAL (new_fndecl) = 1;
  DECL_IS_NOVOPS (new_fndecl) = 1;
  TREE_READONLY (new_fndecl) = 1;

  /* Original arguments first, mask last.  auto_vec releases its storage
     when it goes out of scope.  */
  const unsigned num_args = gimple_call_num_args (stmt);
  auto_vec<tree> vargs (num_args + 1);
  for (unsigned i = 0; i < num_args; ++i)
    vargs.safe_push (gimple_call_arg (stmt, i));
  vargs.safe_push (new_arg);

  gimple *new_call = gimple_build_call_vec (new_fndecl, vargs);
  gimple_call_set_lhs (new_call, gimple_call_lhs (stmt));

  /* Swap the new call into the position of the old one.  */
  gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
  gsi_replace (&gsi, new_call, true);
}

/* Scan every statement of FUN.  For each VEC_COND_EXPR assignment whose
   second operand (the value selected where the mask is true) is an SSA
   name, search its definition chain for the vector-math call and, when
   one is found, append the VEC_COND_EXPR mask to that call.

   The third operand (the "false" value) is not examined.  Always
   returns 0, i.e. no additional TODO flags.  */
unsigned
pass_mask_vecmath_func::execute (function *fun)
{
  basic_block bb;

  FOR_EACH_BB_FN (bb, fun)
    {
      for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  gassign *assign = dyn_cast <gassign *> (gsi_stmt (gsi));
	  if (!assign || gimple_assign_rhs_code (assign) != VEC_COND_EXPR)
	    continue;

	  tree mask_operand = gimple_assign_rhs1 (assign);
	  tree true_vector_operand = gimple_assign_rhs2 (assign);
	  if (TREE_CODE (true_vector_operand) != SSA_NAME)
	    continue;

	  gimple *stmt_vecmath
	    = find_relate_operand (true_vector_operand,
				   SSA_NAME_DEF_STMT (true_vector_operand));
	  if (stmt_vecmath)
	    add_mask_to_call (stmt_vecmath, mask_operand);
	}
    }

  return 0;
}

} // anonymous namespace
190 
191  gimple_opt_pass *
192  make_pass_mask_vecmath_func (gcc::context *ctxt)
193  {
194    return new pass_mask_vecmath_func (ctxt);
195  }

生成了正确的IR之后，使用builtin的方式调用svml中带有mask的数学函数。

gcc调用svml函数在gimple阶段的过程：

1：examining statement:

vect_analyze_stmt函数中检查stmt，在vectorizable_xxx函数里面通过vect_is_simple_use判断操作数的类型，通过vect_model_simple_cost计算向量化的cost，此阶段先不进行transform。

调用svml需要使用target-specific built-in function，使用此函数targetm.vectorize.builtin_vectorized_function，根据优化选项（config/i386/i386-options.cc:2567）定位到（ix86_veclib_handler = &ix86_veclibabi_svml）后端ix86_veclibabi_svml函数处，返回向量svml函数fndecl。

2：vectorizing statement：

vect_transform_loop_stmt函数中，进行transform,同样也会调用vectorizable_xxx函数进行此转化。gimple_build_call_vec (fndecl, vargs)：根据获取到的fndecl以及对参数的向量化，构建一个新的gimple vec call。

loop vec pass的调用栈

vect_analyze_loop_2：

Apply a set of analyses on LOOP, and create a loop_vec_info struct for it. The different analyses will record information in the loop_vec_info struct

loop_vec_info 里面放的是对loop 分析完成后的整个loop的信息

vect_analyze_loop_operations：

Scan the loop stmts and make sure they are all vectorizable.

vect_analyze_stmt：

Make sure the statement is vectorizable.

ziyuan 2.3 和 2.4修改对于其他课题的影响 aggressive_if_conv && use_gather_2parts result.xlsx 采用HygonGCC 1.3.2编译器最新版本和最新配置文件Hygon7490-2p-HygonGCC1.3.2.202403-hgalloc-znver1-base.cfg

跑1copy的时候整个node最好不要跑其他程序，不然性能数据会波动较大。会抢占node的内存等资源。

可能优化的方向：

1. gcc调用svml向量数学库的接口函数只能支持128bit的输入，可以修改接口，使其调用256bit输入的版本。
2. -mtune-ctrl=^avx256_split_regs,^avx128_optimal,256_unaligned_store_optimal 可以使程序使用256bit的ymm寄存器，提高循环向量化的vf，对性能有提高：2069：4%，1761：8%。
3. oneapi将条件和条件里面的计算分别放在不同的bb块中，通过控制流来选择需要执行哪些分支，可以减少冗余运算。gcc向量化只能在同一个bb块中进行，无法控制每个分支，只支持在log函数上进行mask操作，以及在最终运算的结果上进行选择；其他操作（-、+、* 等）只能在支持avx512的机器上进行mask。只能想办法在gcc上也把不同分支拆分到不同的bb块中，模仿oneapi。
4. gcc上的vf是8，使用两次log4；oneapi的vf为4，使用一次log4。尝试通过将i32扩展为i64并使用256bit ymm，把gcc变为vf 4、只使用一次log4，但用类似的方法未能成功。并且怀疑第3点才是性能的主要因素，此操作应该不是性能的主要点。

5. gcc循环向量化无法处理跨bb的问题。如果向量化后拆分成不同bb，后续的pass可能无法处理，或者会对拆分的bb做一些未知的操作，因此不建议使用此方法；可以在原有的bb里面插入一些根据mask进行选择的指令，来模拟分支选择的操作。

void calc(double *src1,double *src2,double *src3)5    {6        int i;7        for(i=0;i<100;i++)8        {9            if(src3[i] > 10.0)10            {11                src1[i] = exp(src2[i]);12            }13            else if(src3[i] > 5.0)14            {15                src1[i] = log(src2[i]);16            }17            else if(src3[i] > 2.5)18            {19                src1[i] = sin(src2[i]);20            }21        }22    }

对于有mask store的操作，会将if-conversion操作进行回退（见 optimize_mask_stores）：

1：新建一个对mask进行判断是否全为0的GIMPLE_COND。

2：新建一个then bb块，并且维护其边。

3：在mask store后分割一个新的bb，并且把stmt全部移到bb里面，新建一个边。

create_basic_block_1 (void *head, void *end, basic_block after):

int vf为4，double vf 为2.

test_mask_vecmath.c:13:18: note:   === vect_determine_vectorization_factor ===681 test_mask_vecmath.c:13:18: note:   ==> examining phi: i_114 = PHI <i_85(20), 0(35)>682 test_mask_vecmath.c:13:18: note:   ==> examining phi: sumi1_115 = PHI <_136(20), 0.0(35)>683 test_mask_vecmath.c:13:18: note:   get vectype for scalar type:  double684 test_mask_vecmath.c:13:18: note:   vectype: vector(2) double685 test_mask_vecmath.c:13:18: note:   nunits = 2686 test_mask_vecmath.c:13:18: note:   ==> examining phi: sumi2_117 = PHI <_138(20), 0.0(35)>687 test_mask_vecmath.c:13:18: note:   get vectype for scalar type:  double688 test_mask_vecmath.c:13:18: note:   vectype: vector(2) double689 test_mask_vecmath.c:13:18: note:   nunits = 2690 test_mask_vecmath.c:13:18: note:   ==> examining phi: sumi3_119 = PHI <_140(20), 0.0(35)>691 test_mask_vecmath.c:13:18: note:   get vectype for scalar type:  double692 test_mask_vecmath.c:13:18: note:   vectype: vector(2) double693 test_mask_vecmath.c:13:18: note:   nunits = 2694 test_mask_vecmath.c:13:18: note:   ==> examining phi: ivtmp_106 = PHI <ivtmp_101(20), 100(35)>695 test_mask_vecmath.c:13:18: note:   ==> examining statement: _62 = (long unsigned int) i_114;696 test_mask_vecmath.c:13:18: note:   skip.697 test_mask_vecmath.c:13:18: note:   ==> examining statement: _63 = _62 * 4;698 test_mask_vecmath.c:13:18: note:   skip.699 test_mask_vecmath.c:13:18: note:   ==> examining pattern def stmt: patt_151 = i_114 w* 4;700 test_mask_vecmath.c:13:18: note:   skip.701 test_mask_vecmath.c:13:18: note:   ==> examining pattern statement: patt_152 = (long unsigned int) patt_151;702 test_mask_vecmath.c:13:18: note:   skip.703 test_mask_vecmath.c:13:18: note:   ==> examining statement: _64 = &src3 + _63;704 test_mask_vecmath.c:13:18: note:   skip.705 test_mask_vecmath.c:13:18: note:   ==> examining statement: j_65 = *_64;706 test_mask_vecmath.c:13:18: note:   precomputed vectype: vector(4) int707 test_mask_vecmath.c:13:18: note:   nunits = 4708 
test_mask_vecmath.c:13:18: note:   ==> examining statement: _66 = (long unsigned int) j_65;709 test_mask_vecmath.c:13:18: note:   skip.710 test_mask_vecmath.c:13:18: note:   ==> examining statement: _67 = _66 * 8;711 test_mask_vecmath.c:13:18: note:   skip.712 test_mask_vecmath.c:13:18: note:   ==> examining pattern def stmt: patt_153 = j_65 w* 8;713 test_mask_vecmath.c:13:18: note:   skip.714 test_mask_vecmath.c:13:18: note:   ==> examining pattern statement: patt_154 = (long unsigned int) patt_153;715 test_mask_vecmath.c:13:18: note:   skip.716 test_mask_vecmath.c:13:18: note:   ==> examining statement: _142 = _141 + _67;717 test_mask_vecmath.c:13:18: note:   skip.
test_mask_vecmath.c:13:18: note:   ==> examining statement: _68 = (double *) _142;719 test_mask_vecmath.c:13:18: note:   skip.720 test_mask_vecmath.c:13:18: note:   ==> examining statement: _143 = j_65 > 10;721 test_mask_vecmath.c:13:18: note:   vectype: vector(4) <signed-boolean:32>722 test_mask_vecmath.c:13:18: note:   nunits = 4723 test_mask_vecmath.c:13:18: note:   ==> examining statement: _69 = .MASK_LOAD (_68, 64B, _143);724 test_mask_vecmath.c:13:18: note:   skip.725 test_mask_vecmath.c:13:18: note:   ==> examining pattern def stmt: patt_155 = (<signed-boolean:64>) _143;726 test_mask_vecmath.c:13:18: note:   precomputed vectype: vector(2) <signed-boolean:64>727 test_mask_vecmath.c:13:18: note:   nunits = 2728 test_mask_vecmath.c:13:18: note:   ==> examining pattern statement: patt_156 = .MASK_LOAD (_68, 64B, patt_155);729 test_mask_vecmath.c:13:18: note:   precomputed vectype: vector(2) double730 test_mask_vecmath.c:13:18: note:   nunits = 2731 test_mask_vecmath.c:13:18: note:   ==> examining statement: _70 = log (_69);732 test_mask_vecmath.c:13:18: note:   get vectype for scalar type: double733 test_mask_vecmath.c:13:18: note:   vectype: vector(2) double734 test_mask_vecmath.c:13:18: note:   nunits = 2735 test_mask_vecmath.c:13:18: note:   ==> examining statement: _89 = (unsigned int) j_65;736 test_mask_vecmath.c:13:18: note:   get vectype for scalar type: unsigned int737 test_mask_vecmath.c:13:18: note:   vectype: vector(4) unsigned int738 test_mask_vecmath.c:13:18: note:   nunits = 4739 test_mask_vecmath.c:13:18: note:   ==> examining statement: _87 = _89 + 4294967288;740 test_mask_vecmath.c:13:18: note:   get vectype for scalar type: unsigned int741 test_mask_vecmath.c:13:18: note:   vectype: vector(4) unsigned int742 test_mask_vecmath.c:13:18: note:   nunits = 4743 test_mask_vecmath.c:13:18: note:   ==> examining statement: _73 = _62 * 8;744 test_mask_vecmath.c:13:18: note:   skip.745 test_mask_vecmath.c:13:18: note:   ==> examining pattern def stmt: 
patt_157 = i_114 w* 8;746 test_mask_vecmath.c:13:18: note:   skip.747 test_mask_vecmath.c:13:18: note:   ==> examining pattern statement: patt_158 = (long unsigned int) patt_157;748 test_mask_vecmath.c:13:18: note:   skip.749 test_mask_vecmath.c:13:18: note:   ==> examining statement: _145 = _73 + _141;750 test_mask_vecmath.c:13:18: note:   skip.751 test_mask_vecmath.c:13:18: note:   ==> examining statement: _74 = (double *) _145;752 test_mask_vecmath.c:13:18: note:   skip.753 test_mask_vecmath.c:13:18: note:   ==> examining statement: _146 = _87 <= 2;754 test_mask_vecmath.c:13:18: note:   vectype: vector(4) <signed-boolean:32>755 test_mask_vecmath.c:13:18: note:   nunits = 4756 test_mask_vecmath.c:13:18: note:   ==> examining statement: _75 = .MASK_LOAD (_74, 64B, _146);757 test_mask_vecmath.c:13:18: note:   skip.758 test_mask_vecmath.c:13:18: note:   ==> examining pattern def stmt: patt_159 = (<signed-boolean:64>) _146;759 test_mask_vecmath.c:13:18: note:   precomputed vectype: vector(2) <signed-boolean:64>760 test_mask_vecmath.c:13:18: note:   nunits = 2
761 test_mask_vecmath.c:13:18: note:   ==> examining pattern statement: patt_160 = .MASK_LOAD (_74, 64B, patt_159);762 test_mask_vecmath.c:13:18: note:   precomputed vectype: vector(2) double763 test_mask_vecmath.c:13:18: note:   nunits = 2764 test_mask_vecmath.c:13:18: note:   ==> examining statement: _76 = log (_75);765 test_mask_vecmath.c:13:18: note:   get vectype for scalar type: double766 test_mask_vecmath.c:13:18: note:   vectype: vector(2) double767 test_mask_vecmath.c:13:18: note:   nunits = 2768 test_mask_vecmath.c:13:18: note:   ==> examining statement: _148 = _73 + _147;769 test_mask_vecmath.c:13:18: note:   skip.770 test_mask_vecmath.c:13:18: note:   ==> examining statement: _80 = (double *) _148;771 test_mask_vecmath.c:13:18: note:   skip.772 test_mask_vecmath.c:13:18: note:   ==> examining statement: _149 = j_65 == 7;773 test_mask_vecmath.c:13:18: note:   vectype: vector(4) <signed-boolean:32>774 test_mask_vecmath.c:13:18: note:   nunits = 4775 test_mask_vecmath.c:13:18: note:   ==> examining statement: _81 = .MASK_LOAD (_80, 64B, _149);776 test_mask_vecmath.c:13:18: note:   skip.777 test_mask_vecmath.c:13:18: note:   ==> examining pattern def stmt: patt_161 = (<signed-boolean:64>) _149;778 test_mask_vecmath.c:13:18: note:   precomputed vectype: vector(2) <signed-boolean:64>779 test_mask_vecmath.c:13:18: note:   nunits = 2780 test_mask_vecmath.c:13:18: note:   ==> examining pattern statement: patt_162 = .MASK_LOAD (_80, 64B, patt_161);781 test_mask_vecmath.c:13:18: note:   precomputed vectype: vector(2) double782 test_mask_vecmath.c:13:18: note:   nunits = 2783 test_mask_vecmath.c:13:18: note:   ==> examining statement: _82 = log (_81);784 test_mask_vecmath.c:13:18: note:   get vectype for scalar type: double785 test_mask_vecmath.c:13:18: note:   vectype: vector(2) double786 test_mask_vecmath.c:13:18: note:   nunits = 2787 test_mask_vecmath.c:13:18: note:   ==> examining statement: _ifc__135 = j_65 > 10 ? 
_70 : 0.0;788 test_mask_vecmath.c:13:18: note:   skip.789 test_mask_vecmath.c:13:18: note:   ==> examining pattern def stmt: patt_163 = j_65 > 10;790 test_mask_vecmath.c:13:18: note:   precomputed vectype: vector(4) <signed-boolean:32>791 test_mask_vecmath.c:13:18: note:   nunits = 4792 test_mask_vecmath.c:13:18: note:   ==> examining pattern def stmt: patt_164 = (<signed-boolean:64>) patt_163;793 test_mask_vecmath.c:13:18: note:   precomputed vectype: vector(2) <signed-boolean:64>794 test_mask_vecmath.c:13:18: note:   nunits = 2795 test_mask_vecmath.c:13:18: note:   ==> examining pattern statement: patt_165 = patt_164 ? _70 : 0.0;
796 test_mask_vecmath.c:13:18: note:   precomputed vectype: vector(2) double797 test_mask_vecmath.c:13:18: note:   nunits = 2798 test_mask_vecmath.c:13:18: note:   ==> examining statement: _136 = sumi1_115 + _ifc__135;799 test_mask_vecmath.c:13:18: note:   get vectype for scalar type: double800 test_mask_vecmath.c:13:18: note:   vectype: vector(2) double801 test_mask_vecmath.c:13:18: note:   nunits = 2802 test_mask_vecmath.c:13:18: note:   ==> examining statement: _ifc__137 = _87 <= 2 ? _76 : 0.0;803 test_mask_vecmath.c:13:18: note:   skip.804 test_mask_vecmath.c:13:18: note:   ==> examining pattern def stmt: patt_166 = _87 <= 2;805 test_mask_vecmath.c:13:18: note:   precomputed vectype: vector(4) <signed-boolean:32>806 test_mask_vecmath.c:13:18: note:   nunits = 4807 test_mask_vecmath.c:13:18: note:   ==> examining pattern def stmt: patt_167 = (<signed-boolean:64>) patt_166;808 test_mask_vecmath.c:13:18: note:   precomputed vectype: vector(2) <signed-boolean:64>809 test_mask_vecmath.c:13:18: note:   nunits = 2810 test_mask_vecmath.c:13:18: note:   ==> examining pattern statement: patt_168 = patt_167 ? _76 : 0.0;811 test_mask_vecmath.c:13:18: note:   precomputed vectype: vector(2) double812 test_mask_vecmath.c:13:18: note:   nunits = 2813 test_mask_vecmath.c:13:18: note:   ==> examining statement: _138 = sumi2_117 + _ifc__137;814 test_mask_vecmath.c:13:18: note:   get vectype for scalar type: double815 test_mask_vecmath.c:13:18: note:   vectype: vector(2) double816 test_mask_vecmath.c:13:18: note:   nunits = 2817 test_mask_vecmath.c:13:18: note:   ==> examining statement: _ifc__139 = j_65 == 7 ? 
_82 : 0.0;818 test_mask_vecmath.c:13:18: note:   skip.819 test_mask_vecmath.c:13:18: note:   ==> examining pattern def stmt: patt_169 = j_65 == 7;820 test_mask_vecmath.c:13:18: note:   precomputed vectype: vector(4) <signed-boolean:32>821 test_mask_vecmath.c:13:18: note:   nunits = 4822 test_mask_vecmath.c:13:18: note:   ==> examining pattern def stmt: patt_170 = (<signed-boolean:64>) patt_169;823 test_mask_vecmath.c:13:18: note:   precomputed vectype: vector(2) <signed-boolean:64>824 test_mask_vecmath.c:13:18: note:   nunits = 2825 test_mask_vecmath.c:13:18: note:   ==> examining pattern statement: patt_171 = patt_170 ? _82 : 0.0;826 test_mask_vecmath.c:13:18: note:   precomputed vectype: vector(2) double827 test_mask_vecmath.c:13:18: note:   nunits = 2828 test_mask_vecmath.c:13:18: note:   ==> examining statement: _140 = sumi3_119 + _ifc__139;829 test_mask_vecmath.c:13:18: note:   get vectype for scalar type: double830 test_mask_vecmath.c:13:18: note:   vectype: vector(2) double831 test_mask_vecmath.c:13:18: note:   nunits = 2832 test_mask_vecmath.c:13:18: note:   ==> examining statement: i_85 = i_114 + 1;833 test_mask_vecmath.c:13:18: note:   skip.834 test_mask_vecmath.c:13:18: note:   ==> examining statement: ivtmp_101 = ivtmp_106 - 1;835 test_mask_vecmath.c:13:18: note:   skip.836 test_mask_vecmath.c:13:18: note:   ==> examining statement: if (ivtmp_101 != 0)837 test_mask_vecmath.c:13:18: note:   skip.838 test_mask_vecmath.c:13:18: note:   vectorization factor = 4

既有int 也有double的loop

        #include <stdio.h>
#include <math.h>
#include <stdlib.h>

/*
 * Mixed int/double loop used to study the vectorization factor.
 *
 * For each of the first 100 entries, the integer selector in src3 decides
 * which log() term is accumulated:
 *   selector > 10  ->  log(src2[selector])   (indexed by the selector)
 *   selector > 7   ->  log(src2[i])
 *   selector > 6   ->  log(src1[i])
 * The three partial sums are added and printed; nothing is written back
 * to the arrays.  (Earlier variants of this experiment stored exp/log/sin
 * results into src1[] instead of accumulating.)
 */
void calc(double *src1, double *src2, int *src3)
{
    double sum_gt10 = 0;
    double sum_gt7 = 0;
    double sum_gt6 = 0;

    for (int i = 0; i < 100; i++) {
        const int j = src3[i];

        if (j > 10) {
            sum_gt10 += log(src2[j]);
        } else if (j > 7) {
            sum_gt7 += log(src2[i]);
        } else if (j > 6) {
            sum_gt6 += log(src1[i]);
        }
    }

    const double sumi = sum_gt10 + sum_gt7 + sum_gt6;
    printf("sumi is %lf\n", sumi);
}

/* Draw the next rand() value and scale it into [lo, hi]. */
static double next_random_double(double lo, double hi)
{
    return lo + 1.0 * rand() / RAND_MAX * (hi - lo);
}

/*
 * Driver: fills src1/src2 with doubles in [5, 15] and src3 with ints in
 * [5, 15) from a fixed seed, prints src1, runs calc(), then prints the
 * sum of src1.
 */
int main()
{
    const double double_lo = 5.0;
    const double double_hi = 15.0;
    const int int_lo = 5;
    const int int_hi = 15;

    double src1[100];
    double src2[100];
    int src3[100];

    srand(12);

    /* Keep the draw order: src1[k] first, then src2[k], per iteration. */
    for (int k = 0; k < 100; k++) {
        src1[k] = next_random_double(double_lo, double_hi);
        src2[k] = next_random_double(double_lo, double_hi);
    }

    for (int k = 0; k < 100; k++) {
        src3[k] = int_lo + rand() % (int_hi - int_lo);
    }

    for (int k = 0; k < 100; k++) {
        printf("src1 is %lf ", src1[k]);
    }

    calc(src1, src2, src3);

    double res = 0;
    for (int m = 0; m < 100; m++) {
        res += src1[m];
    }
    printf("res is %lf\n", res);

    return 0;
}

bb分块

COUNT:1604735257<bb 78>:
# # RANGE [0, 2147483647] NONZERO 2147483647
k_3019 = PHI <k_1827(216), 0(301)>
# temp0_1543 = PHI <_1251(216), 0.0(301)>
# temp1_2883 = PHI <_1249(216), 0.0(301)>
# temp2_224 = PHI <_1247(216), 0.0(301)>
# temp3_2699 = PHI <_1245(216), 0.0(301)>
# temp4_1545 = PHI <_1243(216), 0.0(301)>
# vect_temp0_1543.1410_1003 = PHI <vect__1251.1527_708(216), { 0.0, 0.0, 0.0, 0.0 }(301)>
# vect_temp1_2883.1411_1002 = PHI <vect__1249.1530_701(216), { 0.0, 0.0, 0.0, 0.0 }(301)>
# vect_temp2_224.1412_1001 = PHI <vect__1247.1533_694(216), { 0.0, 0.0, 0.0, 0.0 }(301)>
# vect_temp3_2699.1413_1000 = PHI <vect__1245.1536_687(216), { 0.0, 0.0, 0.0, 0.0 }(301)>
# vect_temp4_1545.1414_999 = PHI <vect__1243.1539_670(216), { 0.0, 0.0, 0.0, 0.0 }(301)>
# # PT = nonlocal escaped null
# ALIGN = 4, MISALIGN = 0
vectp.1415_998 = PHI <vectp.1415_997(216), _1703(301)>
# ivtmp_667 = PHI <ivtmp_666(216), 0(301)>
# DEBUG temp4D.7772 => NULL
# DEBUG temp3D.7771 => NULL
# DEBUG temp2D.7770 => NULL
# DEBUG temp1D.7769 => NULL
# DEBUG temp0D.7768 => NULL
# DEBUG kD.7615 => NULL
# DEBUG BEGIN_STMT
# DEBUG BEGIN_STMT
# RANGE [0, 2147483646] NONZERO 2147483647
_1705 = (long unsigned intD.10) k_3019;
# RANGE [0, 8589934584] NONZERO 8589934588
_1706 = _1705 * 4;
# PT = nonlocal escaped null
_1707 = _1703 + _1706;
# VUSE <.MEM_2600>
vect_j_1708.1417_996 = MEM <vector(8) intD.6> [(INT_TD.3736 *)vectp.1415_998];
# VUSE <.MEM_2600>
j_1708 = *_1707;
# DEBUG jD.7613 => NULL
# DEBUG BEGIN_STMT
vect__1709.1418_994 = vect_j_1708.1417_996 * { 3, 3, 3, 3, 3, 3, 3, 3 };
_1709 = j_1708 * 3;
# RANGE ~[2147483648, 18446744071562067967]
_1710 = (long unsigned intD.10) _1709;
# RANGE [0, 18446744073709551608] NONZERO 18446744073709551608
_1711 = _1710 * 8;
# PT = nonlocal null
_1712 = x_242(D) + _1711;
# VUSE <.MEM_2600>
# USE = anything
vect__1713.1419_991 = __builtin_ia32_gatheraltsiv4df D.2164 ({ 0.0, 0.0, 0.0, 0.0 }, x_242(D), vect__1709.1418_994, {  Nan,  Nan,  Nan,  Nan }, 8);
vect__1713.1420_990 = VEC_PERM_EXPR <vect__1709.1418_994, vect__1709.1418_994, { 4, 5, 6, 7, 4, 5, 6, 7 }>;
# VUSE <.MEM_2600>
# USE = anything
vect__1713.1419_989 = __builtin_ia32_gatheraltsiv4df D.2164 ({ 0.0, 0.0, 0.0, 0.0 }, x_242(D), vect__1713.1420_990, {  Nan,  Nan,  Nan,  Nan }, 8);
# VUSE <.MEM_2600>
_1713 = *_1712;
vect_xij_1714.1421_987 = vect_cst__988 - vect__1713.1419_991;
vect_xij_1714.1421_986 = vect_cst__988 - vect__1713.1419_989;
xij_1714 = xi_1687 - _1713;
# DEBUG xijD.7655 => NULL
# DEBUG BEGIN_STMT
# RANGE ~[2147483649, 18446744071562067968]
_1715 = _1710 + 1;
# RANGE [0, 18446744073709551608] NONZERO 18446744073709551608
_1716 = _1715 * 8;
# PT = nonlocal null
_1717 = x_242(D) + _1716;
# VUSE <.MEM_2600>
# USE = anything
vect__1718.1422_980 = __builtin_ia32_gatheraltsiv4df D.2164 ({ 0.0, 0.0, 0.0, 0.0 }, _983, vect__1709.1418_994, {  Nan,  Nan,  Nan,  Nan }, 8);
# VUSE <.MEM_2600>
# USE = anything
vect__1718.1422_977 = __builtin_ia32_gatheraltsiv4df D.2164 ({ 0.0, 0.0, 0.0, 0.0 }, _983, vect__1713.1420_990, {  Nan,  Nan,  Nan,  Nan }, 8);
# VUSE <.MEM_2600>
_1718 = *_1717;
vect_yij_1719.1424_975 = vect_cst__976 - vect__1718.1422_980;
vect_yij_1719.1424_974 = vect_cst__976 - vect__1718.1422_977;
yij_1719 = yi_1691 - _1718;
# DEBUG yijD.7656 => NULL
# DEBUG BEGIN_STMT
# RANGE ~[2147483650, 18446744071562067969]
_1720 = _1710 + 2;
# RANGE [0, 18446744073709551608] NONZERO 18446744073709551608
_1721 = _1720 * 8;
# PT = nonlocal null
_1722 = x_242(D) + _1721;
# VUSE <.MEM_2600>
# USE = anything
vect__1723.1425_967 = __builtin_ia32_gatheraltsiv4df D.2164 ({ 0.0, 0.0, 0.0, 0.0 }, _971, vect__1709.1418_994, {  Nan,  Nan,  Nan,  Nan }, 8);
# VUSE <.MEM_2600>
# USE = anything
vect__1723.1425_965 = __builtin_ia32_gatheraltsiv4df D.2164 ({ 0.0, 0.0, 0.0, 0.0 }, _971, vect__1713.1420_990, {  Nan,  Nan,  Nan,  Nan }, 8);
# VUSE <.MEM_2600>
_1723 = *_1722;
vect_zij_1724.1427_963 = vect_cst__964 - vect__1723.1425_967;
vect_zij_1724.1427_962 = vect_cst__964 - vect__1723.1425_965;
zij_1724 = zi_1695 - _1723;
# DEBUG zijD.7657 => NULL
# DEBUG BEGIN_STMT
vect_powmult_2740.1428_961 = vect_xij_1714.1421_987 * vect_xij_1714.1421_987;
vect_powmult_2740.1428_960 = vect_xij_1714.1421_986 * vect_xij_1714.1421_986;
powmult_2740 = xij_1714 * xij_1714;
vect_powmult_2713.1429_959 = vect_yij_1719.1424_975 * vect_yij_1719.1424_975;
vect_powmult_2713.1429_958 = vect_yij_1719.1424_974 * vect_yij_1719.1424_974;
powmult_2713 = yij_1719 * yij_1719;
vect_powmult_1661.1430_957 = vect_zij_1724.1427_963 * vect_zij_1724.1427_963;
vect_powmult_1661.1430_956 = vect_zij_1724.1427_962 * vect_zij_1724.1427_962;
powmult_1661 = zij_1724 * zij_1724;
vect__1971.1431_955 = vect_powmult_1661.1430_957 + vect_powmult_2713.1429_959;
vect__1971.1431_954 = vect_powmult_1661.1430_956 + vect_powmult_2713.1429_958;
_1971 = powmult_1661 + powmult_2713;
vect_r2_1729.1432_953 = vect__1971.1431_955 + vect_powmult_2740.1428_961;
vect_r2_1729.1432_952 = vect__1971.1431_954 + vect_powmult_2740.1428_960;           //  compute r2
r2_1729 = _1971 + powmult_2740;
# DEBUG r2D.7683 => NULL
# DEBUG BEGIN_STMT
# DEBUG r2D.7683 => NULL
# DEBUG BEGIN_STMT
# DEBUG BEGIN_STMT
vect__1730.1433_950 = .SQRT (vect_r2_1729.1432_953);        // after if (r2 > rgbmaxpsmax2) compute 
vect__1730.1433_949 = .SQRT (vect_r2_1729.1432_952);
vect_dij1i_1731.1434_947 = { 1.0e+0, 1.0e+0, 1.0e+0, 1.0e+0 } / vect__1730.1433_950;
vect_dij1i_1731.1434_946 = { 1.0e+0, 1.0e+0, 1.0e+0, 1.0e+0 } / vect__1730.1433_949;
# DEBUG dij1iD.7664 => NULL
# DEBUG BEGIN_STMT
vect_dij_1732.1435_945 = vect_r2_1729.1432_953 * vect_dij1i_1731.1434_947;
vect_dij_1732.1435_944 = vect_r2_1729.1432_952 * vect_dij1i_1731.1434_946;
dij_1732 = r2_1729 *  Inf;
# DEBUG dijD.7673 => NULL
# DEBUG BEGIN_STMT
_1733 = (long unsigned intD.10) j_1708;
_1734 = _1733 * 8;
_1241 = _1242 + _1734;
# PT = nonlocal escaped null
_1735 = (doubleD.32 *) _1241;
mask__1239.1436_942 = vect_r2_1729.1432_953 <= vect_cst__943;    //  if (r2 > rgbmaxpsmax2)
mask__1239.1436_941 = vect_r2_1729.1432_952 <= vect_cst__943;
_1239 = r2_1729 <= powmult_2494;
stmp_938 = VIEW_CONVERT_EXPR<vector(4) doubleD.32>(mask__1239.1436_942);
# VUSE <.MEM_2600>
# USE = anything
vect__1736.1437_937 = __builtin_ia32_gatheraltsiv4df D.2164 ({ 0.0, 0.0, 0.0, 0.0 }, _939, vect_j_1708.1417_996, stmp_938, 8);  // after if (r2 > rgbmaxpsmax2) compute 
vect__1736.1438_936 = VEC_PERM_EXPR <vect_j_1708.1417_996, vect_j_1708.1417_996, { 4, 5, 6, 7, 4, 5, 6, 7 }>;
stmp_935 = VIEW_CONVERT_EXPR<vector(4) doubleD.32>(mask__1239.1436_941);
# VUSE <.MEM_2600>
# USE = anything
vect__1736.1437_934 = __builtin_ia32_gatheraltsiv4df D.2164 ({ 0.0, 0.0, 0.0, 0.0 }, _939, vect__1736.1438_936, stmp_935, 8);
_1237 = _1238 + _1734;
# PT = nonlocal escaped null
_1737 = (doubleD.32 *) _1237;
# VUSE <.MEM_2600>
# USE = anything
vect__1738.1439_931 = __builtin_ia32_gatheraltsiv4df D.2164 ({ 0.0, 0.0, 0.0, 0.0 }, _933, vect_j_1708.1417_996, stmp_938, 8);
# VUSE <.MEM_2600>
# USE = anything
vect__1738.1439_924 = __builtin_ia32_gatheraltsiv4df D.2164 ({ 0.0, 0.0, 0.0, 0.0 }, _933, vect__1736.1438_936, stmp_935, 8);
vect__1739.1441_922 = vect__1738.1439_931 + { -8.99999999999999966693309261245303787291049957275390625e-2, -8.99999999999999966693309261245303787291049957275390625e-2, -8.99999999999999966693309261245303787291049957275390625e-2, -8.99999999999999966693309261245303787291049957275390625e-2 };
vect__1739.1441_921 = vect__1738.1439_924 + { -8.99999999999999966693309261245303787291049957275390625e-2, -8.99999999999999966693309261245303787291049957275390625e-2, -8.99999999999999966693309261245303787291049957275390625e-2, -8.99999999999999966693309261245303787291049957275390625e-2 };
vect_sj_1740.1442_920 = vect__1736.1437_937 * vect__1739.1441_922;
vect_sj_1740.1442_919 = vect__1736.1437_934 * vect__1739.1441_921;
# DEBUG sjD.7686 => NULL
# DEBUG BEGIN_STMT
# DEBUG sj2D.7687 => NULL
# DEBUG BEGIN_STMT
vect__1743.1443_917 = vect_sj_1740.1442_920 + { 2.0e+1, 2.0e+1, 2.0e+1, 2.0e+1 };
vect__1743.1443_916 = vect_sj_1740.1442_919 + { 2.0e+1, 2.0e+1, 2.0e+1, 2.0e+1 };
mask__1463.1444_915 = vect_dij_1732.1435_945 <= vect__1743.1443_917;
mask__1463.1444_914 = vect_dij_1732.1435_944 <= vect__1743.1443_916;
_1463 = dij_1732 <= 2.0e+1;
mask__1462.1445_913 = mask__1239.1436_942 & mask__1463.1444_915;    //  if (dij > rgbmax + sj)
mask__1462.1445_912 = mask__1239.1436_941 & mask__1463.1444_914;
_1462 = _1239 & _1463;
vect_powmult_1725.1446_911 = vect_sj_1740.1442_920 * vect_sj_1740.1442_920;
vect_powmult_1725.1446_910 = vect_sj_1740.1442_919 * vect_sj_1740.1442_919;
# DEBUG BEGIN_STMT
vect__1744.1447_908 = { 2.0e+1, 2.0e+1, 2.0e+1, 2.0e+1 } - vect_sj_1740.1442_920;   // begin if ((dij > rgbmax - sj))
vect__1744.1447_907 = { 2.0e+1, 2.0e+1, 2.0e+1, 2.0e+1 } - vect_sj_1740.1442_919;
mask__1461.1448_906 = vect_dij_1732.1435_945 > vect__1744.1447_908;
mask__1461.1448_905 = vect_dij_1732.1435_944 > vect__1744.1447_907;
_1461 = dij_1732 > 2.0e+1;
mask__1460.1449_904 = mask__1461.1448_906 & mask__1462.1445_913;      //  if ((dij > rgbmax - sj))  enter if-else chain
mask__1460.1449_903 = mask__1461.1448_905 & mask__1462.1445_912;
_1460 = _1461 & _1462;                    // else add
# DEBUG BEGIN_STMT
vect__1745.1450_902 = vect_dij_1732.1435_945 - vect_sj_1740.1442_920;
vect__1745.1450_901 = vect_dij_1732.1435_944 - vect_sj_1740.1442_919;
vect_uij_1746.1451_899 = { 1.0e+0, 1.0e+0, 1.0e+0, 1.0e+0 } / vect__1745.1450_902;
vect_uij_1746.1451_898 = { 1.0e+0, 1.0e+0, 1.0e+0, 1.0e+0 } / vect__1745.1450_901;
uij_1746 = 0.0 / r2_1729;
# DEBUG uijD.7689 => NULL
# DEBUG BEGIN_STMT
vect__1748.1452_896 = vect_dij_1732.1435_945 * { 8.0e+1, 8.0e+1, 8.0e+1, 8.0e+1 };
vect__1748.1452_895 = vect_dij_1732.1435_944 * { 8.0e+1, 8.0e+1, 8.0e+1, 8.0e+1 };
_1748 = dij_1732 * 8.0e+1;
vect__2057.1453_894 = vect_powmult_1725.1446_911 - vect_r2_1729.1432_953;
vect__2057.1453_893 = vect_powmult_1725.1446_910 - vect_r2_1729.1432_952;
_2057 = -r2_1729;
vect__1750.1454_892 = vect__1748.1452_896 + vect__2057.1453_894;
vect__1750.1454_891 = vect__1748.1452_895 + vect__2057.1453_893;
_1750 = _1748 + _2057;
vect__1751.1455_889 = vect__1750.1454_892 * { 2.50000000000000048572257327350598643533885478973388671875e-3, 2.50000000000000048572257327350598643533885478973388671875e-3, 2.50000000000000048572257327350598643533885478973388671875e-3, 2.50000000000000048572257327350598643533885478973388671875e-3 };
vect__1751.1455_888 = vect__1750.1454_891 * { 2.50000000000000048572257327350598643533885478973388671875e-3, 2.50000000000000048572257327350598643533885478973388671875e-3, 2.50000000000000048572257327350598643533885478973388671875e-3, 2.50000000000000048572257327350598643533885478973388671875e-3 };
_1751 = _1750 * 2.50000000000000048572257327350598643533885478973388671875e-3;
vect__2086.1456_886 = vect_dij_1732.1435_945 * { 2.0e+0, 2.0e+0, 2.0e+0, 2.0e+0 };
vect__2086.1456_885 = vect_dij_1732.1435_944 * { 2.0e+0, 2.0e+0, 2.0e+0, 2.0e+0 };
_2086 = dij_1732 * 2.0e+0;
vect__1753.1457_884 = vect_uij_1746.1451_899 * vect__2086.1456_886;
vect__1753.1457_883 = vect_uij_1746.1451_898 * vect__2086.1456_885;
_1753 = uij_1746 * _2086;
vect__1754.1458_882 = vect__1751.1455_889 - vect__1753.1457_884;
vect__1754.1458_881 = vect__1751.1455_888 - vect__1753.1457_883;
_1754 = _1751 - _1753;
vect__1755.1459_879 = vect__1745.1450_902 * { 5.000000000000000277555756156289135105907917022705078125e-2, 5.000000000000000277555756156289135105907917022705078125e-2, 5.000000000000000277555756156289135105907917022705078125e-2, 5.000000000000000277555756156289135105907917022705078125e-2 };
vect__1755.1459_878 = vect__1745.1450_901 * { 5.000000000000000277555756156289135105907917022705078125e-2, 5.000000000000000277555756156289135105907917022705078125e-2, 5.000000000000000277555756156289135105907917022705078125e-2, 5.000000000000000277555756156289135105907917022705078125e-2 };
_1755 = dij_1732 * 5.000000000000000277555756156289135105907917022705078125e-2;
vect__1756.1460_877 = __svml_log4_mask_e9D.7954 (vect__1755.1459_879);
vect__1756.1460_876 = __svml_log4_mask_e9D.7954 (vect__1755.1459_878);
vect__1757.1461_874 = vect__1756.1460_877 * { 2.0e+0, 2.0e+0, 2.0e+0, 2.0e+0 };
vect__1757.1461_873 = vect__1756.1460_876 * { 2.0e+0, 2.0e+0, 2.0e+0, 2.0e+0 };
vect__2097.1462_871 = vect__1754.1458_882 + { -1.0e+0, -1.0e+0, -1.0e+0, -1.0e+0 };
vect__2097.1462_870 = vect__1754.1458_881 + { -1.0e+0, -1.0e+0, -1.0e+0, -1.0e+0 };
_2097 = _1754 - 1.0e+0;
vect__1759.1463_869 = vect__2097.1462_871 - vect__1757.1461_874;
vect__1759.1463_868 = vect__2097.1462_870 - vect__1757.1461_873;
vect__2099.1464_866 = vect_dij1i_1731.1434_947 * { 1.25e-1, 1.25e-1, 1.25e-1, 1.25e-1 };
vect__2099.1464_865 = vect_dij1i_1731.1434_946 * { 1.25e-1, 1.25e-1, 1.25e-1, 1.25e-1 };
vect__1761.1465_864 = vect__1759.1463_869 * vect__2099.1464_866;
vect__1761.1465_863 = vect__1759.1463_868 * vect__2099.1464_865;
_1761 = _2097 *  Inf;               ///   else add
# DEBUG temp0D.7768 => NULL
mask__1458.1466_862 = vect_dij_1732.1435_945 <= vect__1744.1447_908;           //  begin else if (dij > 4.0 * sj)
mask__1458.1466_861 = vect_dij_1732.1435_944 <= vect__1744.1447_907;
mask__1457.1467_860 = mask__1458.1466_862 & mask__1462.1445_913;
mask__1457.1467_859 = mask__1458.1466_861 & mask__1462.1445_912;
# DEBUG BEGIN_STMT
vect__1764.1468_857 = vect_sj_1740.1442_920 * { 4.0e+0, 4.0e+0, 4.0e+0, 4.0e+0 };
vect__1764.1468_856 = vect_sj_1740.1442_919 * { 4.0e+0, 4.0e+0, 4.0e+0, 4.0e+0 };
mask__1456.1469_855 = vect_dij_1732.1435_945 > vect__1764.1468_857;
mask__1456.1469_854 = vect_dij_1732.1435_944 > vect__1764.1468_856;
_1456 = dij_1732 > 0.0;
mask__1455.1470_853 = mask__1456.1469_855 & mask__1457.1467_860;    //  else if (dij > 4.0 * sj)
mask__1455.1470_852 = mask__1456.1469_854 & mask__1457.1467_859;
_1455 = _1456 & _1462;             ///  else add
# DEBUG BEGIN_STMT
vect_powmult_1726.1471_851 = vect_dij1i_1731.1434_947 * vect_dij1i_1731.1434_947;
vect_powmult_1726.1471_846 = vect_dij1i_1731.1434_946 * vect_dij1i_1731.1434_946;
# DEBUG dij2iD.7672 => NULL
# DEBUG BEGIN_STMT
vect_tmpsd_1766.1472_845 = vect_powmult_1725.1446_911 * vect_powmult_1726.1471_851;
vect_tmpsd_1766.1472_844 = vect_powmult_1725.1446_910 * vect_powmult_1726.1471_846;
# DEBUG tmpsdD.7695 => NULL
# DEBUG BEGIN_STMT
vect__1767.1473_842 = vect_tmpsd_1766.1472_845 * { 4.54545454545454530315140573293319903314113616943359375e-1, 4.54545454545454530315140573293319903314113616943359375e-1, 4.54545454545454530315140573293319903314113616943359375e-1, 4.54545454545454530315140573293319903314113616943359375e-1 };
vect__1767.1473_841 = vect_tmpsd_1766.1472_844 * { 4.54545454545454530315140573293319903314113616943359375e-1, 4.54545454545454530315140573293319903314113616943359375e-1, 4.54545454545454530315140573293319903314113616943359375e-1, 4.54545454545454530315140573293319903314113616943359375e-1 };
vect__1768.1474_839 = vect__1767.1473_842 + { 4.444444444444444197728216749965213239192962646484375e-1, 4.444444444444444197728216749965213239192962646484375e-1, 4.444444444444444197728216749965213239192962646484375e-1, 4.444444444444444197728216749965213239192962646484375e-1 };
vect__1768.1474_838 = vect__1767.1473_841 + { 4.444444444444444197728216749965213239192962646484375e-1, 4.444444444444444197728216749965213239192962646484375e-1, 4.444444444444444197728216749965213239192962646484375e-1, 4.444444444444444197728216749965213239192962646484375e-1 };
vect__1769.1475_837 = vect_tmpsd_1766.1472_845 * vect__1768.1474_839;
vect__1769.1475_836 = vect_tmpsd_1766.1472_844 * vect__1768.1474_838;
vect__1770.1476_834 = vect__1769.1475_837 + { 4.28571428571428547638078043746645562350749969482421875e-1, 4.28571428571428547638078043746645562350749969482421875e-1, 4.28571428571428547638078043746645562350749969482421875e-1, 4.28571428571428547638078043746645562350749969482421875e-1 };
vect__1770.1476_832 = vect__1769.1475_836 + { 4.28571428571428547638078043746645562350749969482421875e-1, 4.28571428571428547638078043746645562350749969482421875e-1, 4.28571428571428547638078043746645562350749969482421875e-1, 4.28571428571428547638078043746645562350749969482421875e-1 };
vect__1771.1477_831 = vect_tmpsd_1766.1472_845 * vect__1770.1476_834;
vect__1771.1477_830 = vect_tmpsd_1766.1472_844 * vect__1770.1476_832;
vect__1772.1478_824 = vect__1771.1477_831 + { 4.0000000000000002220446049250313080847263336181640625e-1, 4.0000000000000002220446049250313080847263336181640625e-1, 4.0000000000000002220446049250313080847263336181640625e-1, 4.0000000000000002220446049250313080847263336181640625e-1 };
vect__1772.1478_823 = vect__1771.1477_830 + { 4.0000000000000002220446049250313080847263336181640625e-1, 4.0000000000000002220446049250313080847263336181640625e-1, 4.0000000000000002220446049250313080847263336181640625e-1, 4.0000000000000002220446049250313080847263336181640625e-1 };
vect__1773.1479_822 = vect_tmpsd_1766.1472_845 * vect__1772.1478_824;
vect__1773.1479_821 = vect_tmpsd_1766.1472_844 * vect__1772.1478_823;
vect_dumbo_1774.1480_819 = vect__1773.1479_822 + { 3.33333333333333314829616256247390992939472198486328125e-1, 3.33333333333333314829616256247390992939472198486328125e-1, 3.33333333333333314829616256247390992939472198486328125e-1, 3.33333333333333314829616256247390992939472198486328125e-1 };
vect_dumbo_1774.1480_818 = vect__1773.1479_821 + { 3.33333333333333314829616256247390992939472198486328125e-1, 3.33333333333333314829616256247390992939472198486328125e-1, 3.33333333333333314829616256247390992939472198486328125e-1, 3.33333333333333314829616256247390992939472198486328125e-1 };
# DEBUG dumboD.7694 => NULL
# DEBUG BEGIN_STMT
vect__2892.1481_817 = vect_powmult_1726.1471_851 * vect_sj_1740.1442_920;
vect__2892.1481_816 = vect_powmult_1726.1471_846 * vect_sj_1740.1442_919;
vect__1776.1482_815 = vect_tmpsd_1766.1472_845 * vect__2892.1481_817;
vect__1776.1482_814 = vect_tmpsd_1766.1472_844 * vect__2892.1481_816;
vect__1777.1483_813 = vect_dumbo_1774.1480_819 * vect__1776.1482_815;
vect__1777.1483_812 = vect_dumbo_1774.1480_818 * vect__1776.1482_814;
# DEBUG temp1D.7769 => NULL
mask__1453.1484_811 = vect_dij_1732.1435_945 <= vect__1764.1468_857;   // begin else if (dij > ri + sj)
mask__1453.1484_810 = vect_dij_1732.1435_944 <= vect__1764.1468_856;
_1453 = dij_1732 <= 0.0;
mask__1452.1485_809 = mask__1453.1484_811 & mask__1457.1467_860;
mask__1452.1485_808 = mask__1453.1484_810 & mask__1457.1467_859;
_1452 = _1453 & _1462;      // else add
# DEBUG BEGIN_STMT
vect__1780.1486_806 = vect_cst__807 + vect_sj_1740.1442_920;
vect__1780.1486_805 = vect_cst__807 + vect_sj_1740.1442_919;
_1780 = ri_1700;
mask__1451.1487_804 = vect_dij_1732.1435_945 > vect__1780.1486_806;
mask__1451.1487_803 = vect_dij_1732.1435_944 > vect__1780.1486_805;
_1451 = dij_1732 > _1780;
mask__1450.1488_802 = mask__1451.1487_804 & mask__1452.1485_809;
mask__1450.1488_801 = mask__1451.1487_803 & mask__1452.1485_808;    //  else if (dij > ri + sj)
_1450 = _1451 & _1452;
# DEBUG BEGIN_STMT
vect__1782.1489_800 = vect_sj_1740.1442_920 / vect__2057.1453_894;
vect__1782.1489_799 = vect_sj_1740.1442_919 / vect__2057.1453_893;
_1782 = 0.0 / r2_1729;
vect__1784.1490_797 = vect_dij_1732.1435_945 + vect_sj_1740.1442_920;
vect__1784.1490_796 = vect_dij_1732.1435_944 + vect_sj_1740.1442_919;
vect__1785.1491_795 = vect__1745.1450_902 / vect__1784.1490_797;
vect__1785.1491_794 = vect__1745.1450_901 / vect__1784.1490_796;
vect__1786.1492_793 = __svml_log4_mask_e9D.7987 (vect__1785.1491_795);
vect__1786.1492_792 = __svml_log4_mask_e9D.7987 (vect__1785.1491_794);
vect__1894.1493_790 = vect_dij1i_1731.1434_947 * { 5.0e-1, 5.0e-1, 5.0e-1, 5.0e-1 };
vect__1894.1493_789 = vect_dij1i_1731.1434_946 * { 5.0e-1, 5.0e-1, 5.0e-1, 5.0e-1 };
vect__1788.1494_788 = vect__1786.1492_793 * vect__1894.1493_790;
vect__1788.1494_787 = vect__1786.1492_792 * vect__1894.1493_789;
vect__1789.1495_786 = vect__1782.1489_800 - vect__1788.1494_788;
vect__1789.1495_785 = vect__1782.1489_799 - vect__1788.1494_787;
_1789 = _1782 -  Nan;
vect__1790.1496_783 = vect__1789.1495_786 * { 5.0e-1, 5.0e-1, 5.0e-1, 5.0e-1 };
vect__1790.1496_782 = vect__1789.1495_785 * { 5.0e-1, 5.0e-1, 5.0e-1, 5.0e-1 };
_1790 = _1789 * 5.0e-1;
# DEBUG temp2D.7770 => NULL
mask__1448.1497_781 = vect_dij_1732.1435_945 <= vect__1780.1486_806;    // begin  else if (dij > fabs(ri - sj))
mask__1448.1497_780 = vect_dij_1732.1435_944 <= vect__1780.1486_805;
_1448 = dij_1732 <= _1780;
mask__1447.1498_779 = mask__1448.1497_781 & mask__1452.1485_809;
mask__1447.1498_778 = mask__1448.1497_780 & mask__1452.1485_808;
_1447 = _1448 & _1452;
# DEBUG BEGIN_STMT
vect__1793.1499_776 = vect_cst__807 - vect_sj_1740.1442_920;
vect__1793.1499_775 = vect_cst__807 - vect_sj_1740.1442_919;
vect__1794.1500_774 = ABS_EXPR <vect__1793.1499_776>;
vect__1794.1500_773 = ABS_EXPR <vect__1793.1499_775>;
_1794 = ABS_EXPR <_1780>;
mask__1446.1501_772 = vect_dij_1732.1435_945 > vect__1794.1500_774;
mask__1446.1501_771 = vect_dij_1732.1435_944 > vect__1794.1500_773;
_1446 = dij_1732 > _1794;
mask__1445.1502_770 = mask__1446.1501_772 & mask__1447.1498_779;
mask__1445.1502_769 = mask__1446.1501_771 & mask__1447.1498_778;    // else if (dij > fabs(ri - sj))
_1445 = _1446 & _1447;
# DEBUG BEGIN_STMT
vect__2372.1503_767 = vect_cst__768 - vect_powmult_1725.1446_911;
vect__2372.1503_766 = vect_cst__768 - vect_powmult_1725.1446_910;
_2372 = powmult_1728;
vect__1798.1504_765 = vect_r2_1729.1432_953 + vect__2372.1503_767;
vect__1798.1504_764 = vect_r2_1729.1432_952 + vect__2372.1503_766;
_1798 = r2_1729 + _2372;
vect__2373.1505_762 = vect__1798.1504_765 * vect_cst__763;
vect__2373.1505_761 = vect__1798.1504_764 * vect_cst__763;
_2373 = _1798 * _2894;
vect_theta_1800.1506_760 = vect_dij1i_1731.1434_947 * vect__2373.1505_762;
vect_theta_1800.1506_759 = vect_dij1i_1731.1434_946 * vect__2373.1505_761;
theta_1800 = _2373 *  Inf;
# DEBUG thetaD.7670 => NULL
# DEBUG BEGIN_STMT
vect_uij_1802.1507_757 = { 1.0e+0, 1.0e+0, 1.0e+0, 1.0e+0 } / vect__1784.1490_797;
vect_uij_1802.1507_756 = { 1.0e+0, 1.0e+0, 1.0e+0, 1.0e+0 } / vect__1784.1490_796;
# DEBUG uijD.7689 => NULL
# DEBUG BEGIN_STMT
vect__1803.1508_754 = vect_theta_1800.1506_760 + { -2.0e+0, -2.0e+0, -2.0e+0, -2.0e+0 };
vect__1803.1508_753 = vect_theta_1800.1506_759 + { -2.0e+0, -2.0e+0, -2.0e+0, -2.0e+0 };
_1803 = theta_1800 - 2.0e+0;
vect__1804.1509_751 = vect_cst__752 * vect__1803.1508_754;
vect__1804.1509_750 = vect_cst__752 * vect__1803.1508_753;
_1804 = ri1i_1701 * _1803;
vect__1805.1510_749 = vect_uij_1802.1507_757 + vect__1804.1509_751;
vect__1805.1510_748 = vect_uij_1802.1507_756 + vect__1804.1509_750;
_1805 = uij_1746 + _1804;
vect__1806.1511_746 = vect_uij_1802.1507_757 * vect_cst__807;
vect__1806.1511_745 = vect_uij_1802.1507_756 * vect_cst__807;
_1806 = ri_1700 * uij_1746;
vect__1807.1512_744 = __svml_log4_mask_e9D.8008 (vect__1806.1511_746);
vect__1807.1512_743 = __svml_log4_mask_e9D.8008 (vect__1806.1511_745);
vect__1808.1513_742 = vect_dij1i_1731.1434_947 * vect__1807.1512_744;
vect__1808.1513_741 = vect_dij1i_1731.1434_946 * vect__1807.1512_743;
vect__1809.1514_740 = vect__1805.1510_749 - vect__1808.1513_742;
vect__1809.1514_739 = vect__1805.1510_748 - vect__1808.1513_741;
_1809 = _1805 -  Nan;
vect__1810.1515_737 = vect__1809.1514_740 * { 2.5e-1, 2.5e-1, 2.5e-1, 2.5e-1 };
vect__1810.1515_736 = vect__1809.1514_739 * { 2.5e-1, 2.5e-1, 2.5e-1, 2.5e-1 };
_1810 = _1809 * 2.5e-1;
# DEBUG temp3D.7771 => NULL
mask__1443.1516_735 = vect_dij_1732.1435_945 <= vect__1794.1500_774;   // begin  else if (ri < sj)
mask__1443.1516_734 = vect_dij_1732.1435_944 <= vect__1794.1500_773;
_1443 = dij_1732 <= _1794;
mask__1442.1517_733 = mask__1443.1516_735 & mask__1447.1498_779;
mask__1442.1517_732 = mask__1443.1516_734 & mask__1447.1498_778;
_1442 = _1443 & _1447;
# DEBUG BEGIN_STMT
mask__1441.1518_730 = vect_cst__807 < vect_sj_1740.1442_920;
mask__1441.1518_729 = vect_cst__807 < vect_sj_1740.1442_919;
_1441 = _1699 < 8.99999999999999966693309261245303787291049957275390625e-2;
mask__1406.1519_728 = mask__1441.1518_730 & mask__1442.1517_733;
mask__1406.1519_727 = mask__1441.1518_729 & mask__1442.1517_732;   //  else if (ri < sj)
_1406 = _1441 & _1442;
# DEBUG BEGIN_STMT
vect__1816.1520_725 = vect__1782.1489_800 - vect_cst__726;
vect__1816.1520_724 = vect__1782.1489_799 - vect_cst__726;
_1816 = _1782 - _1815;
vect__1235.1521_723 = -vect__1785.1491_795;
vect__1235.1521_722 = -vect__1785.1491_794;
vect__1820.1522_721 = __svml_log4_mask_e9D.8019 (vect__1235.1521_723);
vect__1820.1522_720 = __svml_log4_mask_e9D.8019 (vect__1235.1521_722);
vect__1822.1523_719 = vect__1820.1522_721 * vect__1894.1493_790;
vect__1822.1523_718 = vect__1820.1522_720 * vect__1894.1493_789;
vect__1823.1524_717 = vect__1816.1520_725 - vect__1822.1523_719;
vect__1823.1524_716 = vect__1816.1520_724 - vect__1822.1523_718;
_1823 = _1816 -  Nan;
vect__1824.1525_714 = vect__1823.1524_717 * { 5.0e-1, 5.0e-1, 5.0e-1, 5.0e-1 };
vect__1824.1525_713 = vect__1823.1524_716 * { 5.0e-1, 5.0e-1, 5.0e-1, 5.0e-1 }; //  end  if-else
_1824 = _1823 * 5.0e-1;
# DEBUG temp4D.7772 => NULL
vect__ifc__1252.1526_711 = VEC_COND_EXPR <mask__1460.1449_904, vect__1761.1465_864, { 0.0, 0.0, 0.0, 0.0 }>;
vect__ifc__1252.1526_710 = VEC_COND_EXPR <mask__1460.1449_903, vect__1761.1465_863, { 0.0, 0.0, 0.0, 0.0 }>;
_ifc__1252 = _1460 ? _1761 : 0.0;
vect__1251.1527_709 = vect_temp0_1543.1410_1003 + vect__ifc__1252.1526_711;
vect__1251.1527_708 = vect__1251.1527_709 + vect__ifc__1252.1526_710;
_1251 = temp0_1543 + _ifc__1252;
vect__ifc__1250.1529_704 = VEC_COND_EXPR <mask__1455.1470_853, vect__1777.1483_813, { 0.0, 0.0, 0.0, 0.0 }>;
vect__ifc__1250.1529_703 = VEC_COND_EXPR <mask__1455.1470_852, vect__1777.1483_812, { 0.0, 0.0, 0.0, 0.0 }>;
_ifc__1250 = _1455 ?  Nan : 0.0;
vect__1249.1530_702 = vect_temp1_2883.1411_1002 - vect__ifc__1250.1529_704;
vect__1249.1530_701 = vect__1249.1530_702 - vect__ifc__1250.1529_703;
_1249 = temp1_2883 - _ifc__1250;
vect__ifc__1248.1532_697 = VEC_COND_EXPR <mask__1450.1488_802, vect__1790.1496_783, { 0.0, 0.0, 0.0, 0.0 }>;
vect__ifc__1248.1532_696 = VEC_COND_EXPR <mask__1450.1488_801, vect__1790.1496_782, { 0.0, 0.0, 0.0, 0.0 }>;
_ifc__1248 = _1450 ? _1790 : 0.0;
vect__1247.1533_695 = vect_temp2_224.1412_1001 + vect__ifc__1248.1532_697;
vect__1247.1533_694 = vect__1247.1533_695 + vect__ifc__1248.1532_696;
_1247 = temp2_224 + _ifc__1248;
vect__ifc__1246.1535_690 = VEC_COND_EXPR <mask__1445.1502_770, vect__1810.1515_737, { 0.0, 0.0, 0.0, 0.0 }>;
vect__ifc__1246.1535_689 = VEC_COND_EXPR <mask__1445.1502_769, vect__1810.1515_736, { 0.0, 0.0, 0.0, 0.0 }>;
_ifc__1246 = _1445 ? _1810 : 0.0;
vect__1245.1536_688 = vect_temp3_2699.1413_1000 + vect__ifc__1246.1535_690;
vect__1245.1536_687 = vect__1245.1536_688 + vect__ifc__1246.1535_689;
_1245 = temp3_2699 + _ifc__1246;
vect__ifc__1244.1538_673 = VEC_COND_EXPR <mask__1406.1519_728, vect__1824.1525_714, { 0.0, 0.0, 0.0, 0.0 }>;
vect__ifc__1244.1538_672 = VEC_COND_EXPR <mask__1406.1519_727, vect__1824.1525_713, { 0.0, 0.0, 0.0, 0.0 }>;
_ifc__1244 = _1406 ? _1824 : 0.0;
vect__1243.1539_671 = vect_temp4_1545.1414_999 + vect__ifc__1244.1538_673;
vect__1243.1539_670 = vect__1243.1539_671 + vect__ifc__1244.1538_672;
_1243 = temp4_1545 + _ifc__1244;
# DEBUG temp4D.7772 => _1243
# DEBUG temp3D.7771 => _1245
# DEBUG temp2D.7770 => _1247
# DEBUG temp1D.7769 => _1249
# DEBUG temp0D.7768 => _1251
# DEBUG BEGIN_STMT
# RANGE [1, 2147483647] NONZERO 2147483647
k_1827 = k_3019 + 1;
# DEBUG temp4D.7772 => _1243
# DEBUG temp3D.7771 => _1245
# DEBUG temp2D.7770 => _1247
# DEBUG temp1D.7769 => _1249
# DEBUG temp0D.7768 => _1251
# DEBUG kD.7615 => k_1827
# DEBUG BEGIN_STMT
# PT = nonlocal escaped null
vectp.1415_997 = vectp.1415_998 + 32;
ivtmp_666 = ivtmp_667 + 1;
if (ivtmp_666 < bnd.1407_1013)
  goto <bb 216>; [83.33%]
else
  goto <bb 303>; [16.67%]

bb 分块的优化方案：

1：找到vec_cond_expr，将其中第一个参数mask作为上一个bb的结束（其后还有一个mask），并且在其后新建一个该mask与0进行比较的gimple_cond，将这两个mask相与。同时新建该mask判断为true和false的edge，分别指向分割的bb和其下一个bb。

2：以vec_cond_expr的第二个参数的ssa_name_def作为要分割bb的末尾，进行分割。并且生成一条指向其下一个bb的edge。同时将其作为mask判断为false的edge的dest。

optimize_mask_stores 代码

10093 /* The code below is trying to perform simple optimization - revert
10094    if-conversion for masked stores, i.e. if the mask of a store is zero
10095    do not perform it and all stored value producers also if possible.
10096    For example,
10097      for (i=0; i<n; i++)
10098        if (c[i])
10099   {
10100     p1[i] += 1;
10101     p2[i] = p3[i] +2;
10102   }
10103    this transformation will produce the following semi-hammock:
10104 
10105    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
10106      {
10107        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
10108        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
10109        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
10110        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
10111        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
10112        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
10113      }

         Summary of the steps performed for LOOP:
           1. Collect every IFN_MASK_STORE call found in the loop body.
           2. For each collected store, split its block at the store and
              create a new block STORE_BB that is entered only when the
              comparison MASK == 0 is false.
           3. Move the store into STORE_BB, then walk backwards and also
              move the statements whose results are used only inside
              STORE_BB.
           4. Merge the virtual operand of the moved store with a PHI node
              in the join block.
10114 */
10115 
10116 void
10117 optimize_mask_stores (class loop *loop)
10118 {
10119   basic_block *bbs = get_loop_body (loop);   /* Released with free () below.  */
10120   unsigned nbbs = loop->num_nodes;
10121   unsigned i;
10122   basic_block bb;
10123   class loop *bb_loop;
10124   gimple_stmt_iterator gsi;
10125   gimple *stmt;
10126   auto_vec<gimple *> worklist;
10127   auto_purge_vect_location sentinel;
10128 
10129   vect_location = find_loop_location (loop);
10130   /* Pick up all masked stores in loop if any.  */
10131   for (i = 0; i < nbbs; i++)
10132     {
10133       bb = bbs[i];
10134       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
10135      gsi_next (&gsi))
10136   {
10137     stmt = gsi_stmt (gsi);
10138     if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
10139       worklist.safe_push (stmt);
10140   }
10141     }
10142 
10143   free (bbs);
10144   if (worklist.is_empty ())   /* No masked store was found: nothing to do.  */
10145     return;
10146 
10147   /* Loop has masked stores.  */
10148   while (!worklist.is_empty ())
10149     {
10150       gimple *last, *last_store;
10151       edge e, efalse;
10152       tree mask;
10153       basic_block store_bb, join_bb;
10154       gimple_stmt_iterator gsi_to;
10155       tree vdef, new_vdef;
10156       gphi *phi;
10157       tree vectype;
10158       tree zero;
10159 
10160       last = worklist.pop ();
10161       mask = gimple_call_arg (last, 2);   /* Call argument 2 is used as the mask.  */
10162       bb = gimple_bb (last);
10163       /* Create then_bb and if-then structure in CFG, then_bb belongs to
10164    the same loop as if_bb.  It could be different to LOOP when two
10165    level loop-nest is vectorized and mask_store belongs to the inner
10166    one.  */
10167       e = split_block (bb, last);
10168       bb_loop = bb->loop_father;
10169       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
10170       join_bb = e->dest;
10171       store_bb = create_empty_bb (bb);
10172       add_bb_to_loop (store_bb, bb_loop);
10173       e->flags = EDGE_TRUE_VALUE;
10174       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
10175       /* Give the edge entering STORE_BB an unlikely probability and derive
         the count of STORE_BB from that edge.  */
10176       efalse->probability = profile_probability::unlikely ();
10177       store_bb->count = efalse->count ();
10178       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
10179       if (dom_info_available_p (CDI_DOMINATORS))
10180   set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
10181       if (dump_enabled_p ())
10182   dump_printf_loc (MSG_NOTE, vect_location,
10183        "Create new block %d to sink mask stores.",
10184        store_bb->index);
10185       /* Create vector comparison with boolean result: the condition
         MASK == 0 is placed at the end of BB.  Its true edge E leads to
         JOIN_BB and its false edge EFALSE leads to STORE_BB.  */
10186       vectype = TREE_TYPE (mask);
10187       zero = build_zero_cst (vectype);
10188       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
10189       gsi = gsi_last_bb (bb);
10190       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
10191       /* Create new PHI node for vdef of the last masked store:
10192    .MEM_2 = VDEF <.MEM_1>
10193    will be converted to
10194    .MEM_3 = VDEF <.MEM_1>
10195    and new PHI node will be created in join bb
10196    .MEM_2 = PHI <.MEM_1, .MEM_3>
10197       */
10198       vdef = gimple_vdef (last);
10199       new_vdef = make_ssa_name (gimple_vop (cfun), last);
10200       gimple_set_vdef (last, new_vdef);
10201       phi = create_phi_node (vdef, join_bb);
10202       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
10203 
10204       /* Put all masked stores with the same mask to STORE_BB if possible.  */
10205       while (true)
10206   {
10207     gimple_stmt_iterator gsi_from;
10208     gimple *stmt1 = NULL;
10209 
10210     /* Move masked store to STORE_BB.  */
10211     last_store = last;
10212     gsi = gsi_for_stmt (last);
10213     gsi_from = gsi;
10214     /* Shift GSI to the previous stmt for further traversal.  */
10215     gsi_prev (&gsi);
10216     gsi_to = gsi_start_bb (store_bb);
10217     gsi_move_before (&gsi_from, &gsi_to);
10218     /* Setup GSI_TO to the non-empty block start.  */
10219     gsi_to = gsi_start_bb (store_bb);
10220     if (dump_enabled_p ())
10221       dump_printf_loc (MSG_NOTE, vect_location,
10222            "Move stmt to created bb\n%G", last);
10223     /* Move all stored value producers if possible.  The walk goes
           backwards from the old position of the store and stops at the
           first statement that cannot be moved.  */
10224     while (!gsi_end_p (gsi))
10225       {
10226         tree lhs;
10227         imm_use_iterator imm_iter;
10228         use_operand_p use_p;
10229         bool res;
10230 
10231         /* Skip debug statements.  */
10232         if (is_gimple_debug (gsi_stmt (gsi)))
10233     {
10234       gsi_prev (&gsi);
10235       continue;
10236     }
10237         stmt1 = gsi_stmt (gsi);
10238         /* Do not consider statements writing to memory or having
10239      volatile operand.  */
10240         if (gimple_vdef (stmt1)
10241       || gimple_has_volatile_ops (stmt1))
10242     break;
10243         gsi_from = gsi;
10244         gsi_prev (&gsi);
10245         lhs = gimple_get_lhs (stmt1);
10246         if (!lhs)
10247     break;
10248 
10249         /* LHS of vectorized stmt must be SSA_NAME.  */
10250         if (TREE_CODE (lhs) != SSA_NAME)
10251     break;
10252 
10253         if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
10254     {
10255       /* Remove dead scalar statement.  */
10256       if (has_zero_uses (lhs))
10257         {
10258           gsi_remove (&gsi_from, true);
10259           continue;
10260         }
10261     }
10262 
10263         /* Check that LHS does not have uses outside of STORE_BB.  */
10264         res = true;
10265         FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
10266     {
10267       gimple *use_stmt;
10268       use_stmt = USE_STMT (use_p);
10269       if (is_gimple_debug (use_stmt))
10270         continue;
10271       if (gimple_bb (use_stmt) != store_bb)
10272         {
10273           res = false;
10274           break;
10275         }
10276     }
10277         if (!res)
10278     break;
10279 
           /* A statement that reads memory is moved only when its VUSE is
              the same as the VUSE of the store being sunk.  */
10280         if (gimple_vuse (stmt1)
10281       && gimple_vuse (stmt1) != gimple_vuse (last_store))
10282     break;
10283 
10284         /* Can move STMT1 to STORE_BB.  */
10285         if (dump_enabled_p ())
10286     dump_printf_loc (MSG_NOTE, vect_location,
10287          "Move stmt to created bb\n%G", stmt1);
10288         gsi_move_before (&gsi_from, &gsi_to);
10289         /* Shift GSI_TO for further insertion.  */
10290         gsi_prev (&gsi_to);
10291       }
10292     /* Put other masked stores with the same mask to STORE_BB.  Continue
           only when the next worklist entry uses the same MASK and is the
           last statement examined by the backward walk (STMT1).  */
10293     if (worklist.is_empty ()
10294         || gimple_call_arg (worklist.last (), 2) != mask
10295         || worklist.last () != stmt1)
10296       break;
10297     last = worklist.pop ();
10298   }
         /* On edge E, which bypasses STORE_BB, the PHI receives the VUSE of
            the last store that was moved.  */
10299       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
10300     }
10301 }

optimize_mask_vec_cond 代码

10093 void
10094 optimize_mask_vec_cond (class loop *loop)
10095 {
10096   basic_block *bbs = get_loop_body (loop);
10097   unsigned nbbs = loop->num_nodes;
10098   unsigned i;
10099   basic_block bb, bb_mask;
10100   class loop *bb_loop;
10101   gimple_stmt_iterator gsi;
10102   gimple *stmt;
10103   auto_vec<gimple *> worklist;
10104   auto_purge_vect_location sentinel;
10105 
10106   enum tree_code code;
10107 
10108   vect_location = find_loop_location (loop);
10109   /* Pick up all vec_cond_expr in loop if any.  */
10110   for (i = 0; i < nbbs; i++)
10111     {
10112        bb = bbs[i];
10113        for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
10114         gsi_next (&gsi))
10115       {
10116         stmt = gsi_stmt (gsi);
10117         if (is_gimple_assign(stmt)) {
10118           gassign *stmt_assign = dyn_cast <gassign *> (gsi_stmt (gsi));
10119           code = gimple_assign_rhs_code (stmt_assign);
10120           // 检查语句是否为 VEC_COND_EXPR
10121           if (code == VEC_COND_EXPR) {
10122             worklist.safe_push (stmt);
10123           }
10124         }
10125        }
10126      }
10128   free (bbs);
10129   if (worklist.is_empty ())
10130     return;
10131 
10132   /* Loop has vec_cond_expr.  */
10133   while (!worklist.is_empty ())
10134     {
10135       gimple *last, *last_store, *last1;
10136       edge e, efalse;
10137       tree mask;
10138       basic_block store_bb, join_bb;
10139       gimple_stmt_iterator gsi_to;
10140       gimple_stmt_iterator gsi_stmt_def;
10141       tree vdef, new_vdef;
10142       gphi *phi;
10143       tree vectype;
10144       tree zero;
10145 
10146       last = worklist.pop ();
10147       gassign *stmt_assign = dyn_cast <gassign *> (last);
10148       mask = gimple_assign_rhs1(stmt_assign);
10149       tree true_vector_operand = gimple_assign_rhs2(stmt_assign);
10150 
10151       gimple *mask_def = SSA_NAME_DEF_STMT (mask);
10152 
10153       gimple *stmt_def = SSA_NAME_DEF_STMT (true_vector_operand);
10154 
10155       bb = gimple_bb (stmt_def);
10156 
10157     //  bb_mask = gimple_bb (mask_def);
10158       /* Create then_bb and if-then structure in CFG, then_bb belongs to
10159    the same loop as if_bb.  It could be different to LOOP when two
10160    level loop-nest is vectorized and mask_store belongs to the inner
10161    one.  */
10162 
10163       gsi_stmt_def = gsi_for_stmt (stmt_def);
10164       gsi_next(&gsi_stmt_def);
10165 
10166       stmt_def = gsi_stmt(gsi_stmt_def);
10167 
10168       e = split_block (bb, stmt_def);
10169       bb_loop = bb->loop_father;
10170    //   gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
10171       join_bb = e->dest;
10172       store_bb = create_empty_bb (bb);
10173       add_bb_to_loop (store_bb, bb_loop);
10174       e->flags = EDGE_TRUE_VALUE;
10175       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
10176       /* Put STORE_BB to likely part.  */
10177       efalse->probability = profile_probability::unlikely ();
10178       store_bb->count = efalse->count ();
10179       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
10180       if (dom_info_available_p (CDI_DOMINATORS))
10181   set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
10182       if (dump_enabled_p ())
10183   dump_printf_loc (MSG_NOTE, vect_location,
10184        "Create new block %d to sink vect cond expr",
10185        store_bb->index);
10186       /* Create vector comparison with boolean result.  */
10187       vectype = TREE_TYPE (mask);
10188       zero = build_zero_cst (vectype);
10189       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
10190    //   gsi = gsi_last_bb (bb);
10191       gsi = gsi_for_stmt (mask_def);
10192       gsi_next(&gsi);
10193       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
10194       /* Create new PHI node for vdef of the last masked store:
10195    .MEM_2 = VDEF <.MEM_1>
10196    will be converted to
10197    .MEM.3 = VDEF <.MEM_1>
10198    and new PHI node will be created in join bb
10199    .MEM_2 = PHI <.MEM_1, .MEM_3>
10200       */
10201   /*    vdef = gimple_vdef (last);
10202       new_vdef = make_ssa_name (gimple_vop (cfun), last);
10203       gimple_set_vdef (last, new_vdef);
10204       phi = create_phi_node (vdef, join_bb);
10205       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);*/
10206 
10207       /* Put all masked stores with the same mask to STORE_BB if possible.  */
10208   //    while (true)
10209 //  {
10210     gimple_stmt_iterator gsi_from;
10211     gimple *stmt1 = NULL;
10213     /* Move vec_cond second var def to STORE_BB.  */
10214     last_store = stmt_def;
10215     gsi = gsi_for_stmt (stmt_def);
10216     gsi_from = gsi;
10217     /* Shift GSI to the previous stmt for further traversal.  */
10218     gsi_prev (&gsi);
10219     gsi_to = gsi_start_bb (store_bb);
10220     gsi_move_before (&gsi_from, &gsi_to);
10221     /* Setup GSI_TO to the non-empty block start.  */
10222     gsi_to = gsi_start_bb (store_bb);
10223     if (dump_enabled_p ())
10224       dump_printf_loc (MSG_NOTE, vect_location,
10225            "Move stmt to created bb\n%G", last);
10226     /* Move all stored value producers if possible.  */
10227     while (!gsi_end_p (gsi))
10228       {
10229         tree lhs;
10230         imm_use_iterator imm_iter;
10231         use_operand_p use_p;
10232         bool res;
10233 
10234         /* Skip debug statements.  */
10235         if (is_gimple_debug (gsi_stmt (gsi)))
10236     {
10237       gsi_prev (&gsi);
10238       continue;
10239     }
10240         stmt1 = gsi_stmt (gsi);
10241         /* Do not consider statements writing to memory or having
10242      volatile operand.  */
10243         if (gimple_vdef (stmt1)
10244       || gimple_has_volatile_ops (stmt1))
10245     break;
10246         gsi_from = gsi;
10247         gsi_prev (&gsi);
10248         lhs = gimple_get_lhs (stmt1);
10249         if (!lhs)
10250     break;
10251 
10252         /* LHS of vectorized stmt must be SSA_NAME.  */
10253         if (TREE_CODE (lhs) != SSA_NAME)
10254     break;
10255 
10256         if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
10257     {
10258       /* Remove dead scalar statement.  */
10259     /*  if (has_zero_uses (lhs))
10260         {
10261           gsi_remove (&gsi_from, true);
10262           continue;
10263         }*/
10264     }
10265 
10266         /* Check that LHS does not have uses outside of STORE_BB.  */
10267         res = true;
10268   /*      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
10269     {
10270       gimple *use_stmt;
10271       use_stmt = USE_STMT (use_p);
10272       if (is_gimple_debug (use_stmt))
10273         continue;
10274       if (gimple_bb (use_stmt) != store_bb)
10275         {
10276           res = false;
10277           break;
10278         }
10279     }*/
10280         if (!res)
10281     break;
10282 
10283     /*    if (gimple_vuse (stmt1)
10284       && gimple_vuse (stmt1) != gimple_vuse (last_store))
10285     break;*/
10286 
10287         /* Can move STMT1 to STORE_BB.  */
10288         if (dump_enabled_p ())
10289     dump_printf_loc (MSG_NOTE, vect_location,
10290          "Move stmt to created bb\n%G", stmt1);
10291         gsi_move_before (&gsi_from, &gsi_to);
10292         /* Shift GSI_TO for further insertion.  */
10293         gsi_prev (&gsi_to);
10294       }
10295     /* Put other masked stores with the same mask to STORE_BB.  */
10296   /*  if (worklist.is_empty ()
10297         || gimple_call_arg (worklist.last (), 2) != mask
10298         || worklist.last () != stmt1)
10299       break;
10300     last = worklist.pop ();*/
10301   //  last1 = worklist.pop ();
10302 //  }
10303     //  add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
10304     if (!worklist.is_empty ())
10305     last = worklist.pop ();
10306     }
10307 }

能够按照预期进行拆分bb块，同时解决编译不过的两个问题：

1：加上-g 之后，fre pass 会报错：在对debug gimple 进行分析、删除的时候，找不到某个标量的定义。原因是最后一个分支的标量gimple被直接删除了，却没有生成对应的debug gimple，导致后面的debug gimple 使用到该标量时找不到其定义，报编译错误。临时解决方法：先去掉-g；后续在dce pass 中查找删除标量和插入debug语句的逻辑。出错的语句形如：# DEBUG D#583 => D#597 ? _2164 : 0.0

2：在sink pass 中报编译错误，gimple_redirect_edge_and_branch函数中，assert不通过，需要该edge 是一个fallthru edge。在构造edge的时候需要生成。暂时注释掉。

default:6134       /* Otherwise it must be a fallthru edge, and we don't need to6135    do anything besides redirecting it.  */6136    //   gcc_assert (e->flags & EDGE_FALLTHRU);

解决掉编译错误后，可以正确编译运行，但是结果错误。

原因是该loop 的vf是8，每次会对loop 中的8个元素进行运算；而计算mask的数据是double类型，因此会生成两个mask。每个分支需要同时判断这两个mask与{0，0，0，0}比较是否都为0，而目前只能进行一个mask的比较。可行的方法：

1：修改loop 中int 的类型使其在确定vf的时候将其作为double 看待（VIEW_CONVERT_EXPR），这样vf 是4, 就不存在两个mask。

2：gimple cond 不能支持这种if ( a==0 && b==0) 这种复杂条件表达，构造两个gimple cond。然后做&运算，将此条件作为需要判断的cond。

1761处循环：

1：在每个分支条件构造后插入两个mask按位或的gimple，并且以此新建一个gimple cond,作为分支判断的条件。

2：课题运算结果VE，查找原因。从打印的每个分支运算结果来看，temp4的结果恒为0，即最后一个分支完全没有走到，存在问题；同时加上-g后报错，也是因为最后一个分支的标量被删除，因此怀疑最后一个分支在拆分的时候存在问题。（想在源码中加打印中间结果来得到正确结果作对比，但无法进行打印。）

Lhs use outside of BB。当其使用的outside BB是 VEC_COND 所在的BB认为是没问题的，其他情况需要进行添加phi节点操作

2中的stmt的 lhs res在4 里面被使用，原本在同一个bb里面不需要做额外的操作，当分到不同的bb后，走不走2 res的值会不同，如果不走4中用的res会使用上一次2中计算的res值，显然结果错误，需要添加phi节点来解决。

若2中的lhs res0 被 4 use ,需要在 2的上一个bb 1新建一个向量变量res1 = 0，在2 的下一个bb 3中，新建一个phi节点，res2 = phi<res1(1),res0(2)>, 并且将4中用到res0的地方改为res2。

若2中的lhs res0 被 4 use ,需要在 2的上一个bb 1新建一个向量变量res1 = 0，将2中的res0 = xx 修改为 res2 = xx,在2 的下一个bb 3中，新建一个phi节点，res0 = phi<res1(1),res2(2)>。

若2中的res0 被2中的其他stmt使用到，则需要将所有用到res0的地方改成res2

对于多个分支都要进行计算的变量，可以将第二个分支直接用到此计算的地方，需要使用该计算的全部。在用到其的地方需要进行计算。

新增phi节点的代码

FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
10283     {
10284       gimple *use_stmt;
10285       use_stmt = USE_STMT (use_p);
10286       if (is_gimple_debug (use_stmt))
10287         continue;
10288       if (gimple_bb (use_stmt) != store_bb && gimple_bb (use_stmt) != gimple_bb (last))
10289         {
10290          // res = false;
10291 
10292         if (dump_enabled_p ())
10293     dump_printf_loc (MSG_NOTE, vect_location,
10294          "LHS have use outside of store_BB\n%G", stmt1);
10295         tree lhs_use_out,new_lhs,new_lhs1,new_lhs2;
10296         tree new_lhs_phi;
10297         gphi *phi;
10298         tree vectype;
10299         tree zero;
10300         gimple *zero_def;
10301         lhs_use_out = gimple_assign_lhs(stmt1);
10302 
10303       /*  if (is_gimple_assign(stmt1)) {
10304             lhs_use_out = gimple_assign_lhs(stmt1);
10305             new_lhs = create_tmp_var(TREE_TYPE(lhs_use_out), "new_tmp_var");
10306             new_lhs_phi = make_ssa_name(new_lhs,NULL);
10307         //    gimple_assign_set_lhs(stmt1, new_lhs1);
10308         
10309 
10310             phi = create_phi_node (new_lhs_phi, join_bb);
10311             add_phi_arg (phi, lhs_use_out, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
10312          
10313             vectype = TREE_TYPE (lhs_use_out);
10314             zero = build_zero_cst (vectype);
10315             new_lhs1 = create_tmp_var(TREE_TYPE(lhs_use_out), "new_tmp_var1");
10316             new_lhs2 = make_ssa_name(new_lhs1,NULL);
10317             zero_def = gimple_build_assign(new_lhs2, zero);
10318 
10319      //    basic_block stmt_bb = gimple_bb(stmt1);
10320             edge e_temp;
10321             edge_iterator ei;
10322             basic_block pred_bb;
10323             gimple_stmt_iterator gsi_temp;
10324 
10325        //  if (EDGE_COUNT(stmt_bb->preds) == 1) {
10326             e_temp = EDGE_PRED(store_bb, 0);
10327             pred_bb = e_temp->src;
10328             gsi_temp = gsi_start_bb(pred_bb);
10329             gsi_insert_before(&gsi_temp, zero_def, GSI_SAME_STMT);
10330        //  }
10331 
10332            add_phi_arg (phi, new_lhs2, e, UNKNOWN_LOCATION);
10333          //  update_stmt (phi);
10334 
10335         /*   edge e_join;
10336            edge_iterator ei_join;
10337 
10338            FOR_EACH_EDGE(e_join, ei_join, join_bb->succs)
10339            {
10340               if (EDGE_TRUE_P(e_join))
10341               {
10342                 *true_bb = e->dest;
10343               }
10344            }*/
10345 
10346            for (unsigned int i = 0; i < gimple_num_ops(use_stmt); i++) {
10347               tree rhs = gimple_op(use_stmt, i);
10348               if(rhs == lhs_use_out) {
10349                 gimple_stmt_iterator gsi = gsi_for_stmt(use_stmt);
10350                 gsi_insert_before (&gsi,stmt1,GSI_SAME_STMT);
10351                 break;
10352               //  create_new_def_for (rhs, phi,gimple_phi_result_ptr (phi));
10353               //  update_stmt (phi);
10354               }
10355            }
10356      //   }

2069处循环：

1：需要进行dim=3的常量传播，加上拆分循环这两个条件。验证前一个循环向量化后有7%的性能，加上ymm寄存器后有11%的性能。

2：查看gcc的loop split 和 loop distribute pass,发现loop distribute的总体思想是将能够向量化的代码最大限度拆分到一个循环中，（1）但其只对非嵌套循环的最内层循环分析，发现其dump的信息中没有对2069循环进行distribute。（2）同时其只能对没有数据依赖的部分distribute，源码有数据依赖的部分使用临时数组存储后进行拆分，需要自行编写代码实现。

549课题在mask store中涉及的运算上对数学函数添加mask代码

1  #include "config.h"2  #include "system.h"3  #include "coretypes.h"4  #include "backend.h"5  #include "tree.h"6  #include "gimple.h"7  #include "predict.h"8  #include "tree-pass.h"9  #include "ssa.h"10  #include "cgraph.h"11  #include "fold-const.h"12  #include "stor-layout.h"13  #include "gimple-iterator.h"14  #include "gimple-walk.h"15  #include "tree-ssa-loop-manip.h"16  #include "tree-ssa-loop-niter.h"17  #include "tree-cfg.h"18  #include "cfgloop.h"19  #include "tree-vectorizer.h"20  #include "tree-ssa-propagate.h"21  #include "dbgcnt.h"22  #include "tree-scalar-evolution.h"23  #include "stringpool.h"24  #include "attribs.h"25  #include "gimple-pretty-print.h"26  #include "opt-problem.h"27  #include "internal-fn.h"28  #include "tree-ssa-sccvn.h"29  #include "gimple-expr.h"30  #include <cstdio>31 32  namespace33  {34  const pass_data pass_data_test = {35    GIMPLE_PASS,           /* type */36    "mask_vecmath_func",                /* name */37    OPTGROUP_NONE,         /* optinfo_flags */38    TV_TREE_VECT_MASK_VECMATH_FUNC,          /* tv_id */39    (PROP_cfg | PROP_ssa), /* properties_required */40    0,                     /* properties_provided */41    0,                     /* properties_destroyed */42    0,                     /* todo_flags_start */43    0,                     /* todo_flags_finish */44  };
45 46  class pass_mask_vecmath_func : public gimple_opt_pass47  {48  public:49    pass_mask_vecmath_func (gcc::context *ctxt) : gimple_opt_pass (pass_data_test, ctxt) {}50    virtual bool51    gate (function *fun)52    {53     // printf ("gate function noipa.\n");54      return flag_tree_mask_vecmath_func;55    }56 57    virtual unsigned int execute (function *);58  };59 60 61 static void add_mask_to_call(gimple *stmt, tree new_arg, const char *func_name) {62     if (!is_gimple_call(stmt)) {63         // 如果不是函数调用语句，则不做任何操作64         return;65     }66 67     // 获取原始函数调用的目标和参数列表68     tree call_fn = gimple_call_fndecl(stmt);69 70    // 获取或创建新的标识符节点来表示新的函数名称71    tree new_func_id;72    if(strcmp(func_name, "vmldCos2") == 0)73      new_func_id = get_identifier("__svml_cos2_mask_e9");74    else if (strcmp(func_name, "vmldExp2") == 0)75      new_func_id = get_identifier("__svml_exp2_mask_e9");76    else if (strcmp(func_name, "vmldSin2") == 0)77      new_func_id = get_identifier("__svml_sin2_mask_e9");78    else if (strcmp(func_name, "sin.simdclone.2") == 0)79      new_func_id = get_identifier("__svml_sin4_mask_e9");80    else if (strcmp(func_name, "cos.simdclone.2") == 0)81      new_func_id = get_identifier("__svml_cos4_mask_e9");82    else if (strcmp(func_name, "exp.simdclone.2") == 0)83      new_func_id = get_identifier("__svml_exp4_mask_e9");84 85    tree fntype = TREE_TYPE(call_fn);87    tree new_fndecl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL, new_func_id, fntype);88 89    TREE_PUBLIC (new_fndecl) = 1;90    DECL_EXTERNAL (new_fndecl) = 1;91    DECL_IS_NOVOPS (new_fndecl) = 1;92    TREE_READONLY (new_fndecl) = 1;93 94 95    // 将新的标识符节点分配给函数声明的汇编名96   // DECL_ASSEMBLER_NAME(call_fn) = new_func_id;97 98     int num_args = gimple_call_num_args(stmt);99     vec<tree> vargs = vNULL;
100     vargs.create (num_args+1);
101 
102     // 创建一个新的参数列表，包含原始的参数和新的参数
103     for (int i = 0; i < num_args; i++) {
104         tree arg = gimple_call_arg(stmt, i);
105         vargs.safe_push(arg);
106     }
107     vargs.safe_push(new_arg);
108 
109     tree lhs = gimple_call_lhs(stmt);
110 
111     // 创建新的函数调用语句，包含新的参数
112     gimple *new_call = gimple_build_call_vec(new_fndecl,vargs);
113     gimple_call_set_lhs (new_call, lhs);
114 
115     // 替换原始的函数调用语句
116     gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
117 
118   //  printf ("-------------finish add mask to vecmath func call------------.\n");
119 
120     gsi_replace(&gsi, new_call,true);
121     stmt = new_call;
122 
123     // 释放参数列表的内存
124     vargs.release ();
125 }
126 
127 static void find_relate_operand(tree operand, gimple *stmt, tree mask)
128 {
129   if (!stmt)
130         return ;
131 
132   if (TREE_CODE (operand) == SSA_NAME && is_gimple_call(stmt)) {  // operand is ssa && stmt is gimple call
133      tree fndecl = gimple_call_fndecl(stmt);  // 获取函数声明
134        if (fndecl && DECL_P(fndecl)) {  // 确保fndecl有效并且是一个声明
135         const  char *func_name = IDENTIFIER_POINTER(DECL_NAME(fndecl));  // 获取函数名称
136          // if (strcmp(func_name, "vmldLn2") == 0) {
137           if (strcmp(func_name, "vmldCos2") == 0 ||
138               strcmp(func_name, "vmldExp2") == 0 ||
139               strcmp(func_name, "vmldSin2") == 0 ||
140               strcmp(func_name, "exp.simdclone.2") == 0 ||
141               strcmp(func_name, "cos.simdclone.2") == 0 ||
142               strcmp(func_name, "sin.simdclone.2") == 0) {
143      //       printf ("-------------find math func------------.\n");
144             add_mask_to_call(stmt,mask,func_name);
145             return ;
146           }
147        }
148   }
149   if (TREE_CODE (operand) == SSA_NAME && is_gimple_assign(stmt)) {   // only find gimple assign
150 
151      for (unsigned i = 1; i < gimple_num_ops(stmt); ++i) {  // get gimple assign right hand side operand
152         tree op = gimple_op(stmt, i);
153         if(TREE_CODE (op) == SSA_NAME) {
154 
155            gimple *stmt_2 = SSA_NAME_DEF_STMT (op);
156            find_relate_operand(op,stmt_2,mask);
157         //   if(result) return result;
158         }
159     }
160   }
161   return ;
162 }
163 
164 
165  unsigned
166  pass_mask_vecmath_func::execute (function *fun)
167  {
168    unsigned ret = 0;
169 
170    basic_block bb;
171    enum tree_code code;
172    FOR_EACH_BB_FN(bb, fun) {
173        gimple_stmt_iterator gsi;
174 
175   /* for (int i = 1; i < number_of_loops (fun); i++)
176      {
177        loop_vec_info loop_vinfo;
178        bool has_mask_store;
179  
180        class loop *loop = get_loop (fun, i);
181        if (!loop || !loop->aux)
182        continue;
183        loop_vinfo = (loop_vec_info) loop->aux;
184        has_mask_store = LOOP_VINFO_HAS_MASK_STORE (loop_vinfo);
185        delete loop_vinfo;
186        if (has_mask_store) {
187          
188          printf ("-------------have mask store------------.\n");
189 
190          basic_block *bbs = get_loop_body (loop);
191          unsigned nbbs = loop->num_nodes;
192          unsigned i;
193          basic_block bb;
194          class loop *bb_loop;
195          gimple_stmt_iterator gsi;
196          gimple *stmt;
197 
198          for (i = 0; i < nbbs; i++)
199          {
200             bb = bbs[i];*/
201             for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
202              gsi_next (&gsi))
203             {
204               gimple *stmt = gsi_stmt (gsi);
205               if (gimple_call_internal_p (stmt, IFN_MASK_STORE)) {
206  //                printf ("------------ find mask store------------.\n");
207                  basic_block bb1 = gimple_bb(stmt);
208                  tree mask = gimple_call_arg (stmt, 2);
209                  tree value = gimple_call_arg (stmt, 3);
210                  if(TREE_CODE (value) == SSA_NAME) {
211                    gimple *value_def = SSA_NAME_DEF_STMT (value);
212                    basic_block bb2 = gimple_bb(value_def);
213    //                printf ("-------------begin find relate operand------------.\n");
214                    if(bb1 == bb2) //  mask store and value def in same bb
215                    find_relate_operand(value,value_def,mask);
216                  }
217               }
218             }
219 
220          // free (bbs);
221          }
222      //  }
223    //  }
224 
225    return ret;
226 
227  }
228  }
229 
/* Factory for the pass: returns a newly allocated pass_mask_vecmath_func
   bound to the compiler context CTXT.  */
230  gimple_opt_pass *
231  make_pass_mask_vecmath_func (gcc::context *ctxt)
232  {
233    return new pass_mask_vecmath_func (ctxt);
234  }

10092 
10093
/* Collect every VEC_COND_EXPR assignment in the body of LOOP and, for the
   entries taken from the worklist, split the block holding the definition
   of the VEC_COND_EXPR's second operand, create a new block STORE_BB reached
   through an EDGE_FALSE_VALUE edge, insert a "combined_mask == 0" GIMPLE_COND
   and move the preceding value-producing statements into STORE_BB.
   Returns without changes when the loop has fewer than two VEC_COND_EXPRs.

   NOTE(review): each iteration pops one extra worklist entry at its end,
   which the pop at the top of the next iteration then overwrites, so only
   every second VEC_COND_EXPR is processed.  Presumably intentional (two
   vector copies per scalar condition); confirm.  */
10094 void
10095 optimize_mask_vec_cond (class loop *loop)
10096 {
10097   basic_block *bbs = get_loop_body (loop);
10098   unsigned nbbs = loop->num_nodes;
10099   unsigned i;
10100   basic_block bb, bb_mask;
10101   class loop *bb_loop;
10102   gimple_stmt_iterator gsi;
10103   gimple *stmt;
10104   auto_vec<gimple *> worklist;
10105   auto_purge_vect_location sentinel;
10106
10107   enum tree_code code;
10108
10109   vect_location = find_loop_location (loop);
10110   /* Pick up all vec_cond_expr in loop if any.  */
10111   for (i = 0; i < nbbs; i++)
10112     {
10113        bb = bbs[i];
10114        for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
10115         gsi_next (&gsi))
10116       {
10117         stmt = gsi_stmt (gsi);
10118         if (is_gimple_assign(stmt)) {
10119           gassign *stmt_assign = dyn_cast <gassign *> (gsi_stmt (gsi));
10120           code = gimple_assign_rhs_code (stmt_assign);
10121           // Check whether the statement is a VEC_COND_EXPR
10122           if (code == VEC_COND_EXPR) {
10123             worklist.safe_push (stmt);
10124           }
10125         }
10126        }
10127      }
10128
10129   free (bbs);
10130   if (worklist.is_empty () || worklist.length()==1)
10131     return;
10132
10133   /* Loop has vec_cond_expr.  */
10134   while (!worklist.is_empty ())
10135     {
10136       gimple *last, *last_store, *last1;
10137       edge e, efalse;
10138       tree mask,mask2;
10139       basic_block store_bb, join_bb;
10140       gimple_stmt_iterator gsi_to;
10141       gimple_stmt_iterator gsi_stmt_def,gsi_mask_def;
10142       tree vdef, new_vdef;
10143       gphi *phi;
10144       tree vectype;
10145       tree zero_vector;
10146
10147       last = worklist.pop ();
10148       gassign *stmt_assign = dyn_cast <gassign *> (last);
10149       mask = gimple_assign_rhs1(stmt_assign);
10150       tree true_vector_operand = gimple_assign_rhs2(stmt_assign);
10151
10152       gimple *mask_def = SSA_NAME_DEF_STMT (mask);
10153
      /* The second mask is taken to be the LHS of the statement immediately
         preceding MASK's definition.
         NOTE(review): the dyn_cast result is used without a NULL check.  */
10154       gsi_mask_def = gsi_for_stmt(mask_def);
10155       gsi_prev(&gsi_mask_def);
10156       gimple *mask2_def = gsi_stmt(gsi_mask_def);
10157       gassign *stmt_mask2 = dyn_cast <gassign *> (mask2_def);
10158       mask2 = gimple_assign_lhs(stmt_mask2);
10159
10160
10161       gimple *stmt_def = SSA_NAME_DEF_STMT (true_vector_operand);
10162
10163       bb = gimple_bb (stmt_def);
10164
10165       /* Create then_bb and if-then structure in CFG, then_bb belongs to
10166    the same loop as if_bb.  It could be different to LOOP when two
10167    level loop-nest is vectorized and mask_store belongs to the inner
10168    one.  */
10169
      /* STMT_DEF is advanced to the statement following the definition of
         the second operand; the block is split after that statement.  */
10170       gsi_stmt_def = gsi_for_stmt (stmt_def);
10171       gsi_next(&gsi_stmt_def);
10172
10173       stmt_def = gsi_stmt(gsi_stmt_def);
10174
10175       e = split_block (bb, stmt_def);
10176       bb_loop = bb->loop_father;
10177    //   gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
10178       join_bb = e->dest;
10179       store_bb = create_empty_bb (bb);
10180       add_bb_to_loop (store_bb, bb_loop);
10181       e->flags = EDGE_TRUE_VALUE;
10182       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
10183       /* Put STORE_BB to likely part.  */
10184       efalse->probability = profile_probability::unlikely ();
10185       store_bb->count = efalse->count ();
10186       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
10187       if (dom_info_available_p (CDI_DOMINATORS))
10188   set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
10189       if (dump_enabled_p ())
10190   dump_printf_loc (MSG_NOTE, vect_location,
10191        "Create new block %d to sink vect cond expr",
10192        store_bb->index);
10193       /* Create vector comparison with boolean result.  */
10194       vectype = TREE_TYPE (mask);
10195       zero_vector = build_zero_cst (vectype);
10196
      /* combined_mask = mask | mask2, so a single comparison against zero
         covers both masks.  */
10197       tree combined_mask = create_tmp_var(TREE_TYPE(zero_vector), "combined_mask");
10198
10199       gimple *combine_stmt1 = gimple_build_assign(combined_mask, BIT_IOR_EXPR, mask, mask2);
10200
10201       gsi = gsi_for_stmt (mask_def);
10202       gsi_next(&gsi);
10203       gsi_insert_after (&gsi, combine_stmt1, GSI_SAME_STMT);
10204
10205     /*  vec<constructor_elt, va_gc> *ret_ctor_elts_tmp = NULL;
10206       vec_alloc (ret_ctor_elts_tmp, 2);
10207       CONSTRUCTOR_APPEND_ELT(ret_ctor_elts_tmp, NULL_TREE, mask2); // add the second left subtree
10208       CONSTRUCTOR_APPEND_ELT(ret_ctor_elts_tmp, NULL_TREE, mask); // add the first left subtree
10209
10210     //  tree signed_boolean_type = build_nonstandard_integer_type(64, 1);
10211       tree signed_boolean_type = build_nonstandard_boolean_type(64);
10212
10213       tree vect_type = build_vector_type(signed_boolean_type, 4);
10214       tree constructor = build_constructor(vect_type, ret_ctor_elts_tmp);
10215
10216       tree new_var_constru = create_tmp_var(vect_type, "mask_array");
10217       gimple *new_stmt_construc = gimple_build_assign(make_ssa_name(new_var_constru), constructor);
10218       gsi_next(&gsi);
10219       gsi_insert_after (&gsi, new_stmt_construc, GSI_SAME_STMT);*/
10220
10221       gimple *gcond = gimple_build_cond(EQ_EXPR, combined_mask, zero_vector, NULL, NULL);
10222       gsi_next(&gsi);
10223       gsi_insert_after(&gsi, gcond, GSI_NEW_STMT);
10224
10225
10226       /* Put all masked stores with the same mask to STORE_BB if possible.  */
10227   //    while (true)
10228 //  {
10229     gimple_stmt_iterator gsi_from;
10230     gimple *stmt1 = NULL;
10231
10232     /* Move vec_cond second var def to STORE_BB.  */
10233     last_store = stmt_def;
10234     gsi = gsi_for_stmt (stmt_def);
10235     gsi_from = gsi;
10236     /* Shift GSI to the previous stmt for further traversal.  */
10237     gsi_prev (&gsi);
10238     gsi_to = gsi_start_bb (store_bb);
10239     gsi_move_before (&gsi_from, &gsi_to);
10240     /* Setup GSI_TO to the non-empty block start.  */
10241     gsi_to = gsi_start_bb (store_bb);
    /* NOTE(review): the dump below prints LAST (the VEC_COND_EXPR), not the
       statement that was just moved.  */
10242     if (dump_enabled_p ())
10243       dump_printf_loc (MSG_NOTE, vect_location,
10244            "Move stmt to created bb\n%G", last);
10245     /* Move all stored value producers if possible.  */
10246     while (!gsi_end_p (gsi))
10247       {
10248         tree lhs;
10249         imm_use_iterator imm_iter;
10250         use_operand_p use_p;
10251         bool res;
10252
10253         /* Skip debug statements.  */
10254         if (is_gimple_debug (gsi_stmt (gsi)))
10255     {
10256       gsi_prev (&gsi);
10257       continue;
10258     }
10259         stmt1 = gsi_stmt (gsi);
10260         /* Do not consider statements writing to memory or having
10261      volatile operand.  */
10262         if (gimple_vdef (stmt1)
10263       || gimple_has_volatile_ops (stmt1))
10264     break;
10265         gsi_from = gsi;
10266         gsi_prev (&gsi);
10267         lhs = gimple_get_lhs (stmt1);
10268         if (!lhs)
10269     break;
10270
10271         /* LHS of vectorized stmt must be SSA_NAME.  */
10272         if (TREE_CODE (lhs) != SSA_NAME)
10273     break;
10274
10275         if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
10276     {
10277       /* Remove dead scalar statement.  */
10278       if (has_zero_uses (lhs))
10279         {
10280           gsi_remove (&gsi_from, true);
10281           continue;
10282         }
10283     }
10284
10285         /* Check that LHS does not have uses outside of STORE_BB.  */
10286         res = true;
10287     //    FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
        /* For each use of LHS outside STORE_BB and outside the block of LAST,
           a copy of STMT1 is inserted in front of the using assignment and
           the use is redirected to the copy's result.
           NOTE(review): the copy is built from rhs1/rhs2 only, which assumes
           STMT1 is a binary assignment; confirm.  */
10288         gimple *use_lhs;
10289         FOR_EACH_IMM_USE_STMT (use_lhs, imm_iter, lhs)
10290     {
10291         gimple *use_stmt;
10292         FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter) {
10293
10294       //  gimple *use_stmt;
10295         use_stmt = USE_STMT (use_p);
10296         if (is_gimple_debug (use_stmt))
10297         continue;
10298       if (gimple_bb (use_stmt) != store_bb && gimple_bb (use_stmt) != gimple_bb (last))
10299     {
10300          // res = false;
10301
10302         if (dump_enabled_p ())
10303     dump_printf_loc (MSG_NOTE, vect_location,
10304          "LHS have use outside of store_BB\n%G", stmt1);
10305         tree new_lhs,new_lhs1,new_lhs2;
10306         tree new_lhs_phi;
10307         gphi *phi;
10308         tree vectype;
10309         tree zero;
10310         gimple *zero_def;
10311
10312         gimple *new_assign_stmt;
10313
10314         if (is_gimple_assign(stmt1) && is_gimple_assign(use_lhs)) {
10315         for (unsigned int i = 1; i < gimple_num_ops(use_stmt); i++) {
10316           tree rhs = gimple_op(use_stmt, i);
10317           if(TREE_CODE (rhs) == SSA_NAME && (rhs == lhs)) {
10318
10319            if (dump_enabled_p ())
10320               dump_printf_loc (MSG_NOTE, vect_location,
10321          "insert new stmt to use out of BB\n");
10322             new_lhs = create_tmp_var(TREE_TYPE(lhs), "new_tmp_var");
10323             new_lhs1 = make_ssa_name(new_lhs,NULL);
10324             tree rhs1 = gimple_assign_rhs1(stmt1);
10325             tree rhs2 = gimple_assign_rhs2(stmt1);
10326             new_assign_stmt = gimple_build_assign(new_lhs1, gimple_assign_rhs_code(stmt1), rhs1, rhs2);
10327
10328             gimple_stmt_iterator gsi_temp = gsi_for_stmt(use_stmt);
10329             gsi_insert_before (&gsi_temp,new_assign_stmt,GSI_SAME_STMT);
10330             update_stmt(new_assign_stmt);
10331
10332             if( i == 1) {
10333
10334               gimple_assign_set_rhs1(use_stmt, new_lhs1);
10335            //   update_stmt(use_stmt);
10336             }
10337             else if (i == 2) {
10338               gimple_assign_set_rhs2(use_stmt, new_lhs1);
10339             //  update_stmt(use_stmt);
10340             }
10341
10342           //  update_stmt(use_stmt);
10343           }
10344         }
10345       }
10346     }
10347         }
10348
10349             update_stmt(use_stmt);
10350     }
10351
10352         /* Can move STMT1 to STORE_BB.  */
10353      /*   if (dump_enabled_p ())
10354     dump_printf_loc (MSG_NOTE, vect_location,
10355          "Move stmt to created bb\n%G", stmt1);*/
10356         gsi_move_before (&gsi_from, &gsi_to);
10357         /* Shift GSI_TO for further insertion.  */
10358         gsi_prev (&gsi_to);
10359       }
    /* Drop one more worklist entry; the pop at the top of the loop replaces
       LAST again, so this entry is never processed.  */
10360     if (!worklist.is_empty ())
10361     last = worklist.pop ();
10362     }
10363
10364 }

对 if continue的分块

10161   /*    if(worklist.length()== 1) {
10162         if (dump_enabled_p ())
10163                dump_printf_loc (MSG_NOTE, vect_location,
10164             " if-continue split bb\n");
10165         tree mask_tmp2 = gimple_assign_rhs2(stmt_mask2);
10166         tree mask_tmp1 = gimple_assign_rhs2(stmt_mask1);
10167
10168         gimple *mask_temp2_def = SSA_NAME_DEF_STMT (mask_tmp2);
10169         gimple *mask_temp1_def = SSA_NAME_DEF_STMT (mask_tmp1);
10170
10171         gassign *stmt_mask_tmp2 = dyn_cast <gassign *> (mask_temp2_def);
10172         gassign *stmt_mask_tmp1 = dyn_cast <gassign *> (mask_temp1_def);
10173
10174         tree temp2_rhs1 = gimple_assign_rhs1(stmt_mask_tmp2);
10175         tree temp1_rhs1 = gimple_assign_rhs1(stmt_mask_tmp1);
10176
10177         tree target_mask3 = gimple_assign_lhs(stmt_mask_tmp2);
10178         tree target_mask4 = gimple_assign_lhs(stmt_mask_tmp1);
10179
10180         tree temp2_rhs2 = gimple_assign_rhs2(stmt_mask_tmp2);
10181         tree temp1_rhs2 = gimple_assign_rhs2(stmt_mask_tmp1);
10182
10183         gimple *target_stmt1 = SSA_NAME_DEF_STMT (temp2_rhs1);
10184         gimple *target_stmt2 = SSA_NAME_DEF_STMT (temp1_rhs1);
10185
10186         gassign *stmt_target_stmt1 = dyn_cast <gassign *> (target_stmt1);
10187         gassign *stmt_target_stmt2 = dyn_cast <gassign *> (target_stmt2);
10188
10189         tree target_mask1 = gimple_assign_lhs(stmt_target_stmt1);
10190         tree target_mask2 = gimple_assign_lhs(stmt_target_stmt2);
10191
10192
10193         gimple *target_stmt3 = SSA_NAME_DEF_STMT (temp2_rhs2);
10194         gimple *target_stmt4 = SSA_NAME_DEF_STMT (temp1_rhs2);
10195
10196         basic_block bb_tmp =  gimple_bb (target_stmt1);
10197         basic_block bb_tmp_next =  gimple_bb (target_stmt4);
10198         edge e_tmp;
10199         gimple_stmt_iterator target_stmt4_gsi = gsi_for_stmt(mask_temp1_def);
10200         gsi_next(&target_stmt4_gsi);
10201         gimple *target_stmt4_next = gsi_stmt(target_stmt4_gsi);
10202
10203         gimple_stmt_iterator target_stmt2_gsi = gsi_for_stmt(target_stmt2);
10204         gsi_next(&target_stmt2_gsi);
10205         gimple *target_stmt2_next = gsi_stmt(target_stmt2_gsi);
10206
10207         e_tmp = split_block (bb_tmp, target_stmt4_next);
10208         class loop *bb_loop_tmp = bb_tmp->loop_father;
10209         gcc_assert (loop == bb_loop_tmp || flow_loop_nested_p (loop, bb_loop_tmp));
10210
10211         basic_block bb_last_tmp = gimple_bb(last);
10212         basic_block join_bb_tmp;
10213         gimple *last_stmt_tmp = last_stmt(bb_last_tmp);
10214         if (last_stmt_tmp && gimple_code(last_stmt_tmp) == GIMPLE_COND) {
10215
10216             edge e_tmp2;
10217             edge_iterator ei_tmp2;
10218              basic_block true_bb;
10219
10220             FOR_EACH_EDGE(e_tmp2, ei_tmp2, bb_last_tmp->succs) {
10221         // 检查是否为 true 分支
10222               if (e_tmp2->flags & EDGE_TRUE_VALUE) {
10223                   true_bb = e_tmp2->dest;
10224               }
10225             }
10226             join_bb_tmp = e_tmp->dest;
10227             basic_block store_bb_tmp = create_empty_bb (bb_tmp);
10228             add_bb_to_loop (store_bb_tmp, bb_loop_tmp);
10229        //     e_tmp->flags = EDGE_TRUE_VALUE;
10230
10231             edge efalse_tmp_true = make_edge (bb_tmp, bb_last_tmp, EDGE_TRUE_VALUE);
10232                /* Put STORE_BB to likely part.  */
10233     /*        efalse_tmp_true->probability = profile_probability::likely ();
10234             store_bb_tmp->count = efalse_tmp_true->count ();
10235
10236             edge efalse_tmp = make_edge (bb_tmp, store_bb_tmp, EDGE_FALSE_VALUE);
10237                /* Put STORE_BB to likely part.  */
10238       /*      efalse_tmp->probability = profile_probability::unlikely ();
10239             store_bb_tmp->count = efalse_tmp->count ();
10240        //     make_single_succ_edge (store_bb_tmp, join_bb_tmp, EDGE_FALLTHRU);
10241
10242             edge efalse_tmp_next = make_edge (store_bb_tmp, join_bb_tmp, EDGE_FALSE_VALUE);
10243             efalse_tmp_next->probability = profile_probability::unlikely ();
10244        //     store_bb_tmp->count = efalse_tmp_true->count ();
10245
10246             edge etrue_tmp_next = make_edge (store_bb_tmp, bb_last_tmp, EDGE_TRUE_VALUE);
10247             etrue_tmp_next->probability = profile_probability::likely ();
10248             store_bb_tmp->count = efalse_tmp_true->count ();
10249         //    true_bb = e_tmp->dest;
10250
10251         //    e_tmp->dest = NULL;
10252          //   e_tmp->flags = EDGE_TRUE_VALUE;
10253
10254             edge e_dele = find_edge(bb_tmp, join_bb_tmp);
10255             if (e_dele) {
10256                   remove_edge(e_dele); // 删除这条边
10257                  }
10258
10259         //    true_bb->preds = chainon(true_bb->preds, e_tmp);
10260             add_to_dominance_info(CDI_DOMINATORS,join_bb_tmp);
10261
10262             if (dom_info_available_p (CDI_DOMINATORS)) {
10263                 set_immediate_dominator (CDI_DOMINATORS, store_bb_tmp, bb_tmp);
10264                 set_immediate_dominator (CDI_DOMINATORS, join_bb_tmp, store_bb_tmp);
10265                 set_immediate_dominator (CDI_DOMINATORS, bb_last_tmp, bb_tmp);
10266            //     free_dominance_info(CDI_DOMINATORS);
10267                 calculate_dominance_info(CDI_DOMINATORS);
10268             }
10269
10270        //     free_dominance_info(CDI_DOMINATORS);
10271     //        calculate_dominance_info(CDI_DOMINATORS);
10272
10273             tree vectype_tmp = TREE_TYPE (mask_tmp1);
10274             tree zero_vector_tmp = build_zero_cst (vectype_tmp);
10275
10276             tree combined_mask_tmp = create_tmp_var(TREE_TYPE(zero_vector_tmp), "combined_mask_ifconti");
10277
10278             tree combined_mask_tmp2 = create_tmp_var(TREE_TYPE(zero_vector_tmp), "combined_mask_ifconti2");
10279
10280             gimple *combine_stmt1_tmp = gimple_build_assign(combined_mask_tmp, BIT_IOR_EXPR, target_mask1, target_mask2);
10281
10282             gimple *combine_stmt1_tmp2 = gimple_build_assign(combined_mask_tmp2, BIT_IOR_EXPR, target_mask3, target_mask4);
10283
10284             gimple_stmt_iterator gsi_tmp = gsi_for_stmt (target_stmt2);
10285             gsi_next(&gsi_tmp);
10286             gsi_insert_after (&gsi_tmp, combine_stmt1_tmp, GSI_SAME_STMT);
10287
10288             gimple_stmt_iterator gsi_tmp_next_if = gsi_last_bb (store_bb_tmp);
10289          //   gsi_prev(&gsi_tmp_next_if);
10290             gsi_insert_before (&gsi_tmp_next_if, combine_stmt1_tmp2, GSI_SAME_STMT);
10291
10292             gimple *gcond_tmp = gimple_build_cond(EQ_EXPR, combined_mask_tmp, zero_vector_tmp, NULL, NULL);
10293             gsi_next(&gsi_tmp);
10294             gsi_insert_after(&gsi_tmp, gcond_tmp, GSI_NEW_STMT);
10295
10296             gimple *gcond_tmp_next = gimple_build_cond(EQ_EXPR, combined_mask_tmp2, zero_vector_tmp, NULL, NULL);
10297          //   gsi_next(&gsi_tmp_next_if);
10298             gsi_insert_before(&gsi_tmp_next_if, gcond_tmp_next, GSI_NEW_STMT);
10299
10300         //    calculate_dominance_info(CDI_DOMINATORS);
10301
10302             gimple_stmt_iterator gsi_from_tmp;
10303             gimple *stmt1 = NULL;
10304
10305     /* Move vec_cond second var def to STORE_BB.  */
10306      /*       gimple *last_store = target_stmt4_next;
10307             gimple_stmt_iterator gsi_tmp4 = gsi_for_stmt (target_stmt4_next);
10308             gsi_from_tmp = gsi_tmp4;
10309     /* Shift GSI to the previous stmt for further traversal.  */
10310     /*        gsi_prev (&gsi_tmp4);
10311             gimple_stmt_iterator gsi_to_tmp = gsi_start_bb (store_bb_tmp);
10312             gsi_move_before (&gsi_from_tmp, &gsi_to_tmp);
10313     /* Setup GSI_TO to the non-empty block start.  */
10314     /*        gsi_to_tmp = gsi_start_bb (store_bb_tmp);
10315             if (dump_enabled_p ())
10316               dump_printf_loc (MSG_NOTE, vect_location,
10317            "Move if-continue stmt to created bb\n%G", last);
10318     /* Move all stored value producers if possible.  */
10319      /*       while (!gsi_end_p (gsi_tmp4)) {
10320
10321                tree lhs;
10322                imm_use_iterator imm_iter;
10323                use_operand_p use_p;
10324                bool res;
10325
10326               /* Skip debug statements.  */
10327        /*        if (is_gimple_debug (gsi_stmt (gsi_tmp4)))
10328                 {
10329                   gsi_prev (&gsi_tmp4);
10330                   continue;
10331                 }
10332                stmt1 = gsi_stmt (gsi_tmp4);
10333         /* Do not consider statements writing to memory or having
10334      volatile operand.  */
10335         /*       if (gimple_vdef (stmt1) || gimple_has_volatile_ops (stmt1))
10336                  break;
10337                gsi_from_tmp = gsi_tmp4;
10338                gsi_prev (&gsi_tmp4);
10339                lhs = gimple_get_lhs (stmt1);
10340                if (!lhs)
10341                  break;
10342
10343         /* LHS of vectorized stmt must be SSA_NAME.  */
10344         /*       if (TREE_CODE (lhs) != SSA_NAME)
10345                  break;
10346
10347                if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
10348                  {
10349                /* Remove dead scalar statement.  */
10350         /*           if (has_zero_uses (lhs))
10351                      {
10352                        gsi_remove (&gsi_from_tmp, true);
10353                        continue;
10354                      }
10355                  }
10356
10357                 gsi_move_before (&gsi_from_tmp, &gsi_to_tmp);
10358                /* Shift GSI_TO for further insertion.  */
10359           /*     gsi_prev (&gsi_to_tmp);
10360              }
10361         }
10362       }*/

当 VF 为 4 时，需要先合并 mask，再将合并后的 mask 作为参数加入到向量数学函数的调用中。

mask合并代码

10410       vec<constructor_elt, va_gc> *ret_ctor_elts_tmp = NULL;
10411       vec_alloc (ret_ctor_elts_tmp, 2);
10412       CONSTRUCTOR_APPEND_ELT(ret_ctor_elts_tmp, NULL_TREE, mask2); // 添加第二个左子树
10413       CONSTRUCTOR_APPEND_ELT(ret_ctor_elts_tmp, NULL_TREE, mask); // 添加第一个左子树
10414
10415     //  tree signed_boolean_type = build_nonstandard_integer_type(64, 1);
10416       tree signed_boolean_type = build_nonstandard_boolean_type(64);
10417
10418       tree vect_type = build_vector_type(signed_boolean_type, 4);
10419       tree constructor = build_constructor(vect_type, ret_ctor_elts_tmp);
10420
10421       tree new_var_constru = create_tmp_var(vect_type, "mask_array");
10422       gimple *new_stmt_construc = gimple_build_assign(make_ssa_name(new_var_constru), constructor);
10423       gsi_next(&gsi);
10424       gsi_insert_after (&gsi, new_stmt_construc, GSI_SAME_STMT);

将合并后的mask加入到数学函数里面

195                       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mask_operand)
196                       {
197                         gimple *use_stmt;
198                         use_stmt = USE_STMT (use_p);
199                         if(is_gimple_assign(use_stmt)) {
200                           tree rhs1_tmp1 = gimple_assign_rhs1(use_stmt);
201                           if (TREE_CODE(rhs1_tmp1) == CONSTRUCTOR) {
202                             tree lhs_tmp1 = gimple_assign_lhs(use_stmt);
203                             if(stmt_vecmath)
204                             add_mask_to_call(stmt_vecmath,lhs_tmp1);
205                           }
206                         }
207                       }

oneapi的cfg图

在移动语句的过程中，如果 store bb 中某条语句的 LHS 在 store bb 之外的其他 bb 中被使用，则需要在使用处之前重新计算该值，并让使用语句改用新计算出的结果。

10490         /* Check that LHS does not have uses outside of STORE_BB.  */
10491         res = true;
10492     //    FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
10493         gimple *use_lhs;
10494         FOR_EACH_IMM_USE_STMT (use_lhs, imm_iter, lhs)
10495     {
10496         gimple *use_stmt;
10497         FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter) {
10498
10499       //  gimple *use_stmt;
10500         use_stmt = USE_STMT (use_p);
10501         if (is_gimple_debug (use_stmt))
10502         continue;
10503       if (gimple_bb (use_stmt) != store_bb && gimple_bb (use_stmt) != gimple_bb (last))
10504     {
10505          // res = false;
10506
10507         if (dump_enabled_p ())
10508     dump_printf_loc (MSG_NOTE, vect_location,
10509          "LHS have use outside of store_BB\n%G", stmt1);
10510         tree new_lhs,new_lhs1,new_lhs2;
10511         tree new_lhs_phi;
10512         gphi *phi;
10513         tree vectype;
10514         tree zero;
10515         gimple *zero_def;
10516
10517         gimple *new_assign_stmt;
10518
10519         if (is_gimple_assign(stmt1) && is_gimple_assign(use_lhs)) {
10520         for (unsigned int i = 1; i < gimple_num_ops(use_stmt); i++) {
10521           tree rhs = gimple_op(use_stmt, i);
10522           if(TREE_CODE (rhs) == SSA_NAME && (rhs == lhs)) {
10523
10524            if (dump_enabled_p ())
10525               dump_printf_loc (MSG_NOTE, vect_location,
10526          "insert new stmt to use out of BB\n");
10527             new_lhs = create_tmp_var(TREE_TYPE(lhs), "new_tmp_var");
10528             new_lhs1 = make_ssa_name(new_lhs,NULL);
10529             tree rhs1 = gimple_assign_rhs1(stmt1);
10530             tree rhs2 = gimple_assign_rhs2(stmt1);
10531             new_assign_stmt = gimple_build_assign(new_lhs1, gimple_assign_rhs_code(stmt1), rhs1, rhs2);
10532
10533             gimple_stmt_iterator gsi_temp = gsi_for_stmt(use_stmt);
10534             gsi_insert_before (&gsi_temp,new_assign_stmt,GSI_SAME_STMT);
10535             update_stmt(new_assign_stmt);
10536
10537             if( i == 1) {
10538
10539               gimple_assign_set_rhs1(use_stmt, new_lhs1);
10540            //   update_stmt(use_stmt);
10541             }
10542             else if (i == 2) {
10543               gimple_assign_set_rhs2(use_stmt, new_lhs1);
10544             //  update_stmt(use_stmt);
10545             }
10546
10547           //  update_stmt(use_stmt);
10548           }
10549         }
10550       }
10551     }
10552         }
10553
10554             update_stmt(use_stmt);
10555     } */

消除同一个 reduction 在 loop 中被多次使用的情况

        # temp_value.920_2824 = PHI <tmp_var.921_2823(234), 0.0(279)>
48420   # temp_value.923_2821 = PHI <tmp_var.924_2820(234), 0.0(279)>
48421   # temp_value.926_2814 = PHI <tmp_var.927_2813(234), 0.0(279)>
48422   # temp_value.929_2807 = PHI <tmp_var.930_2806(234), 0.0(279)>
48423   # temp_value.932_2800 = PHI <tmp_var.933_2798(234), 0.0(279)>

        _ifc__2843 = _3089 ? _2132 : 0.0;
48574   tmp_var.927_2813 = _ifc__2843 + temp_value.926_2814;
48575   _ifc__2842 = _3084 ? _2145 : 0.0;
48576   tmp_var.930_2806 = _ifc__2842 + temp_value.929_2807;
48577   _ifc__2841 = _3192 ? _2085 : 0.0;
48578   tmp_var.921_2823 = _ifc__2841 + temp_value.920_2824;
48579   _ifc__2840 = _3172 ? _2101 : 0.0;
48580   tmp_var.933_2798 = _ifc__2840 + temp_value.932_2800;
48581   _ifc__2839 = _3161 ? _2113 : 0.0;
48582   tmp_var.924_2820 = _ifc__2839 + temp_value.923_2821;

        # tmp_sumi.922_2822 = PHI <tmp_var.921_2823(83), 0.0(81), 0.0(276)>
48880   # tmp_sumi.925_2816 = PHI <tmp_var.924_2820(83), 0.0(81), 0.0(276)>
48881   # tmp_sumi.928_2809 = PHI <tmp_var.927_2813(83), 0.0(81), 0.0(276)>
48882   # tmp_sumi.931_2805 = PHI <tmp_var.930_2806(83), 0.0(81), 0.0(276)>
48883   # tmp_sumi.934_2793 = PHI <tmp_var.933_2798(83), 0.0(81), 0.0(276)>

        _2752 = tmp_sumi.922_2822 + tmp_sumi.925_2816;
48885   _2750 = _2752 + tmp_sumi.928_2809;
48886   _2747 = _2750 + tmp_sumi.931_2805;
48887   _2746 = _2747 + tmp_sumi.934_2793;

        _2156 = ri1i_2025 + _2746;
48931   _2163 = _2160 * _2746;

1761           for (k = 0; k < lpears[i] + upears[i]; k++) {
1762
1763             if (pearlist[i] == NULL) {
1764                fprintf(nabout,
1765                        "NULL pair list entry in egb loop 1, taskid = %d\n",
1766                        mytaskid);
1767                fflush(nabout);
1768             }
1769             j = pearlist[i][k];
1770
1771             xij = xi - x[dim * j];
1772             yij = yi - x[dim * j + 1];
1773             zij = zi - x[dim * j + 2];
1774             r2 = xij * xij + yij * yij + zij * zij;
1775
1776             if (dim == 4) {                     // delete
1777                wij = wi - x[dim * j + 3];
1778                r2 += wij * wij;
1779             }
1780
1781             if (r2 > rgbmaxpsmax2)      //  %hir.cmp.4310 ule
1782                continue;
1783             dij1i = 1.0 / sqrt(r2);
1784             dij = r2 * dij1i;
1785             sj = fs[j] * (rborn[j] - BOFFSET);   //   select fast
1786             sj2 = sj * sj;
1787
1788             /*
1789              * ---following are from the Appendix of Schaefer and Froemmel,
1790              * JMB 216:1045-1066, 1990;  Taylor series expansion for d>>s
1791              * is by Andreas Svrcek-Seiler; smooth rgbmax idea is from
1792              * Andreas Svrcek-Seiler and Alexey Onufriev.
1793              */
1794
1795             if (dij > rgbmax + sj)      // rgbmax = 20;   %hir.cmp.4333 ule
1796                continue;
1797
1798             if ((dij > rgbmax - sj)) {      //    %hir.cmp.4349  ogt
1799                uij = 1. / (dij - sj);
1800                sumi -= 0.125 * dij1i * (1.0 + 2.0 * dij * uij +
1801                                         rgbmax2i * (r2 -
1802                                                     4.0 * rgbmax *
1803                                                     dij - sj2) +
1804                                         2.0 * log((dij - sj) * rgbmax1i));
1805
1806             } else if (dij > 4.0 * sj) {
1807                dij2i = dij1i * dij1i;
1808                tmpsd = sj2 * dij2i;
1809                dumbo =
1810                    TA + tmpsd * (TB +
1811                                  tmpsd * (TC +
1812                                           tmpsd * (TD + tmpsd * TDD)));
1813                sumi -= sj * tmpsd * dij2i * dumbo;
1814
1815             } else if (dij > ri + sj) {
1816                sumi -= 0.5 * (sj / (r2 - sj2) +
1817                               0.5 * dij1i * log((dij - sj) / (dij + sj)));
1818
1819             } else if (dij > fabs(ri - sj)) {
1820                theta = 0.5 * ri1i * dij1i * (r2 + ri * ri - sj2);
1821                uij = 1. / (dij + sj);
1822                sumi -= 0.25 * (ri1i * (2. - theta) - uij +
1823                                dij1i * log(ri * uij));
1824
1825             } else if (ri < sj) {
1826                sumi -= 0.5 * (sj / (r2 - sj2) + 2. * ri1i +
1827                               0.5 * dij1i * log((sj - dij) / (sj + dij)));
1828
1829             }
1830
1831          }

1：if 分支中的 fprintf 调用无法分析出内存依赖关系，导致该 if 无法进行 ifcvt。（lim pass 无法将其外提，也是因为 fprintf 的内存依赖关系无法分析。）

解决：将其外提到最内层循环外面。

2 ： dim常量传播（ipa-cp pass)

mme → mme34 → egb

dim 作为全局变量无法常量传播，作为函数参数的时候可以传播到。

解决：新建一个 pass，识别全局变量（当其没有作为函数参数传递时）和函数调用关系，在函数调用的地方将变量替换为常量值。（pass 的位置？是否有现成的参数能解决？）

根据 inline pass 的 debug 信息，发现 mme34 无法 inline 进 mme，原因是 --param early-inlining-insns= 的值过小；将此值调大后，可以成功 inline。

inline 过后

      ;;   basic block 2, loop depth 0, count 27580514 (estimated locally), maybe hot
74798 ;;    prev block 0, next block 3, flags: (NEW, REACHABLE, VISITED)
74799 ;;    pred:       ENTRY [always]  count:27580514 (estimated locally) (FALLTHRU,EXECUTABLE)
74800   # .MEM_2325 = VDEF <.MEM_2324(D)>
74801   dim.lto_priv.0D.4751 = 3;
74802   # VUSE <.MEM_2325>

      basic block 96, loop depth 2, count 954868629 (estimated locally), maybe hot
77095 ;;    prev block 95, next block 97, flags: (NEW, REACHABLE, VISITED)
77096 ;;    pred:       94 [82.6% (guessed)]  count:788435027 (estimated locally) (FALSE_VALUE,EXECUTABLE)
77097 ;;                95 [always]  count:166433602 (estimated locally) (FALLTHRU,EXECUTABLE)

          _698 = dim.lto_priv.0D.4751;
77112     _699 = j_697 * _698;

          if (_698 == 4)
77146      goto <bb 97>; [34.00%]
77147       else
77148       goto <bb 98>; [66.00%]

怀疑是 mme34 函数中其他部分的代码影响了常量传播的分析。注释掉 mme34 函数中的部分代码后，发现 dim = 3 能够被当作常量传播。

 _77 = j_76 * 3;

但是需要同时注释掉的内容较多，无法准确找到哪部分代码影响了传播，以及这部分代码的特性。

写了一个例子发现其静态全局变量可以成功作为常量计算，怀疑是mme34函数中的其他部分，影响到dim的常量传播。

 1     #include<stdio.h>
 2     #include<math.h>
 3     #include<stdlib.h>
 4
 5
 6     static int threshold = 5;
 7
 8     static inline int check_value1(int x) {
 9       if(threshold < 20)
10       return x*threshold;
11       else return threshold;
12     }
13
14     static inline int check_value2(int x) {
15       if(threshold < 5)
16       return x+threshold;
17       else return threshold;
18     }
19     static inline int check_value3(int x) {
20       threshold = 10;
21       return check_value1(x);
22     }
23     static inline int check_value4(int x) {
24       threshold = 50;
25       return check_value2(x);
26     }
27
28     int use_threshold(int threshold) {
29
30       return 10 + threshold;
31     }
32     int main()
33     {
34       int num = 30;
35       int num2 = 5;
36       int ans3 = use_threshold(threshold);
37       int ans1 = check_value3(num);
38       int ans2 = check_value4(num2);
39       int ans = ans1 + ans2 +ans3;
40       printf("ans is %d\n",ans);
41       return 0;
42     }

查看ccp pass 中的debug的信息

39040 Visiting statement:
39041 # VUSE <.MEM_2279>
39042 _698 = dim.lto_priv.0D.4751;
39043 which is likely CONSTANT
39044 Lattice value changed to VARYING.  Adding SSA edges to worklist.

在这里进行gdb 调试，

69046 Substituting values and folding statements
69047
69048 Folding statement: dim = 3;
69049 Not folded

1761          for (k = 0; k < lpears[i] + upears[i]; k++) {
1762
1763             if (pearlist[i] == NULL) {
1764                fprintf(nabout,
1765                        "NULL pair list entry in egb loop 1, taskid = %d\n",
1766                        mytaskid);
1767                fflush(nabout);abort();
1768             }
1769             j = pearlist[i][k];
1770

在 ifcvt pass 中查看，发现该 if 并没有被外提，因此仍然无法 ifcvt。

插入 abort 时需要识别的 pattern

14044   <bb 148> [local count: 919275880]:
14045   _2044 = _127 + _2039;
14046   _2045 = *_2044;
14047   if (_2045 == 0B)
14048     goto <bb 149>; [17.43%]
14049   else
14050     goto <bb 150>; [82.57%]
14051
14052   <bb 149> [local count: 160229786]:
14053   _2046 = 0;
14054   _2047 = nabout;
14055   fprintf (_2047, "NULL pair list entry in egb loop 1, taskid = %d\n", _2046);
14056   _2048 = nabout;
14057   fflush (_2048);
14058
14059   <bb 150> [local count: 919275880]:
14060   _2049 = *_2044;
14061   _2051 = (long unsigned int) k_2050;
14062   _2052 = _2051 * 4;
14063   _2053 = _2049 + _2052;
14064   j_2054 = *_2053;

Eff.c:3282

build_base_HygonGCC_Spec2017_rate_perf-test.cfg-64.0000

build_base_HygonGCC_Spec2017_rate_perf.cfg-64.0001

加上一个参数使 mme34 内联进 mme 中，但是 dim = 3 的常量传播仍然无法做到。写了一个静态全局变量的例子，发现其能够传播到，因此怀疑是函数中的其他代码影响了对常量的分析，导致无法传播；通过注释原题中的部分代码验证了这一点，但尚未定位到具体是哪部分代码造成的影响。

加上if -continue 107

不加 106

Base 99.6

2.6 带有mask的向量数学函数

相关文章：