diff mbox

[Cilkplus] Implementation of the Mask Clause in elemental function

Message ID BF230D13CA30DD48930C31D4099330000F261B41@FMSMSX102.amr.corp.intel.com
State New
Headers show

Commit Message

Iyer, Balaji V May 23, 2012, 9:19 p.m. UTC
Hello Everyone,
     This patch is for the Cilkplus branch and affects both the C and C++ compilers. It implements the mask clause of elemental functions.

Thanking You,

Yours Sincerely,

Balaji V. Iyer.
diff mbox

Patch

Index: cgraph.h
===================================================================
--- cgraph.h	(revision 187789)
+++ cgraph.h	(working copy)
@@ -630,7 +630,8 @@ 
 void tree_function_versioning (tree, tree, VEC (ipa_replace_map_p,gc)*,
 			       bool, bitmap, bool, bitmap, basic_block);
 void tree_elem_fn_versioning (tree, tree, VEC (ipa_replace_map_p,gc)*,
-			      bool, bitmap, bool, bitmap, basic_block, int);
+			      bool, bitmap, bool, bitmap, basic_block, int,
+			      bool);
 bool cgraph_process_new_functions (void);
 void cgraph_process_same_body_aliases (void);
 void fixup_same_cpp_alias_visibility (symtab_node node, symtab_node target, tree alias);
Index: tree.h
===================================================================
--- tree.h	(revision 187789)
+++ tree.h	(working copy)
@@ -6234,6 +6234,9 @@ 
 tree build_call_list (tree return_type, tree fn, tree arglist);
 tree build_function_linkage_variant (tree ttype,
 				     enum function_linkage linkage);
+bool is_elem_fn (tree);
+enum elem_fn_parm_type find_elem_fn_parm_type (gimple, tree, tree*);
+void elem_fn_create_fn (tree) __attribute__((weak));
 
 /* Functional interface to the builtin functions.  */
 
Index: cgraphunit.c
===================================================================
--- cgraphunit.c	(revision 187789)
+++ cgraphunit.c	(working copy)
@@ -226,15 +226,23 @@ 
 static bool
 cgraph_decide_is_function_needed (struct cgraph_node *node, tree decl)
 {
+  bool is_cloned_elem_func = false;
   /* If the user told us it is used, then it must be so.  */
   if (node->symbol.force_output)
     return true;
 
+  /* When an elemental function is cloned, its elem_fn_already_cloned flag
+     is set to true; for all other functions it is initialized to zero.
+     So, for a cloned elemental function, the assertion below is relaxed.  */
+  if (DECL_STRUCT_FUNCTION (decl))
+    is_cloned_elem_func = DECL_STRUCT_FUNCTION (decl)->elem_fn_already_cloned;
+  
   /* Double check that no one output the function into assembly file
      early.  */
   gcc_checking_assert (!DECL_ASSEMBLER_NAME_SET_P (decl)
 	               || (node->thunk.thunk_p || node->same_body_alias)
-	               ||  !TREE_SYMBOL_REFERENCED (DECL_ASSEMBLER_NAME (decl)));
+	               ||  !TREE_SYMBOL_REFERENCED (DECL_ASSEMBLER_NAME (decl))
+		       || is_cloned_elem_func);
 
 
   /* Keep constructors, destructors and virtual functions.  */
Index: cilk.h
===================================================================
--- cilk.h	(revision 187789)
+++ cilk.h	(working copy)
@@ -195,6 +195,8 @@ 
 #define notify_zc_intrinsic_fndecl      cilk_trees[NOTIFY_ZC_INTRINSIC]
 #define notify_intrinsic_fndecl         cilk_trees[NOTIFY_INTRINSIC]
 
+/* This is the maximum number of entries we can have in elem-function arrays */
+#define MAX_VARS 50
 
 typedef struct zca_data_t
 {
@@ -205,7 +207,41 @@ 
   struct zca_data_t *ptr_next;
 } zca_data;
 
+/* These are the different mask options.  The values start at 12345 so that
+ * we can differentiate them during debugging.  */
+enum mask_options {
+  USE_MASK = 12345,
+  USE_NOMASK,
+  USE_BOTH
+};
 
+/* This data structure holds all the data parsed from the vector attribute */
+typedef struct
+{
+  char *proc_type; /* string given in the processor clause, or NULL */
+  enum mask_options mask; /* USE_BOTH unless mask or nomask is given */
+  int vectorlength[MAX_VARS]; /* values from the vectorlength clause */
+  int no_vlengths; /* number of entries used in vectorlength */
+  char *uniform_vars[MAX_VARS]; /* names listed in the uniform clause */
+  int no_uvars; /* number of entries used in uniform_vars */
+  int uniform_location[MAX_VARS]; /* their location in parm list */
+  char *linear_vars[MAX_VARS]; /* names listed in the linear clause */
+  int linear_steps[MAX_VARS]; /* step of the matching linear_vars entry */
+  int linear_location[MAX_VARS]; /* their location in parm list */
+  int no_lvars; /* number of entries used in linear_vars */
+  int private_location[MAX_VARS]; /* parm not in uniform or linear list */
+  int no_pvars; /* number of entries used in private_location */
+  char *func_prefix;
+  int total_no_args; /* number of parameters of the function */
+} elem_fn_info;
+
+/* Data handed to the tree walker that rewrites vector refs as array refs */
+typedef struct {
+  tree induction_var; /* index used for the generated array references */
+  tree arguments; /* chain of the function's parameters (DECL_ARGUMENTS) */
+  tree return_var; /* temporary holding the return value, may be NULL */
+} fn_vect_elements;
+
 /* Offset of fields in the Cilk frame descriptor.
    Index is same as for cilk_trees.  If the index
    does not correspond to a field of the Cilk frame
@@ -270,6 +306,14 @@ 
 extern void debug_zca_data (void);
 extern zca_data *get_zca_entry (int);
 extern void insert_in_zca_table (zca_data);
-extern bool is_elem_fn (tree);				 
+extern bool is_elem_fn (tree);
 extern tree find_elem_fn_name (tree, tree, tree);
+extern void elem_fn_create_fn (tree);
+extern char *find_processor_code (elem_fn_info *);
+extern char *find_vlength_code (elem_fn_info *);
+extern tree rename_elem_fn (tree, const char *);
+extern char *find_suffix (elem_fn_info *, bool);
+extern enum elem_fn_parm_type find_elem_fn_parm_type (gimple, tree, tree *);
+extern tree find_elem_fn_name (tree, tree, tree);
+elem_fn_info *extract_elem_fn_values (tree);
 #endif /* GCC_CILK_H */
Index: elem-function.c
===================================================================
--- elem-function.c	(revision 187789)
+++ elem-function.c	(working copy)
@@ -1,9 +1,10 @@ 
 /* This file is part of the Intel(R) Cilk(TM) Plus support
-   This file contains the functions for Elemental functions.
+   This file contains C/C++ specific functions for elemental
+   functions.
    
    Copyright (C) 2012  Free Software Foundation, Inc.
    Written by Balaji V. Iyer <balaji.v.iyer@intel.com>,
-   Intel Corporation
+              Intel Corporation
 
    Many Thanks to Karthik Kumar for advice on the basic technique
    about cloning functions.
@@ -29,7 +30,8 @@ 
 #include "coretypes.h"
 #include "tm.h"
 #include "tree.h"
-#include "rtl.h"
+#include "langhooks.h"
+#include "cilk.h"
 #include "tm_p.h"
 #include "hard-reg-set.h"
 #include "basic-block.h"
@@ -40,8 +42,8 @@ 
 #include "tree-dump.h"
 #include "tree-pass.h"
 #include "timevar.h"
-#include "cfgloop.h"
 #include "flags.h"
+#include "c-tree.h"
 #include "tree-inline.h"
 #include "cgraph.h"
 #include "ipa-prop.h"
@@ -52,196 +54,12 @@ 
 #include "intl.h"
 #include "vec.h"
 
-#define MAX_VARS 50
 
-enum mask_options {
-  USE_MASK = 12345,
-  USE_NOMASK,
-  USE_BOTH
-};
-
-typedef struct
-{
-  char *proc_type;
-  enum mask_options mask;
-  int vectorlength[MAX_VARS];
-  int no_vlengths;
-  char *uniform_vars[MAX_VARS];
-  int no_uvars;
-  int uniform_location[MAX_VARS]; /* their location in parm list */
-  char *linear_vars[MAX_VARS];
-  int linear_steps[MAX_VARS];
-  int linear_location[MAX_VARS]; /* their location in parm list */
-  int no_lvars;
-  int private_location[MAX_VARS]; /* parm not in uniform or linear list */
-  int no_pvars;
-  char *func_prefix;
-  int total_no_args;
-} elem_fn_info;
-
-static elem_fn_info *extract_elem_fn_values (tree);
 static tree create_optimize_attribute (int);
 static tree create_processor_attribute (elem_fn_info *, tree *);
+static tree elem_fn_build_array (tree base_var, tree index);
 
-/* this is an helper function for find_elem_fn_param_type */
-static enum elem_fn_parm_type
-find_elem_fn_parm_type_1 (tree fndecl, int parm_no, tree *step_size)
-{
-  int ii = 0;
-  elem_fn_info *elem_fn_values;
 
-  elem_fn_values = extract_elem_fn_values (fndecl);
-  if (!elem_fn_values)
-    return TYPE_NONE;
-
-  for (ii = 0; ii < elem_fn_values->no_lvars; ii++)
-    if (elem_fn_values->linear_location[ii] == parm_no)
-      {
-	if (step_size != NULL)
-	  *step_size = build_int_cst (integer_type_node,
-				      elem_fn_values->linear_steps[ii]);
-	return TYPE_LINEAR;
-      }
-    
-  for (ii = 0; ii < elem_fn_values->no_uvars; ii++)
-    if (elem_fn_values->uniform_location[ii] == parm_no)
-      return TYPE_UNIFORM;
-    
-  return TYPE_NONE;
-}
-  
-  
-/* this function will return the type of a parameter in elemental function.
-   The choices are UNIFORM or LINEAR. */
-enum elem_fn_parm_type
-find_elem_fn_parm_type (gimple stmt, tree op, tree *step_size)
-{
-  tree fndecl, parm = NULL_TREE;
-  int ii, nargs;
-  enum elem_fn_parm_type return_type = TYPE_NONE;
-  
-  if (gimple_code (stmt) != GIMPLE_CALL)
-    return TYPE_NONE;
-
-  fndecl = gimple_call_fndecl (stmt);
-  gcc_assert (fndecl);
-
-  nargs = gimple_call_num_args (stmt);
-
-  for (ii = 0; ii < nargs; ii++)
-    {
-      parm = gimple_call_arg (stmt, ii);
-      if (op == parm)
-	{
-	  return_type = find_elem_fn_parm_type_1 (fndecl, ii, step_size);
-	  return return_type;
-	}
-    }
-  return return_type;
-}
-  
-/* this function will concatinate the suffix to the existing function decl */
-static tree
-rename_elem_fn (tree decl, const char *suffix)
-{
-  int length = 0;
-  const char *fn_name = IDENTIFIER_POINTER (DECL_NAME (decl));
-  char *new_fn_name;
-  tree new_decl = NULL_TREE;
-  
-  if (!suffix || !fn_name)
-    return decl;
-  else
-    new_decl = decl;
-
-  length = strlen (fn_name) + strlen (suffix) + 1;
-  new_fn_name = (char *)xmalloc (length);
-  strcpy (new_fn_name, fn_name);
-  strcat (new_fn_name, suffix);
-
-  DECL_NAME (new_decl) = get_identifier (new_fn_name);
-  return new_decl;
-}
-
-/* this function will check to see if the node is part of an function that
- * needs to be converted to its vector equivalent. */
-bool
-is_elem_fn (tree fndecl)
-{
-  tree ii_tree;
-
-  for (ii_tree = DECL_ATTRIBUTES (fndecl); ii_tree;
-       ii_tree = TREE_CHAIN (ii_tree))
-    {
-      tree ii_value = TREE_PURPOSE (ii_tree);
-      if (TREE_CODE (ii_value) == IDENTIFIER_NODE
-	  && !strcmp (IDENTIFIER_POINTER (ii_value), "vector"))
-	return true;
-    }
-
-  /* If we are here, then we didn't find a vector keyword, so it is false */
-  return false;
-}
-
-/* This function will find the appropriate processor code in the function
- * mangling vector function
- */
-static char *
-find_processor_code (elem_fn_info *elem_fn_values)
-{
-  if (!elem_fn_values || !elem_fn_values->proc_type)
-    return xstrdup ("B");
-
-  if (!strcmp (elem_fn_values->proc_type, "pentium_4"))
-    return xstrdup ("B");
-  else if (!strcmp (elem_fn_values->proc_type, "pentium4_sse3"))
-    return xstrdup ("D");
-  else if (!strcmp (elem_fn_values->proc_type, "core2_duo_ssse3"))
-    return xstrdup ("E");
-  else if (!strcmp (elem_fn_values->proc_type, "core2_duo_sse_4_1"))
-    return xstrdup ("F");
-  else if (!strcmp (elem_fn_values->proc_type, "core_i7_sse4_2"))
-    return xstrdup ("H");
-  else
-    gcc_unreachable ();
-
-  return NULL; /* should never get here */
-}
-
-/* this function will return vectorlength, if specified, in string format -OR-
- * it will give the default vector length for the specified architecture. */
-static char *
-find_vlength_code (elem_fn_info *elem_fn_values)
-{
-  char *vlength_code = (char *) xmalloc (sizeof (char) * 10);
-  if (!elem_fn_values)
-    { 
-      sprintf (vlength_code, "4");
-      return vlength_code;
-    }
-
-  memset (vlength_code, 10, 0);
-  
-  if (elem_fn_values->no_vlengths != 0)
-    sprintf(vlength_code,"%d", elem_fn_values->vectorlength[0]);
-  else
-    {
-      if (!strcmp (elem_fn_values->proc_type, "pentium_4"))
-	sprintf (vlength_code,"4");
-      else if (!strcmp (elem_fn_values->proc_type, "pentium4_sse3"))
-	sprintf (vlength_code, "4");
-      else if (!strcmp (elem_fn_values->proc_type, "core2_duo_ssse3"))
-	sprintf (vlength_code, "4");
-      else if (!strcmp (elem_fn_values->proc_type, "core2_duo_sse_4_1"))
-	sprintf (vlength_code, "4");
-      else if (!strcmp (elem_fn_values->proc_type, "core_i7_sse4_2"))
-	sprintf (vlength_code, "4");
-      else
-	gcc_unreachable ();
-    }
-  return vlength_code;
-}
-
 /* This function will create the appropriate __target__ attribute for the
  * processor */
 static tree
@@ -356,7 +174,16 @@ 
   VEC(tree,gc) *opt_vec = VEC_alloc (tree,gc, 4);
   char optimization[2];
   optimization[0] = 'O';
-  sprintf(&optimization[1], "%1d", option);
+  
+  if (option == 3)
+    optimization[1] = '3';
+  else if (option == 2)
+    optimization[1] = '2';
+  else if (option == 1)
+    optimization[1] = '1';
+  else if (option == 0)
+    optimization[1] = '0';
+  
   VEC_safe_push (tree, gc, opt_vec, build_string (2, optimization));
   opt_attr = build_tree_list_vec (opt_vec);
   VEC_truncate (tree, opt_vec, 0);
@@ -364,343 +191,363 @@ 
   return opt_attr;
 }
 
-/* this function will find the appropriate mangling suffix for the vector
- * function */
-static char *
-find_suffix (elem_fn_info *elem_fn_values, bool masked)
+
+/* this function will store return expression to a temporary var */
+static tree
+replace_return_with_new_var (tree *tp, int *walk_subtrees, void *data)
 {
-  char *suffix = (char*)xmalloc (100);
-  char tmp_str[10];
-  int arg_number, ii_pvar, ii_uvar, ii_lvar;
-  strcpy (suffix, "._simdsimd_");
-  strcat (suffix, find_processor_code (elem_fn_values));
-  strcat (suffix, find_vlength_code (elem_fn_values));
-  if (masked)
-    strcpy (suffix, "m");
-  else
-    strcat (suffix, "n");
+  tree mod_expr = NULL_TREE, return_var = NULL_TREE, ret_expr = NULL_TREE;
+  
+  if (!*tp)
+    return NULL_TREE;
 
-  for (arg_number = 1; arg_number <= elem_fn_values->total_no_args;
-       arg_number++)
+  if (TREE_CODE (*tp) == RETURN_EXPR)
     {
-      for (ii_lvar = 0; ii_lvar < elem_fn_values->no_lvars; ii_lvar++)
+      return_var = (tree) data;
+      ret_expr = TREE_OPERAND (TREE_OPERAND (*tp, 0), 1);
+      mod_expr = build2 (MODIFY_EXPR, TREE_TYPE (return_var), return_var,
+			 ret_expr);
+      *tp = mod_expr;
+      *walk_subtrees = 0;
+    }
+  return NULL_TREE;
+}
+
+
+/* This function will create a vector access as an array access */
+static tree
+elem_fn_build_array (tree base_var, tree index)
+{
+  return build_array_ref (UNKNOWN_LOCATION, base_var, index);
+}
+
+/* this function will replace all vector references with array references. */
+static tree
+replace_array_ref_for_vec (tree *tp, int *walk_subtrees, void *data)
+{
+  tree ii_var;
+  fn_vect_elements *func_data;
+  if (!*tp)
+    return NULL_TREE;
+
+  if (TREE_CODE (*tp) == VAR_DECL || TREE_CODE (*tp) == PARM_DECL)
+    {
+      func_data = (fn_vect_elements *) data;
+      gcc_assert (func_data->induction_var);
+      for (ii_var = func_data->arguments; ii_var; ii_var = DECL_CHAIN (ii_var))
 	{
-	  if (elem_fn_values->linear_location[ii_lvar] == arg_number)
+	  if (DECL_NAME (ii_var) == DECL_NAME (*tp))
 	    {
-	      strcat (suffix, "_l");
-	      sprintf(tmp_str, "%d", elem_fn_values->linear_steps[ii_lvar]);
-	      strcat (suffix, tmp_str);
+	      *tp =  elem_fn_build_array (*tp, func_data->induction_var);
+	      *walk_subtrees = 0;
+	      return NULL_TREE;
 	    }
 	}
-      for (ii_uvar = 0; ii_uvar < elem_fn_values->no_uvars; ii_uvar++)
+      if (func_data->return_var &&
+	  (DECL_NAME (*tp) == DECL_NAME (func_data->return_var)))
 	{
-	  if (elem_fn_values->uniform_location[ii_uvar] == arg_number)
-	    strcat (suffix, "_s1");
+	  *tp = elem_fn_build_array (*tp, func_data->induction_var);
+	  *walk_subtrees = 0;
 	}
-      for (ii_pvar = 0; ii_pvar < elem_fn_values->no_pvars; ii_pvar++)
-	{
-	  if (elem_fn_values->private_location[ii_pvar] == arg_number)
-	    strcat (suffix, "_v1");
-	}
-    } 
-  return suffix;
+    }
+  return NULL_TREE;
 }
 
-tree
-find_elem_fn_name (tree old_fndecl,
-		   tree vectype_out ATTRIBUTE_UNUSED,
-		   tree vectype_in ATTRIBUTE_UNUSED)
+/* this function will move return values to the end of the function */
+static void
+fix_elem_fn_return_value (tree fndecl, tree induction_var)
 {
-  elem_fn_info *elem_fn_values = NULL;
-  tree new_fndecl = NULL_TREE, arg_type = NULL_TREE;
-  char *suffix = NULL;
+  fn_vect_elements data;
+  tree old_fndecl;
+  tree new_var, new_var_init,  new_body = NULL_TREE;
+  tree ret_expr, ret_stmt = NULL_TREE;
+  if (!fndecl || !DECL_SAVED_TREE (fndecl))
+    return;
+
+  if (TREE_TYPE (DECL_RESULT (fndecl)) == void_type_node)
+    return;
+
+  old_fndecl = current_function_decl;
+  push_cfun (DECL_STRUCT_FUNCTION (fndecl));
+  current_function_decl = fndecl;
   
-  elem_fn_values = extract_elem_fn_values (old_fndecl);
- 
-  if (elem_fn_values)
+  new_var = create_tmp_var (TREE_TYPE (DECL_RESULT (fndecl)), "elem_fn_ret");
+  new_var_init =
+    build_vector_from_val
+    (TREE_TYPE (DECL_RESULT (fndecl)),
+     build_zero_cst (TREE_TYPE (TREE_TYPE (DECL_RESULT (fndecl)))));
+  DECL_INITIAL (new_var) = new_var_init;
+  walk_tree (&DECL_SAVED_TREE (fndecl), replace_return_with_new_var,
+	     (void *)new_var, NULL);
+  data.return_var = new_var;
+  data.arguments = DECL_ARGUMENTS (fndecl);
+  data.induction_var = induction_var;
+
+  walk_tree (&DECL_SAVED_TREE (fndecl), replace_array_ref_for_vec,
+	     (void *) &data, NULL);
+  ret_expr = build2 (MODIFY_EXPR, TREE_TYPE (new_var),
+		     DECL_RESULT (fndecl), new_var);
+  
+  ret_stmt = build1 (RETURN_EXPR, TREE_TYPE (ret_expr), ret_expr);
+  if (TREE_CODE (DECL_SAVED_TREE (fndecl)) == BIND_EXPR)
     {
-      if (elem_fn_values->no_vlengths > 0)
+      
+      if (!BIND_EXPR_BODY (DECL_SAVED_TREE (fndecl)))
+        ;
+      else if (TREE_CODE (BIND_EXPR_BODY (DECL_SAVED_TREE (fndecl))) !=
+	       TREE_LIST)
 	{
-	  if (elem_fn_values->vectorlength[0] ==
-	      (int)TYPE_VECTOR_SUBPARTS (vectype_out))
-	    suffix = find_suffix (elem_fn_values, false);
-	  else
-	    return NULL_TREE;
+	  append_to_statement_list_force
+	    (BIND_EXPR_BODY (DECL_SAVED_TREE (fndecl)), &new_body);
+	  append_to_statement_list_force (ret_stmt, &new_body);
 	}
       else
-	return NULL_TREE;
+	{
+	  new_body = BIND_EXPR_BODY (DECL_SAVED_TREE (fndecl));
+	  append_to_statement_list_force (ret_stmt, &new_body);
+	}
+      BIND_EXPR_BODY (DECL_SAVED_TREE (fndecl)) = new_body;
     }
-  else
+
+  pop_cfun ();
+  current_function_decl = old_fndecl;
+  return;
+}
+
+/* this function will break a vector value to scalar with a for loop in front */
+static tree
+add_elem_fn_loop (tree fndecl, int vlength)
+{
+  tree exit_label = NULL_TREE, if_label = NULL_TREE, body_label = NULL_TREE;
+  tree fn_body, loop = NULL_TREE, loop_var, mod_var, incr_expr, cond_expr;
+  tree cmp_expr, old_fndecl;
+  
+  if (!fndecl)
+    return NULL_TREE; 
+
+  if (!DECL_SAVED_TREE (fndecl))
     return NULL_TREE;
 
-  new_fndecl = copy_node (rename_elem_fn (old_fndecl, suffix));
-  TREE_TYPE (new_fndecl) = copy_node (TREE_TYPE (old_fndecl));
+  old_fndecl = current_function_decl;
+  push_cfun (DECL_STRUCT_FUNCTION (fndecl));
+  current_function_decl = fndecl;
+  
+  if (TREE_CODE (DECL_SAVED_TREE (fndecl)) == BIND_EXPR)
+    fn_body = BIND_EXPR_BODY (DECL_SAVED_TREE (fndecl));
+  else
+    fn_body = DECL_SAVED_TREE (fndecl);
 
-  TYPE_ARG_TYPES (TREE_TYPE (new_fndecl)) =
-    copy_list (TYPE_ARG_TYPES (TREE_TYPE (new_fndecl)));
+  loop = alloc_stmt_list ();
   
-  for (arg_type = TYPE_ARG_TYPES (TREE_TYPE (new_fndecl));
-       arg_type && arg_type != void_type_node;
-       arg_type = TREE_CHAIN (arg_type))
-    TREE_VALUE (arg_type) = vectype_out;
+  loop_var = create_tmp_var (integer_type_node, "ii_elem_fn_vec_val");
+  mod_var = build2 (MODIFY_EXPR, void_type_node, loop_var,
+		    build_int_cst (integer_type_node, 0));
+  append_to_statement_list_force (mod_var, &loop);
   
-  if (TREE_TYPE (TREE_TYPE (new_fndecl)) != void_type_node)
-    {
-      TREE_TYPE (TREE_TYPE (new_fndecl)) =
-	copy_node (TREE_TYPE (TREE_TYPE (new_fndecl)));
-      TREE_TYPE (TREE_TYPE (new_fndecl)) = vectype_out;
-      DECL_MODE (new_fndecl) = TYPE_MODE (vectype_out);
-    }
+  if_label = build_decl (UNKNOWN_LOCATION, LABEL_DECL,
+			 get_identifier ("if_lab"), void_type_node);
+  DECL_CONTEXT (if_label) = fndecl;
+  DECL_ARTIFICIAL (if_label) = 0;
+  DECL_IGNORED_P (if_label) = 1;
+
+  exit_label = build_decl (UNKNOWN_LOCATION, LABEL_DECL,
+			   get_identifier ("exit_label"), void_type_node);
+  DECL_CONTEXT (exit_label) = fndecl;
+  DECL_ARTIFICIAL (exit_label) = 0;
+  DECL_IGNORED_P (exit_label) = 1;
+
+  body_label = build_decl (UNKNOWN_LOCATION, LABEL_DECL,
+			   get_identifier ("body_label"), void_type_node);
+  DECL_CONTEXT (body_label) = fndecl;
+  DECL_ARTIFICIAL (body_label) = 0;
+  DECL_IGNORED_P (body_label) = 1;
+  append_to_statement_list_force (build1 (LABEL_EXPR, void_type_node,
+					  if_label), &loop);
+  cmp_expr = build2 (LT_EXPR, boolean_type_node, loop_var,
+		     build_int_cst (integer_type_node, vlength));
+  cond_expr = build3 (COND_EXPR, void_type_node, cmp_expr,
+		      build1 (GOTO_EXPR, void_type_node, body_label),
+		      build1 (GOTO_EXPR, void_type_node, exit_label));
+
+  append_to_statement_list_force (cond_expr, &loop);
+  append_to_statement_list_force (build1 (LABEL_EXPR, void_type_node,
+					  body_label), &loop);
+  append_to_statement_list_force (fn_body, &loop);
+
+  incr_expr = build2 (MODIFY_EXPR, void_type_node, loop_var,
+		      build2 (PLUS_EXPR, TREE_TYPE (loop_var), loop_var,
+			      build_int_cst (integer_type_node, 1)));
+
+  append_to_statement_list_force (incr_expr, &loop);
+  append_to_statement_list_force (build1 (GOTO_EXPR, void_type_node, if_label),
+				  &loop);
+  append_to_statement_list_force (build1 (LABEL_EXPR, void_type_node,
+					  exit_label), &loop);
   
-  return new_fndecl;
+  if (TREE_CODE (DECL_SAVED_TREE (fndecl)) == BIND_EXPR)
+    BIND_EXPR_BODY (DECL_SAVED_TREE (fndecl)) = loop;
+  else
+    DECL_SAVED_TREE (fndecl) = loop;
+
+  pop_cfun ();
+  current_function_decl = old_fndecl;
+  
+  return loop_var;
 }
 
-/* this function wil create the elemental vector function node */
-static struct cgraph_node *
-create_elem_fn_nodes (struct cgraph_node *node)
+/* Wrap the body of masked clone FNDECL in "if (mask != 0)"; no return value */
+static void
+add_elem_fn_mask (tree fndecl)
 {
-  tree new_decl, old_decl, new_decl_name, opt_attr;
-  tree proc_attr, opp_proc_attr = NULL_TREE;
-  struct cgraph_node *new_node;
-  elem_fn_info *elem_fn_values = NULL;
-  char *suffix = NULL;
-  
-  old_decl = node->symbol.decl;
-  new_decl = copy_node (old_decl);
-  TREE_TYPE (new_decl) = copy_node (TREE_TYPE (old_decl));
-  elem_fn_values = extract_elem_fn_values (old_decl);
+  tree ii_arg;
+  tree cond_expr, cmp_expr, old_fndecl;
+  tree fn_body = NULL_TREE;
 
-  if (elem_fn_values)
+  if (!DECL_SAVED_TREE (fndecl))
+    return; /* no body to wrap: bail out before cfun is pushed */
+
+  old_fndecl = current_function_decl;
+  push_cfun (DECL_STRUCT_FUNCTION (fndecl));
+  current_function_decl = fndecl;
+
+  for (ii_arg = DECL_ARGUMENTS (fndecl); DECL_CHAIN (ii_arg);
+       ii_arg = DECL_CHAIN (ii_arg))
     {
-      suffix = find_suffix (elem_fn_values, false);
+      ; /* walk to the last parameter; it is asserted below to be the mask */
     }
+  if (TREE_CODE (DECL_SAVED_TREE (fndecl)) == BIND_EXPR)
+    fn_body = BIND_EXPR_BODY (DECL_SAVED_TREE (fndecl));
   else
-    return NULL;
-  
-  new_decl_name = rename_elem_fn (new_decl, suffix);
+    fn_body = DECL_SAVED_TREE (fndecl); /* not sure if we ever get here */
 
-  SET_DECL_ASSEMBLER_NAME (new_decl, DECL_NAME(new_decl_name));
-  SET_DECL_RTL (new_decl, NULL);
-  TREE_SYMBOL_REFERENCED (DECL_NAME (new_decl_name)) = 1;
-  
-  new_node = cgraph_copy_node_for_versioning (node, new_decl, NULL, NULL);
-  new_node->symbol.externally_visible = node->symbol.externally_visible;
-  new_node->lowered = true;
+  gcc_assert (DECL_NAME (ii_arg) == get_identifier ("__elem_fn_mask"));
 
-  tree_elem_fn_versioning (old_decl, new_decl, NULL, false, NULL, false, NULL,
-			   NULL, elem_fn_values->vectorlength[0]);
-  cgraph_call_function_insertion_hooks (new_node);
-  DECL_STRUCT_FUNCTION (new_decl)->elem_fn_already_cloned = true;
-  DECL_STRUCT_FUNCTION (new_decl)->curr_properties = cfun->curr_properties;
-  DECL_ATTRIBUTES (cfun->decl) =
-    remove_attribute ("vector", DECL_ATTRIBUTES (cfun->decl));
-  DECL_ATTRIBUTES (new_node->symbol.decl) =
-    remove_attribute ("vector", DECL_ATTRIBUTES (new_node->symbol.decl));
+  cmp_expr = fold_build2 (NE_EXPR, TREE_TYPE (ii_arg), ii_arg,
+			  build_int_cst (TREE_TYPE (TREE_TYPE (ii_arg)), 0));
+  cond_expr = fold_build3 (COND_EXPR, void_type_node, cmp_expr, fn_body,
+			   build_empty_stmt (UNKNOWN_LOCATION));
 
-  proc_attr = create_processor_attribute (elem_fn_values, &opp_proc_attr);
+  if (TREE_CODE (DECL_SAVED_TREE (fndecl)) == BIND_EXPR)
+    BIND_EXPR_BODY (DECL_SAVED_TREE (fndecl)) = cond_expr;
+  else
+    DECL_SAVED_TREE (fndecl) = cond_expr;
+
+  pop_cfun ();
+  current_function_decl = old_fndecl;
   
-  if (proc_attr)
-    decl_attributes (&new_node->symbol.decl, proc_attr, 0);
-  if (opp_proc_attr)
-    decl_attributes (&cfun->decl, opp_proc_attr, 0);
+  return;
+
+}
 
-  opt_attr = create_optimize_attribute (3); /* this will turn vectorizer on */
-  if (opt_attr)
-    decl_attributes (&new_node->symbol.decl, opt_attr, 0);
+/* this function will do hacks necessary to recognize the cloned function */
+static void
+cg_hacks (tree fndecl)
+{
+  const tree outer = current_function_decl;
+  struct function *f = DECL_STRUCT_FUNCTION (fndecl);
+
+  f->curr_properties = cfun->curr_properties;
+  push_cfun (f);
+  current_function_decl = fndecl;
   
-  return new_node;
+  cgraph_add_new_function (fndecl, false);
+  cgraph_finalize_function (fndecl, true);
+
+  pop_cfun ();
+  current_function_decl = outer;
+
+  return;
 }
 
-/* This function will extact the vector attribute and store the data in the
- * elem_fn_info structure.
- */
-static elem_fn_info *
-extract_elem_fn_values (tree decl)
+/* this function will create clones for function marked with vector attribute */
+void
+elem_fn_create_fn (tree fndecl)
 {
+  tree new_masked_fn = NULL_TREE, new_unmasked_fn = NULL_TREE;
+  tree induction_var = NULL_TREE;
   elem_fn_info *elem_fn_values = NULL;
-  int x = 0; /* this is a dummy variable */
-  int arg_number = 0, ii = 0;
-  tree ii_tree, jj_tree, kk_tree;
-  tree decl_attr = DECL_ATTRIBUTES (decl);
-  
-  if (!decl_attr)
-    return NULL;
+  char *masked_suffix = NULL, *unmasked_suffix = NULL;
+  tree proc_attr = NULL_TREE, opp_proc_attr = NULL_TREE, opt_attr = NULL_TREE;
+  if (!fndecl)
+    return;
 
-  elem_fn_values = (elem_fn_info *)xmalloc (sizeof (elem_fn_info));
-  gcc_assert (elem_fn_values);
+  elem_fn_values = extract_elem_fn_values (fndecl);
 
-  elem_fn_values->proc_type = NULL;
-  elem_fn_values->mask = USE_BOTH;
-  elem_fn_values->no_vlengths = 0;
-  elem_fn_values->no_uvars = 0;
-  elem_fn_values->no_lvars = 0;
-  
+  if (!elem_fn_values)
+    return;
 
-  for (ii_tree = decl_attr; ii_tree; ii_tree = TREE_CHAIN (ii_tree))
+  if (elem_fn_values->mask == USE_MASK)
+    masked_suffix = find_suffix (elem_fn_values, true);
+  else if (elem_fn_values->mask == USE_NOMASK)
+    unmasked_suffix = find_suffix (elem_fn_values, false);
+  else
     {
-      tree ii_purpose = TREE_PURPOSE (ii_tree);
-      tree ii_value = TREE_VALUE (ii_tree);
-      if (TREE_CODE (ii_purpose) == IDENTIFIER_NODE
-	  && !strcmp (IDENTIFIER_POINTER (ii_purpose), "vector"))
-	{
-	  for (jj_tree = ii_value; jj_tree;
-	       jj_tree = TREE_CHAIN (jj_tree))
-	    {
-	      tree jj_value = TREE_VALUE (jj_tree);
-	      tree jj_purpose = TREE_PURPOSE (jj_value);
-	      if (TREE_CODE (jj_purpose) == IDENTIFIER_NODE
-		  && !strcmp (IDENTIFIER_POINTER (jj_purpose), "processor"))
-		{
-		  for (kk_tree = TREE_VALUE (jj_value); kk_tree;
-		       kk_tree = TREE_CHAIN (kk_tree))
-		    {
-		      tree kk_value = TREE_VALUE (kk_tree);
-		      if (TREE_CODE (kk_value) == STRING_CST)
-			elem_fn_values->proc_type =
-			  xstrdup (TREE_STRING_POINTER (kk_value));
-		    }
-		}
-	      else if (TREE_CODE (jj_purpose) == IDENTIFIER_NODE
-		       && !strcmp (IDENTIFIER_POINTER (jj_purpose),
-				  "vectorlength"))
-		{
-		  for (kk_tree = TREE_VALUE (jj_value); kk_tree;
-		       kk_tree = TREE_CHAIN (kk_tree))
-		    {
-		      tree kk_value = TREE_VALUE (kk_tree);
-		      if (TREE_CODE (kk_value) == INTEGER_CST)
-			{
-			  x = elem_fn_values->no_vlengths;
-			  elem_fn_values->vectorlength[x] =
-			    (int) TREE_INT_CST_LOW (kk_value);
-			  elem_fn_values->no_vlengths++;
-			}
-		    }
-		}
-	      else if (TREE_CODE (jj_purpose) == IDENTIFIER_NODE
-		       && !strcmp (IDENTIFIER_POINTER (jj_purpose), "uniform"))
-		{
-		  for (kk_tree = TREE_VALUE (jj_value); kk_tree;
-		       kk_tree = TREE_CHAIN (kk_tree))
-		    {
-		      tree kk_value = TREE_VALUE (kk_tree);
-		      elem_fn_values->uniform_vars[elem_fn_values->no_uvars] =
-			xstrdup (TREE_STRING_POINTER (kk_value));
-		      elem_fn_values->no_uvars++;
-		    }
-		}
-	      else if (TREE_CODE (jj_purpose) == IDENTIFIER_NODE
-		       && !strcmp (IDENTIFIER_POINTER (jj_purpose), "linear"))
-		{
-		  for (kk_tree = TREE_VALUE (jj_value); kk_tree;
-		       kk_tree = TREE_CHAIN (kk_tree))
-		    {
-		      tree kk_value = TREE_VALUE (kk_tree);
-		      elem_fn_values->linear_vars[elem_fn_values->no_lvars] =
-			xstrdup (TREE_STRING_POINTER (kk_value));
-		      kk_tree = TREE_CHAIN (kk_tree);
-		      kk_value = TREE_VALUE (kk_tree);
-		      elem_fn_values->linear_steps[elem_fn_values->no_lvars] =
-			TREE_INT_CST_LOW (kk_value);
-		      elem_fn_values->no_lvars++;
-		    }
-		}
-	      else if (TREE_CODE (jj_purpose) == IDENTIFIER_NODE
-		       && !strcmp (IDENTIFIER_POINTER (jj_purpose), "mask"))
-		elem_fn_values->mask = USE_MASK;
-	      else if (TREE_CODE (jj_purpose) == IDENTIFIER_NODE
-		       && !strcmp (IDENTIFIER_POINTER (jj_purpose), "nomask"))
-		elem_fn_values->mask = USE_NOMASK;
-	    }
-	}
+      masked_suffix   = find_suffix (elem_fn_values, true);
+      unmasked_suffix = find_suffix (elem_fn_values, false);
     }
 
-  for (ii_tree = DECL_ARGUMENTS (decl); ii_tree; ii_tree = DECL_CHAIN (ii_tree))
+  if (masked_suffix)
     {
-      arg_number++;
-      bool already_found = false;
-      for (ii = 0; ii < elem_fn_values->no_uvars; ii++)
-	{
-	  if (DECL_NAME (ii_tree)
-	      && !strcmp (IDENTIFIER_POINTER (DECL_NAME (ii_tree)),
-			  elem_fn_values->uniform_vars[ii]))
-	    {
-	      already_found = true;
-	      elem_fn_values->uniform_location[ii] = arg_number;
-	    }
-	}
-      for (ii = 0; ii < elem_fn_values->no_lvars; ii++)
-	{
-	  if (DECL_NAME (ii_tree)
-	      && !strcmp (IDENTIFIER_POINTER (DECL_NAME (ii_tree)),
-			  elem_fn_values->linear_vars[ii]))
-	    {
-	      if (already_found)
-		  fatal_error
-		    ("variable %s defined in both uniform and linear clause",
-		     elem_fn_values->linear_vars[ii]);
-	      else
-		{
-		  already_found = true;
-		  elem_fn_values->linear_location[ii] = arg_number;
-		}
-	    }
-	}
-      if (!already_found) /* this means this variable is a private */
-	elem_fn_values->private_location[elem_fn_values->no_pvars++] =
-	  arg_number;
-    }
+      new_masked_fn = copy_node (fndecl);
+      new_masked_fn = rename_elem_fn (new_masked_fn, masked_suffix);
+      SET_DECL_RTL (new_masked_fn, NULL);
+      TREE_SYMBOL_REFERENCED (DECL_NAME (new_masked_fn)) = 1;
+      tree_elem_fn_versioning (fndecl, new_masked_fn, NULL, false, NULL, false,
+			       NULL, NULL, elem_fn_values->vectorlength[0],
+			       true);
+      proc_attr = create_processor_attribute (elem_fn_values, &opp_proc_attr);
+      if (proc_attr)
+	decl_attributes (&new_masked_fn, proc_attr, 0);
+      if (opp_proc_attr)
+	decl_attributes (&fndecl, opp_proc_attr, 0);
+      
+      opt_attr = create_optimize_attribute (3); /* will turn vectorizer on */
+      if (opt_attr)
+	decl_attributes (&new_masked_fn, opt_attr, 0);
 
-  elem_fn_values->total_no_args = arg_number;
-  
-  return elem_fn_values;
-}  
-
-/* Entry point function for creating the vector elemental function */
-static unsigned int
-create_elem_vec_fn (void)
-{
-  struct cgraph_node *ii_node, *copied_node;
-  
-  FOR_EACH_DEFINED_FUNCTION (ii_node)
-    {
-      tree node_decl = ii_node->symbol.decl;
-      if (is_elem_fn (node_decl)
-	  && DECL_STRUCT_FUNCTION (node_decl) 
-	  && !DECL_STRUCT_FUNCTION (node_decl)->elem_fn_already_cloned)
-	{
-       	  copied_node = create_elem_fn_nodes (ii_node);
-	  if (DECL_RTL (ii_node->symbol.decl))
-	    {
-	      SET_DECL_RTL (copied_node->symbol.decl,
-			    copy_rtx (DECL_RTL (ii_node->symbol.decl)));
-	      XEXP (DECL_RTL (copied_node->symbol.decl), 0) =
-		gen_rtx_SYMBOL_REF
-		(GET_MODE (XEXP (DECL_RTL (ii_node->symbol.decl), 0)),
-		 IDENTIFIER_POINTER (DECL_NAME (copied_node->symbol.decl)));
-	    }
-	  
-	}
+      DECL_ATTRIBUTES (new_masked_fn) =
+	remove_attribute ("vector", DECL_ATTRIBUTES (new_masked_fn));
+	
+      add_elem_fn_mask (new_masked_fn);
+      induction_var = add_elem_fn_loop (new_masked_fn,
+					elem_fn_values->vectorlength[0]);
+      fix_elem_fn_return_value (new_masked_fn, induction_var);
+      cg_hacks (new_masked_fn);
+      SET_DECL_ASSEMBLER_NAME (new_masked_fn, DECL_NAME (new_masked_fn));
+      if (DECL_STRUCT_FUNCTION (new_masked_fn))
+	DECL_STRUCT_FUNCTION (new_masked_fn)->elem_fn_already_cloned = true;
     }
-  return 0;
-}
- 
-
-struct gimple_opt_pass pass_elem_fn =
-  {
+  if (unmasked_suffix)
     {
-      GIMPLE_PASS,
-      "tree_elem_fn",			/* name */
-      0,				/* gate */
-      create_elem_vec_fn,		/* execute */
-      NULL,				/* sub */
-      NULL,				/* next */
-      0,				/* static_pass_number */
-      TV_NONE,				/* tv_id */
-      PROP_gimple_any| PROP_cfg, 	/* properties_required */
-      0,				/* properties_provided */
-      0,				/* properties_destroyed */
-      0,				/* todo_flags_start */
-      TODO_verify_flow,			/* todo_flags_finish */
+      new_unmasked_fn = copy_node (fndecl);
+      new_unmasked_fn = rename_elem_fn (new_unmasked_fn, unmasked_suffix);
+      SET_DECL_RTL (new_unmasked_fn, NULL);
+      TREE_SYMBOL_REFERENCED (DECL_NAME (new_unmasked_fn)) = 1;
+      tree_elem_fn_versioning (fndecl, new_unmasked_fn, NULL, false, NULL,
+			       false, NULL, NULL,
+			       elem_fn_values->vectorlength[0], false);
+      proc_attr = create_processor_attribute (elem_fn_values, &opp_proc_attr);
+      if (proc_attr)
+	decl_attributes (&new_unmasked_fn, proc_attr, 0);
+      if (opp_proc_attr)
+	decl_attributes (&fndecl, opp_proc_attr, 0);
+      
+      opt_attr = create_optimize_attribute (3); /* will turn vectorizer on */
+      if (opt_attr)
+	decl_attributes (&new_unmasked_fn, opt_attr, 0);
+
+      DECL_ATTRIBUTES (new_unmasked_fn) =
+	remove_attribute ("vector", DECL_ATTRIBUTES (new_unmasked_fn));
+      induction_var = add_elem_fn_loop (new_unmasked_fn,
+					elem_fn_values->vectorlength[0]);
+      fix_elem_fn_return_value (new_unmasked_fn, induction_var);
+      cg_hacks (new_unmasked_fn);
+      SET_DECL_ASSEMBLER_NAME (new_unmasked_fn, DECL_NAME (new_unmasked_fn));
+      if (DECL_STRUCT_FUNCTION (new_unmasked_fn))
+	DECL_STRUCT_FUNCTION (new_unmasked_fn)->elem_fn_already_cloned = true;
     }
-  };
+  DECL_ATTRIBUTES (fndecl) = remove_attribute ("vector",
+					       DECL_ATTRIBUTES (fndecl));
+  free (elem_fn_values);
+  return;
+}
Index: elem-function-common.c
===================================================================
--- elem-function-common.c	(revision 0)
+++ elem-function-common.c	(revision 0)
@@ -0,0 +1,461 @@ 
+/* This file is part of the Intel(R) Cilk(TM) Plus support
+   This file contains the language independent functions for
+   Elemental functions.
+   
+   Copyright (C) 2012  Free Software Foundation, Inc.
+   Written by Balaji V. Iyer <balaji.v.iyer@intel.com>,
+              Intel Corporation
+
+   Many Thanks to Karthik Kumar for advice on the basic technique
+   about cloning functions.
+   
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "tm.h"
+#include "tree.h"
+#include "langhooks.h"
+#include "cilk.h"
+#include "tm_p.h"
+#include "hard-reg-set.h"
+#include "basic-block.h"
+#include "output.h"
+#include "c-family/c-common.h"
+#include "diagnostic.h"
+#include "tree-flow.h"
+#include "tree-dump.h"
+#include "tree-pass.h"
+#include "timevar.h"
+#include "flags.h"
+#include "c-tree.h"
+#include "tree-inline.h"
+#include "cgraph.h"
+#include "ipa-prop.h"
+#include "opts.h"
+#include "tree-iterator.h"
+#include "toplev.h"
+#include "options.h"
+#include "intl.h"
+#include "vec.h"
+#include "cilk.h"
+
+#define MAX_VARS 50
+
+enum elem_fn_parm_type find_elem_fn_parm_type (gimple, tree, tree *);
+bool is_elem_fn (tree);
+tree find_elem_fn_name (tree old_fndecl, tree vectype_out, tree vectype_in);
+elem_fn_info *extract_elem_fn_values (tree decl);
+
+/* Return a freshly xstrdup'ed one-letter processor code for use in the
+   mangled name of a vector function.  A NULL ELEM_FN_VALUES or a missing
+   processor type yields "B"; an unknown processor is gcc_unreachable.  */
+char *
+find_processor_code (elem_fn_info *elem_fn_values)
+{
+  const char *proc = elem_fn_values ? elem_fn_values->proc_type : NULL;
+  const char *code = "B";
+
+  if (proc == NULL || strcmp (proc, "pentium_4") == 0)
+    code = "B";
+  else if (strcmp (proc, "pentium4_sse3") == 0)
+    code = "D";
+  else if (strcmp (proc, "core2_duo_ssse3") == 0)
+    code = "E";
+  else if (strcmp (proc, "core2_duo_sse_4_1") == 0)
+    code = "F";
+  else if (strcmp (proc, "core_i7_sse4_2") == 0)
+    code = "H";
+  else
+    gcc_unreachable ();
+
+  return xstrdup (code);
+}
+
+/* Return the vector length used in the mangled clone name as a freshly
+   allocated string that the caller must free: the first vectorlength
+   clause value if one was given, otherwise the per-processor default.
+   A NULL ELEM_FN_VALUES or a missing processor type yields the default
+   length "4" instead of being passed to strcmp.  */
+char *
+find_vlength_code (elem_fn_info *elem_fn_values)
+{
+  /* Large enough for any int printed with %d, sign and NUL included.  */
+  const size_t buf_size = 3 * sizeof (int) + 2;
+  char *vlength_code = (char *) xmalloc (buf_size);
+  const char *proc;
+
+  /* Start from an all-NUL buffer; memset takes the fill value second and
+     the byte count last.  */
+  memset (vlength_code, 0, buf_size);
+
+  if (elem_fn_values && elem_fn_values->no_vlengths != 0)
+    {
+      snprintf (vlength_code, buf_size, "%d", elem_fn_values->vectorlength[0]);
+      return vlength_code;
+    }
+
+  /* Every supported processor currently defaults to a length of 4.  */
+  proc = elem_fn_values ? elem_fn_values->proc_type : NULL;
+  if (proc && strcmp (proc, "pentium_4") && strcmp (proc, "pentium4_sse3")
+      && strcmp (proc, "core2_duo_ssse3")
+      && strcmp (proc, "core2_duo_sse_4_1")
+      && strcmp (proc, "core_i7_sse4_2"))
+    gcc_unreachable ();
+  snprintf (vlength_code, buf_size, "4");
+  return vlength_code;
+}
+
+
+/* Append SUFFIX to the name of DECL.  DECL itself is modified and returned;
+   no copy is made, so callers that need the original name preserved must
+   pass in a copy.  DECL is returned untouched when SUFFIX is NULL or DECL
+   has no name.  */
+tree
+rename_elem_fn (tree decl, const char *suffix)
+{
+  char *new_fn_name;
+
+  /* Test DECL_NAME before taking IDENTIFIER_POINTER of it; an anonymous
+     decl has nothing to rename.  */
+  if (!suffix || !DECL_NAME (decl))
+    return decl;
+
+  new_fn_name = concat (IDENTIFIER_POINTER (DECL_NAME (decl)), suffix, NULL);
+  DECL_NAME (decl) = get_identifier (new_fn_name);
+
+  /* get_identifier keeps its own copy of the spelling, so the temporary
+     buffer can be released here instead of being leaked.  */
+  free (new_fn_name);
+  return decl;
+}
+
+
+/* Build the mangling suffix for a vector function: "._simdsimd_", the
+   processor code, the vector length, "m" for a masked or "n" for an
+   unmasked clone, then for each argument position in order "_l<step>" if
+   it is linear, "_s1" if it is uniform and "_v1" if it is private.  The
+   result is heap-allocated and must be freed by the caller.  A NULL
+   ELEM_FN_VALUES produces the prefix with no argument tags.  */
+char *
+find_suffix (elem_fn_info *elem_fn_values, bool masked)
+{
+  char *proc_code = find_processor_code (elem_fn_values);
+  char *vlen_code = find_vlength_code (elem_fn_values);
+  char step_str[3 * sizeof (int) + 2]; /* Fits any int printed with %d.  */
+  char *suffix;
+  int arg_number, ii;
+  /* Grow the string with concat/reconcat rather than strcat into a fixed
+     size buffer, which a long argument list could overrun.  */
+  suffix = concat ("._simdsimd_", proc_code, vlen_code,
+		   masked ? "m" : "n", NULL);
+  free (proc_code);
+  free (vlen_code);
+  if (!elem_fn_values)
+    return suffix;
+
+  for (arg_number = 1; arg_number <= elem_fn_values->total_no_args;
+       arg_number++)
+    {
+      for (ii = 0; ii < elem_fn_values->no_lvars; ii++)
+	if (elem_fn_values->linear_location[ii] == arg_number)
+	  {
+	    snprintf (step_str, sizeof (step_str), "%d",
+		      elem_fn_values->linear_steps[ii]);
+	    suffix = reconcat (suffix, suffix, "_l", step_str, NULL);
+	  }
+      for (ii = 0; ii < elem_fn_values->no_uvars; ii++)
+	if (elem_fn_values->uniform_location[ii] == arg_number)
+	  suffix = reconcat (suffix, suffix, "_s1", NULL);
+      for (ii = 0; ii < elem_fn_values->no_pvars; ii++)
+	if (elem_fn_values->private_location[ii] == arg_number)
+	  suffix = reconcat (suffix, suffix, "_v1", NULL);
+    }
+  return suffix;
+}
+
+
+/* Helper for find_elem_fn_parm_type: classify argument PARM_NO of FNDECL as
+   TYPE_LINEAR (step stored in *STEP_SIZE if non-NULL), TYPE_UNIFORM or
+   TYPE_NONE.  PARM_NO is matched as-is against the recorded positions.  */
+static enum elem_fn_parm_type
+find_elem_fn_parm_type_1 (tree fndecl, int parm_no, tree *step_size)
+{
+  enum elem_fn_parm_type kind = TYPE_NONE;
+  elem_fn_info *elem_fn_values = extract_elem_fn_values (fndecl);
+  int ii;
+  if (!elem_fn_values)
+    return TYPE_NONE;
+  for (ii = 0; kind == TYPE_NONE && ii < elem_fn_values->no_lvars; ii++)
+    if (elem_fn_values->linear_location[ii] == parm_no)
+      {
+	if (step_size != NULL)
+	  *step_size = build_int_cst (integer_type_node,
+				      elem_fn_values->linear_steps[ii]);
+	kind = TYPE_LINEAR;
+      }
+  for (ii = 0; kind == TYPE_NONE && ii < elem_fn_values->no_uvars; ii++)
+    if (elem_fn_values->uniform_location[ii] == parm_no)
+      kind = TYPE_UNIFORM;
+  /* The info block is a fresh allocation on every call; release it.  */
+  free (elem_fn_values);
+  return kind;
+}
+  
+  
+/* Return how OP is passed to the elemental function called by STMT:
+   TYPE_LINEAR (with the step stored in *STEP_SIZE when that is non-NULL),
+   TYPE_UNIFORM, or TYPE_NONE.  TYPE_NONE is also returned when STMT is not
+   a call, has no known callee decl, or does not pass OP at all.  */
+enum elem_fn_parm_type
+find_elem_fn_parm_type (gimple stmt, tree op, tree *step_size)
+{
+  tree fndecl;
+  int ii, nargs;
+
+  if (gimple_code (stmt) != GIMPLE_CALL)
+    return TYPE_NONE;
+
+  /* An indirect call has no fndecl and hence no "vector" attribute to
+     consult.  */
+  fndecl = gimple_call_fndecl (stmt);
+  if (!fndecl)
+    return TYPE_NONE;
+
+  nargs = gimple_call_num_args (stmt);
+  for (ii = 0; ii < nargs; ii++)
+    if (gimple_call_arg (stmt, ii) == op)
+      /* Call arguments are indexed from 0 here, whereas the positions
+	 recorded by extract_elem_fn_values are counted from 1.  */
+      return find_elem_fn_parm_type_1 (fndecl, ii + 1, step_size);
+
+  return TYPE_NONE;
+}
+/* Return a new FUNCTION_DECL naming the unmasked vector clone of OLD_FNDECL
+   whose first vectorlength equals the number of elements in VECTYPE_OUT,
+   or NULL_TREE if OLD_FNDECL has no attribute data, no vectorlength
+   clause, or a first vectorlength that differs.  In the returned decl every
+   argument type, and a non-void return type, is replaced by VECTYPE_OUT.
+   OLD_FNDECL itself is not renamed.  */
+tree
+find_elem_fn_name (tree old_fndecl, tree vectype_out,
+		   tree vectype_in ATTRIBUTE_UNUSED)
+{
+  elem_fn_info *elem_fn_values = extract_elem_fn_values (old_fndecl);
+  tree new_fndecl, arg_type;
+  char *suffix = NULL;
+
+  if (elem_fn_values
+      && elem_fn_values->no_vlengths > 0
+      && (elem_fn_values->vectorlength[0]
+	  == (int) TYPE_VECTOR_SUBPARTS (vectype_out)))
+    suffix = find_suffix (elem_fn_values, false);
+
+  /* Only the suffix is needed from here on.  */
+  free (elem_fn_values);
+  if (!suffix)
+    return NULL_TREE;
+
+  /* Copy before renaming: rename_elem_fn changes the decl it is given, and
+     OLD_FNDECL must keep its scalar name.  */
+  new_fndecl = rename_elem_fn (copy_node (old_fndecl), suffix);
+  free (suffix);
+
+  TREE_TYPE (new_fndecl) = copy_node (TREE_TYPE (old_fndecl));
+  TYPE_ARG_TYPES (TREE_TYPE (new_fndecl)) =
+    copy_list (TYPE_ARG_TYPES (TREE_TYPE (new_fndecl)));
+
+  /* Stop at the entry whose value is void, which terminates a prototyped
+     argument list, so that the terminator is not turned into a vector.  */
+  for (arg_type = TYPE_ARG_TYPES (TREE_TYPE (new_fndecl));
+       arg_type && TREE_VALUE (arg_type) != void_type_node;
+       arg_type = TREE_CHAIN (arg_type))
+    TREE_VALUE (arg_type) = vectype_out;
+
+  if (TREE_TYPE (TREE_TYPE (new_fndecl)) != void_type_node)
+    {
+      TREE_TYPE (TREE_TYPE (new_fndecl)) = vectype_out;
+      DECL_MODE (new_fndecl) = TYPE_MODE (vectype_out);
+    }
+  return new_fndecl;
+}
+
+/* Decode the "vector" attribute of DECL into a newly allocated elem_fn_info
+   that the caller must free.  Returns NULL if DECL has no attributes.  */
+elem_fn_info *
+extract_elem_fn_values (tree decl)
+{
+  elem_fn_info *elem_fn_values = NULL;
+  int x = 0; /* scratch index into vectorlength[] */
+  int arg_number = 0, ii = 0;
+  tree ii_tree, jj_tree, kk_tree;
+  tree decl_attr = DECL_ATTRIBUTES (decl);
+
+  if (!decl_attr)
+    return NULL;
+
+  /* Zero-fill so that every counter and location slot starts out as 0.  */
+  elem_fn_values = XCNEW (elem_fn_info);
+
+  elem_fn_values->proc_type = NULL;
+  elem_fn_values->mask = USE_BOTH;
+  elem_fn_values->no_vlengths = 0;
+  elem_fn_values->no_uvars = 0;
+  elem_fn_values->no_lvars = 0;
+  elem_fn_values->no_pvars = 0;
+
+  for (ii_tree = decl_attr; ii_tree; ii_tree = TREE_CHAIN (ii_tree))
+    {
+      tree ii_purpose = TREE_PURPOSE (ii_tree);
+      tree ii_value = TREE_VALUE (ii_tree);
+      if (TREE_CODE (ii_purpose) == IDENTIFIER_NODE
+	  && !strcmp (IDENTIFIER_POINTER (ii_purpose), "vector"))
+	{
+	  for (jj_tree = ii_value; jj_tree;
+	       jj_tree = TREE_CHAIN (jj_tree))
+	    {
+	      tree jj_value = TREE_VALUE (jj_tree);
+	      tree jj_purpose = TREE_PURPOSE (jj_value);
+	      if (TREE_CODE (jj_purpose) == IDENTIFIER_NODE
+		  && !strcmp (IDENTIFIER_POINTER (jj_purpose), "processor"))
+		{
+		  for (kk_tree = TREE_VALUE (jj_value); kk_tree;
+		       kk_tree = TREE_CHAIN (kk_tree))
+		    {
+		      tree kk_value = TREE_VALUE (kk_tree);
+		      if (TREE_CODE (kk_value) == STRING_CST)
+			elem_fn_values->proc_type =
+			  xstrdup (TREE_STRING_POINTER (kk_value));
+		    }
+		}
+	      else if (TREE_CODE (jj_purpose) == IDENTIFIER_NODE
+		       && !strcmp (IDENTIFIER_POINTER (jj_purpose),
+				  "vectorlength"))
+		{
+		  for (kk_tree = TREE_VALUE (jj_value); kk_tree;
+		       kk_tree = TREE_CHAIN (kk_tree))
+		    {
+		      tree kk_value = TREE_VALUE (kk_tree);
+		      if (TREE_CODE (kk_value) == INTEGER_CST)
+			{
+			  x = elem_fn_values->no_vlengths;
+			  elem_fn_values->vectorlength[x] =
+			    (int) TREE_INT_CST_LOW (kk_value);
+			  elem_fn_values->no_vlengths++;
+			}
+		    }
+		}
+	      else if (TREE_CODE (jj_purpose) == IDENTIFIER_NODE
+		       && !strcmp (IDENTIFIER_POINTER (jj_purpose), "uniform"))
+		{
+		  for (kk_tree = TREE_VALUE (jj_value); kk_tree;
+		       kk_tree = TREE_CHAIN (kk_tree))
+		    {
+		      tree kk_value = TREE_VALUE (kk_tree);
+		      elem_fn_values->uniform_vars[elem_fn_values->no_uvars] =
+			xstrdup (TREE_STRING_POINTER (kk_value));
+		      elem_fn_values->no_uvars++;
+		    }
+		}
+	      else if (TREE_CODE (jj_purpose) == IDENTIFIER_NODE
+		       && !strcmp (IDENTIFIER_POINTER (jj_purpose), "linear"))
+		{
+		  for (kk_tree = TREE_VALUE (jj_value); kk_tree;
+		       kk_tree = TREE_CHAIN (kk_tree))
+		    {
+		      tree kk_value = TREE_VALUE (kk_tree);
+		      elem_fn_values->linear_vars[elem_fn_values->no_lvars] =
+			xstrdup (TREE_STRING_POINTER (kk_value));
+		      kk_tree = TREE_CHAIN (kk_tree); /* step follows name */
+		      kk_value = TREE_VALUE (kk_tree);
+		      elem_fn_values->linear_steps[elem_fn_values->no_lvars] =
+			TREE_INT_CST_LOW (kk_value);
+		      elem_fn_values->no_lvars++;
+		    }
+		}
+	      else if (TREE_CODE (jj_purpose) == IDENTIFIER_NODE
+		       && !strcmp (IDENTIFIER_POINTER (jj_purpose), "mask"))
+		elem_fn_values->mask = USE_MASK;
+	      else if (TREE_CODE (jj_purpose) == IDENTIFIER_NODE
+		       && !strcmp (IDENTIFIER_POINTER (jj_purpose), "nomask"))
+		elem_fn_values->mask = USE_NOMASK;
+	    }
+	}
+    }
+  /* Map each clause variable to its argument position, counted from 1.  */
+  for (ii_tree = DECL_ARGUMENTS (decl); ii_tree;
+       ii_tree = DECL_CHAIN (ii_tree))
+    {
+      bool already_found = false;
+      arg_number++;
+      for (ii = 0; ii < elem_fn_values->no_uvars; ii++)
+	{
+	  if (DECL_NAME (ii_tree)
+	      && !strcmp (IDENTIFIER_POINTER (DECL_NAME (ii_tree)),
+			  elem_fn_values->uniform_vars[ii]))
+	    {
+	      already_found = true;
+	      elem_fn_values->uniform_location[ii] = arg_number;
+	    }
+	}
+      for (ii = 0; ii < elem_fn_values->no_lvars; ii++)
+	{
+	  if (DECL_NAME (ii_tree)
+	      && !strcmp (IDENTIFIER_POINTER (DECL_NAME (ii_tree)),
+			  elem_fn_values->linear_vars[ii]))
+	    {
+	      if (already_found)
+		  fatal_error
+		    ("variable %s defined in both uniform and linear clause",
+		     elem_fn_values->linear_vars[ii]);
+	      else
+		{
+		  already_found = true;
+		  elem_fn_values->linear_location[ii] = arg_number;
+		}
+	    }
+	}
+      if (!already_found) /* this means this variable is a private */
+	elem_fn_values->private_location[elem_fn_values->no_pvars++] =
+	  arg_number;
+    }
+
+  elem_fn_values->total_no_args = arg_number;
+
+  return elem_fn_values;
+}
+
+/* Return true if FNDECL has an attribute named "vector", which marks it as
+   a function that needs to be converted to its vector equivalent.  */
+bool
+is_elem_fn (tree fndecl)
+{
+  tree attr = DECL_ATTRIBUTES (fndecl);
+  bool found = false;
+
+  while (attr && !found)
+    {
+      tree attr_name = TREE_PURPOSE (attr);
+
+      found = (TREE_CODE (attr_name) == IDENTIFIER_NODE
+	       && strcmp (IDENTIFIER_POINTER (attr_name), "vector") == 0);
+      attr = TREE_CHAIN (attr);
+    }
+
+  return found;
+}
Index: gimplify.c
===================================================================
--- gimplify.c	(revision 187789)
+++ gimplify.c	(working copy)
@@ -8360,6 +8360,13 @@ 
 
   oldfn = current_function_decl;
   current_function_decl = fndecl;
+
+  /* Here we check to see if the function has the vector attribute.  If so,
+   * then we must clone it to masked/unmasked versions when appropriate.
+   */
+  if (flag_enable_cilk && is_elem_fn (fndecl))
+    elem_fn_create_fn (fndecl);
+  
   if (DECL_STRUCT_FUNCTION (fndecl))
     push_cfun (DECL_STRUCT_FUNCTION (fndecl));
   else
Index: tree-inline.c
===================================================================
--- tree-inline.c	(revision 187789)
+++ tree-inline.c	(working copy)
@@ -3806,7 +3806,7 @@ 
 static inline void
 elem_fn_add_local_variables (struct function *callee, struct function *caller,
 			     copy_body_data *id, bool check_var_ann,
-			     int vlength)
+			     int vlength ATTRIBUTE_UNUSED)
 {
   tree var;
   unsigned ix;
@@ -3836,9 +3836,6 @@ 
 	    SET_DECL_DEBUG_EXPR (new_var, tem);
 	  }
 	TREE_TYPE (new_var) = copy_node (TREE_TYPE (new_var));
-	TREE_TYPE (new_var) =
-	  build_vector_type (copy_node (TREE_TYPE (new_var)), vlength);
-	DECL_GIMPLE_REG_P (new_var) = 1;
  	add_local_decl (caller, new_var);
       }
 }
@@ -4994,27 +4991,35 @@ 
 static tree
 elem_fn_copy_arguments_for_versioning (tree orig_parm, copy_body_data * id,
 				       bitmap args_to_skip, tree *vars,
-				       int vlength)
+				       int vlength, bool masked)
 {
   tree arg, *parg;
   tree new_parm = NULL;
   int i = 0;
-
+  tree masked_parm = NULL_TREE;
   parg = &new_parm;
 
+  if (masked)
+    {
+      masked_parm = build_decl (UNKNOWN_LOCATION, PARM_DECL,
+				get_identifier ("__elem_fn_mask"),
+				build_vector_type (integer_type_node, vlength));
+      DECL_ARG_TYPE (masked_parm) = build_vector_type (integer_type_node,
+						       vlength);
+      DECL_ARTIFICIAL (masked_parm) = 1;
+      lang_hooks.dup_lang_specific_decl (masked_parm);
+    }
   for (arg = orig_parm; arg; arg = DECL_CHAIN (arg), i++)
     if (!args_to_skip || !bitmap_bit_p (args_to_skip, i))
       {
         tree new_tree = remap_decl (arg, id);
 	if (TREE_CODE (new_tree) != PARM_DECL)
 	  new_tree = id->copy_decl (arg, id);
-	/* bviyer; I am using a dummy value of 4 to make sure this works */
 	TREE_TYPE (new_tree) = copy_node (TREE_TYPE (new_tree));
-	TREE_TYPE (new_tree) =
-	  build_vector_type (TREE_TYPE (new_tree), vlength);
-	DECL_ARG_TYPE (new_tree) =
-	  build_vector_type (DECL_ARG_TYPE (new_tree), vlength);
-	DECL_GIMPLE_REG_P (new_tree) = 1;
+	TREE_TYPE (new_tree) = build_vector_type (TREE_TYPE (new_tree),
+						  vlength);
+	DECL_ARG_TYPE (new_tree) = build_vector_type (DECL_ARG_TYPE (new_tree),
+						      vlength);
         lang_hooks.dup_lang_specific_decl (new_tree);
         *parg = new_tree;
 	parg = &DECL_CHAIN (new_tree);
@@ -5031,6 +5036,14 @@ 
         DECL_CHAIN (var) = *vars;
         *vars = var;
       }
+  if (masked && masked_parm)
+    {
+      for (arg = new_parm; DECL_CHAIN (arg); arg = DECL_CHAIN(arg))
+	;
+      
+      DECL_CONTEXT (masked_parm) = DECL_CONTEXT (arg);
+      DECL_CHAIN (arg) = masked_parm;
+    }
   return new_parm;
 }
 
@@ -5444,20 +5457,60 @@ 
   return;
 }
 
+static void
+initialize_elem_fn_cfun (tree new_fndecl, tree callee_fndecl)
+{
+  struct function *src_cfun = DECL_STRUCT_FUNCTION (callee_fndecl);
+  /* Set up the struct function of NEW_FNDECL from that of CALLEE_FNDECL.
+     First get a clean struct function; it becomes cfun until pop_cfun.  */
+  push_struct_function (new_fndecl);
+
+  /* We will rebuild these, so just sanity check that they are empty.  */
+  gcc_assert (VALUE_HISTOGRAMS (cfun) == NULL);
+  gcc_assert (cfun->local_decls == NULL);
+  gcc_assert (cfun->cfg == NULL);
+  gcc_assert (cfun->decl == new_fndecl);
+
+  /* Copy items we preserve during cloning.  */
+  cfun->static_chain_decl = src_cfun->static_chain_decl;
+  cfun->nonlocal_goto_save_area = src_cfun->nonlocal_goto_save_area;
+  cfun->function_end_locus = src_cfun->function_end_locus;
+  cfun->curr_properties = src_cfun->curr_properties & ~PROP_loops;
+  cfun->last_verified = src_cfun->last_verified;
+  cfun->va_list_gpr_size = src_cfun->va_list_gpr_size;
+  cfun->va_list_fpr_size = src_cfun->va_list_fpr_size;
+  cfun->has_nonlocal_label = src_cfun->has_nonlocal_label;
+  cfun->stdarg = src_cfun->stdarg;
+  cfun->after_inlining = src_cfun->after_inlining;
+  cfun->can_throw_non_call_exceptions
+    = src_cfun->can_throw_non_call_exceptions;
+  cfun->returns_struct = src_cfun->returns_struct;
+  cfun->returns_pcc_struct = src_cfun->returns_pcc_struct;
+  cfun->after_tree_profile = src_cfun->after_tree_profile;
+  /* Initialize EH data only when the source function has some.  */
+  if (src_cfun->eh)
+    init_eh_for_function ();
+  /* Likewise for SSA data; the new function is then flagged as in SSA.  */
+  if (src_cfun->gimple_df)
+    {
+      init_tree_ssa (cfun);
+      cfun->gimple_df->in_ssa_p = true;
+      init_ssa_operands (cfun);
+    }
+  pop_cfun ();
+}
+
 void
 tree_elem_fn_versioning (tree old_decl, tree new_decl,
 			 VEC(ipa_replace_map_p,gc)* tree_map,
 			 bool update_clones, bitmap args_to_skip,
-			 bool skip_return, bitmap blocks_to_copy,
-			 basic_block new_entry, int vlength)
+			 bool skip_return, bitmap blocks_to_copy ATTRIBUTE_UNUSED,
+			 basic_block new_entry ATTRIBUTE_UNUSED, int vlength, bool masked)
 {
-  struct cgraph_node *old_version_node;
-  struct cgraph_node *new_version_node;
   copy_body_data id;
   tree p;
   unsigned i;
   struct ipa_replace_map *replace_info;
-  basic_block old_entry_block, bb;
   VEC (gimple, heap) *init_stmts = VEC_alloc (gimple, heap, 10);
 
   tree old_current_function_decl = current_function_decl;
@@ -5466,21 +5519,7 @@ 
   gcc_assert (TREE_CODE (old_decl) == FUNCTION_DECL
 	      && TREE_CODE (new_decl) == FUNCTION_DECL);
   DECL_POSSIBLY_INLINED (old_decl) = 1;
-
-  old_version_node = cgraph_get_node (old_decl);
-  gcc_checking_assert (old_version_node);
-  new_version_node = cgraph_get_node (new_decl);
-  gcc_checking_assert (new_version_node);
-
-  if (TREE_TYPE (TREE_TYPE (old_decl)) != void_type_node)
-    {
-      TREE_TYPE (TREE_TYPE (new_decl)) =
-	copy_node (TREE_TYPE (TREE_TYPE (old_decl)));
-      TREE_TYPE (TREE_TYPE (new_decl)) =
-	build_vector_type (TREE_TYPE (TREE_TYPE (new_decl)), vlength);
-    }
   
-  
   /* Copy over debug args.  */
   if (DECL_HAS_DEBUG_ARGS_P (old_decl))
     {
@@ -5502,9 +5541,6 @@ 
   (*debug_hooks->outlining_inline_function) (old_decl);
 
   DECL_ARTIFICIAL (new_decl) = 1;
-  DECL_ABSTRACT_ORIGIN (new_decl) = DECL_ORIGIN (old_decl);
-  DECL_FUNCTION_PERSONALITY (new_decl) = DECL_FUNCTION_PERSONALITY (old_decl);
-
   /* Prepare the data structures for the tree copy.  */
   memset (&id, 0, sizeof (id));
 
@@ -5515,24 +5551,10 @@ 
   id.debug_map = NULL;
   id.src_fn = old_decl;
   id.dst_fn = new_decl;
-  id.src_node = old_version_node;
-  id.dst_node = new_version_node;
+  id.src_node = NULL;
+  id.dst_node = NULL;
   id.src_cfun = DECL_STRUCT_FUNCTION (old_decl);
-  if (id.src_node->ipa_transforms_to_apply)
-    {
-      VEC(ipa_opt_pass,heap) * old_transforms_to_apply =
-	id.dst_node->ipa_transforms_to_apply;
-      unsigned int i;
 
-      id.dst_node->ipa_transforms_to_apply =
-	VEC_copy (ipa_opt_pass, heap, id.src_node->ipa_transforms_to_apply);
-      for (i = 0; i < VEC_length (ipa_opt_pass, old_transforms_to_apply); i++)
-        VEC_safe_push (ipa_opt_pass, heap, id.dst_node->ipa_transforms_to_apply,
-		       VEC_index (ipa_opt_pass,
-		       		  old_transforms_to_apply,
-				  i));
-    }
-
   id.copy_decl = copy_decl_no_change;
   id.transform_call_graph_edges
     = update_clones ? CB_CGE_MOVE_CLONES : CB_CGE_MOVE;
@@ -5541,12 +5563,8 @@ 
   id.transform_lang_insert_block = NULL;
 
   current_function_decl = new_decl;
-  old_entry_block = ENTRY_BLOCK_PTR_FOR_FUNCTION
-    (DECL_STRUCT_FUNCTION (old_decl));
-  initialize_cfun (new_decl, old_decl,
-		   old_entry_block->count);
-  DECL_STRUCT_FUNCTION (new_decl)->gimple_df->ipa_pta
-    = id.src_cfun->gimple_df->ipa_pta;
+  
+  initialize_elem_fn_cfun (new_decl, old_decl);
   push_cfun (DECL_STRUCT_FUNCTION (new_decl));
 
   /* Copy the function's static chain.  */
@@ -5602,7 +5620,8 @@ 
   if (DECL_ARGUMENTS (old_decl) != NULL_TREE)
     DECL_ARGUMENTS (new_decl) =
       elem_fn_copy_arguments_for_versioning (DECL_ARGUMENTS (old_decl), &id,
-					     args_to_skip, &vars, vlength);
+					     args_to_skip, &vars,
+					     vlength, masked);
 
   DECL_INITIAL (new_decl) = remap_blocks (DECL_INITIAL (id.src_fn), &id);
   BLOCK_SUPERCONTEXT (DECL_INITIAL (new_decl)) = new_decl;
@@ -5629,7 +5648,6 @@ 
     {
       tree old_name;
       DECL_RESULT (new_decl) = remap_decl (DECL_RESULT (old_decl), &id);
-      /* bviyer; we are just using 4 for vectorlength just to see if it works */
       if (TREE_TYPE (DECL_RESULT (new_decl)) != void_type_node)
 	{
 	  TREE_TYPE (DECL_RESULT (new_decl)) =
@@ -5638,6 +5656,14 @@ 
 	  DECL_MODE (DECL_RESULT (new_decl)) =
 	    TYPE_MODE (TREE_TYPE (DECL_RESULT (new_decl)));
 	}
+      if (TREE_TYPE (TREE_TYPE (old_decl)) != void_type_node)
+	{
+	  TREE_TYPE (new_decl) = copy_node (TREE_TYPE (old_decl));
+	  TREE_TYPE (TREE_TYPE (new_decl)) =
+	    copy_node (TREE_TYPE (TREE_TYPE (old_decl)));
+	  TREE_TYPE (TREE_TYPE (new_decl)) =
+	    build_vector_type (TREE_TYPE (TREE_TYPE (new_decl)), vlength);
+	}
       lang_hooks.dup_lang_specific_decl (DECL_RESULT (new_decl));
       if (gimple_in_ssa_p (id.src_cfun)
 	  && DECL_BY_REFERENCE (DECL_RESULT (old_decl))
@@ -5650,22 +5676,11 @@ 
 	  set_default_def (DECL_RESULT (new_decl), new_name);
 	}
     }
-
-  /* Copy the Function's body.  */
-  copy_body (&id, old_entry_block->count, REG_BR_PROB_BASE,
-	     ENTRY_BLOCK_PTR, EXIT_BLOCK_PTR, blocks_to_copy, new_entry);
-
+  walk_tree (&DECL_SAVED_TREE (new_decl), copy_tree_body_r, &id, NULL);
   /* Renumber the lexical scoping (non-code) blocks consecutively.  */
   number_blocks (new_decl);
 
-  /* We want to create the BB unconditionally, so that the addition of
-     debug stmts doesn't affect BB count, which may in the end cause
-     codegen differences.  */
-  bb = split_edge (single_succ_edge (ENTRY_BLOCK_PTR));
-  while (VEC_length (gimple, init_stmts))
-    insert_init_stmt (&id, bb, VEC_pop (gimple, init_stmts));
-  update_clone_info (&id);
-
+  
   /* Remap the nonlocal_goto_save_area, if any.  */
   if (cfun->nonlocal_goto_save_area)
     {
@@ -5675,49 +5690,12 @@ 
       wi.info = &id;
       walk_tree (&cfun->nonlocal_goto_save_area, remap_gimple_op_r, &wi, NULL);
     }
-
+  
   /* Clean up.  */
   pointer_map_destroy (id.decl_map);
   if (id.debug_map)
     pointer_map_destroy (id.debug_map);
-  free_dominance_info (CDI_DOMINATORS);
-  free_dominance_info (CDI_POST_DOMINATORS);
 
-  fold_marked_statements (0, id.statements_to_fold);
-  pointer_set_destroy (id.statements_to_fold);
-  fold_cond_expr_cond ();
-  delete_unreachable_blocks_update_callgraph (&id);
-  if (id.dst_node->analyzed)
-    cgraph_rebuild_references ();
-  update_ssa (TODO_update_ssa);
-
-  /* After partial cloning we need to rescale frequencies, so they are
-     within proper range in the cloned function.  */
-  if (new_entry)
-    {
-      struct cgraph_edge *e;
-      rebuild_frequencies ();
-
-      new_version_node->count = ENTRY_BLOCK_PTR->count;
-      for (e = new_version_node->callees; e; e = e->next_callee)
-	{
-	  basic_block bb = gimple_bb (e->call_stmt);
-	  e->frequency = compute_call_stmt_bb_frequency (current_function_decl,
-							 bb);
-	  e->count = bb->count;
-	}
-      for (e = new_version_node->indirect_calls; e; e = e->next_callee)
-	{
-	  basic_block bb = gimple_bb (e->call_stmt);
-	  e->frequency = compute_call_stmt_bb_frequency (current_function_decl,
-							 bb);
-	  e->count = bb->count;
-	}
-    }
-
-  free_dominance_info (CDI_DOMINATORS);
-  free_dominance_info (CDI_POST_DOMINATORS);
-
   gcc_assert (!id.debug_stmts);
   VEC_free (gimple, heap, init_stmts);
   pop_cfun ();
Index: Makefile.in
===================================================================
--- Makefile.in	(revision 187789)
+++ Makefile.in	(working copy)
@@ -1126,7 +1126,7 @@ 
   c-family/c-format.o c-family/c-gimplify.o c-family/c-lex.o \
   c-family/c-omp.o c-family/c-opts.o c-family/c-pch.o \
   c-family/c-ppoutput.o c-family/c-pragma.o c-family/c-pretty-print.o \
-  c-family/c-semantics.o c-family/c-ada-spec.o cilk-spawn.o
+  c-family/c-semantics.o c-family/c-ada-spec.o cilk-spawn.o elem-function.o
 
 # Language-specific object files for C and Objective C.
 C_AND_OBJC_OBJS = attribs.o c-errors.o c-decl.o c-typeck.o \
@@ -1451,7 +1451,7 @@ 
         cilk.o \
         cilk-low.o \
 	array-notation-common.o \
-	elem-function.o \
+	elem-function-common.o \
 	$(out_object_file) \
 	$(EXTRA_OBJS) \
 	$(host_hook_obj)
Index: passes.c
===================================================================
--- passes.c	(revision 187789)
+++ passes.c	(working copy)
@@ -1311,7 +1311,6 @@ 
       NEXT_PASS (pass_lower_vector);
       NEXT_PASS (pass_early_warn_uninitialized);
       NEXT_PASS (pass_rebuild_cgraph_edges);
-      NEXT_PASS (pass_elem_fn);
       NEXT_PASS (pass_inline_parameters);
       NEXT_PASS (pass_early_inline);
       NEXT_PASS (pass_all_early_optimizations);
Index: ChangeLog.cilk
===================================================================
--- ChangeLog.cilk	(revision 187789)
+++ ChangeLog.cilk	(working copy)
@@ -1,3 +1,43 @@ 
+2012-05-23  Balaji V Iyer  <balaji.v.iyer@intel.com>
+
+	* cgraphunit.c (cgraph_decide_is_function_needed): Added a check for
+	a cloned elemental function in the checking assert.
+	* tree-inline.c (elem_fn_add_local_variables): Make vlength as an
+	unused attribute.  Also stopped moving local variables to vector values.
+	(elem_fn_copy_arguments_for_versioning): Added a bool called masked as
+	a new parameter.  Also, added a masked parameter to the cloned function
+	if this bool is set to true.  Also, stopped setting arguments to
+	GIMPLE_REG.
+	(initialize_elem_fn_cfun): New function.
+	(tree_elem_fn_versioning): Removed several information pertaining to
+	maintaining a CFG.  Also added a masked bool parameter to be passed into
+	several local functions.  Called initialize_elem_fn_cfun function.
+	* tree-ssa.c (verify_use): Added a check if current function is
+	an elemental function.
+	* gimplify.c (gimplify_function_tree): Called function to create
+	elemental function when the function is marked appropriately.
+	* elem-function-common.c (find_processor_code): Moved function from
+	elem-function.c.
+	(find_vlength_code): Likewise.
+	(rename_elem_fn): Likewise.
+	(find_suffix): Likewise.
+	(find_elem_fn_parm_type_1): Likewise.
+	(find_elem_fn_parm_type): Likewise.
+	(find_elem_fn_name): Likewise.
+	(extract_elem_fn_values): Likewise.
+	(is_elem_fn): Likewise.
+	* elem-function.c (create_optimize_attribute): Changed sprintf to an
+	if-statement.
+	(find_suffix): Changed a string copy to a string concatenation.
+	(replace_return_with_new_var): New function.
+	(elem_fn_build_array): Likewise.
+	(replace_array_ref_for_vec): Likewise.
+	(fix_elem_fn_return_value): Likewise.
+	(add_elem_fn_loop): Likewise.
+	(add_elem_fn_mask): Likewise.
+	(cg_hacks): Likewise.
+	(elem_fn_create_fn): Likewise.
+	* passes.c (init_optimization_passes): Removed elem_fn_pass.
+
 2012-05-08  Balaji V. Iyer  <balaji.v.iyer@intel.com>
 
 	* cilk.c (expand_builtin_cilk_detach): Added mode type for