From 8938e14442b64634f7af0e0b5a55ebc37c6a8da0 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Wed, 4 Jan 2012 12:37:26 -0800 Subject: [PATCH] Add support for emitting ~generic vectorized C++ code. The compiler now supports an --emit-c++ option, which generates generic vector C++ code. To actually compile this code, the user must provide C++ code that implements a variety of types and operations (e.g. adding two floating-point vector values together, comparing them, etc.). There are two examples of this required code in examples/intrinsics: generic-16.h is a "generic" 16-wide implementation that does everything required with scalar math; it's useful for demonstrating the requirements of the implementation. Then, sse4.h shows a simple implementation of an SSE4 target that maps the emitted function calls to SSE intrinsics. When using these example implementations with the ispc test suite, all but one or two tests pass with gcc and clang on Linux and OSX. There are currently ~10 failures with icc on Linux, and ~50 failures with MSVC 2010. (To be fixed in coming days.) Performance varies: when running the examples through the sse4.h target, some (e.g. the options example) have the same performance as when compiled with --target=sse4 from ispc directly, while noise is 12% slower, rt is 26% slower, and aobench is 2.2x slower. The details of this haven't yet been carefully investigated, but will be in coming days as well. Issue #92. --- Makefile | 10 +- bitcode2cpp.py | 2 + cbackend.cpp | 4342 ++++++++++++++++++++++++++++++ docs/ispc.txt | 60 + examples/intrinsics/generic-16.h | 1428 ++++++++++ examples/intrinsics/sse4.h | 3665 +++++++++++++++++++++++++ ispc.vcxproj | 1 + main.cpp | 49 +- module.cpp | 41 +- module.h | 13 +- opt.cpp | 10 +- 11 files changed, 9594 insertions(+), 27 deletions(-) create mode 100644 cbackend.cpp create mode 100644 examples/intrinsics/generic-16.h create mode 100644 examples/intrinsics/sse4.h diff --git a/Makefile b/Makefile index f2e18543d9b..f9d0cbab4be 100644 --- a/Makefile +++ b/Makefile @@ -57,9 +57,9 @@ YACC=bison -d -v -t ########################################################################### -CXX_SRC=ast.cpp builtins.cpp ctx.cpp decl.cpp expr.cpp func.cpp ispc.cpp \ - llvmutil.cpp main.cpp module.cpp opt.cpp stmt.cpp sym.cpp type.cpp \ - util.cpp +CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \ + ispc.cpp llvmutil.cpp main.cpp module.cpp opt.cpp stmt.cpp sym.cpp \ + type.cpp util.cpp HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \ opt.h stmt.h sym.h type.h util.h TARGETS=avx avx-x2 sse2 sse2-x2 sse4 sse4-x2 generic-4 generic-8 generic-16 @@ -107,6 +107,10 @@ objs/%.o: %.cpp @echo Compiling $< @$(CXX) $(CXXFLAGS) -o $@ -c $< +objs/cbackend.o: cbackend.cpp + @echo Compiling $< + @$(CXX) -fno-rtti -fno-exceptions $(CXXFLAGS) -o $@ -c $< + objs/%.o: objs/%.cpp @echo Compiling $< @$(CXX) $(CXXFLAGS) -o $@ -c $< diff --git a/bitcode2cpp.py b/bitcode2cpp.py index a1a5d2bff66..8c09b216aa4 100755 --- a/bitcode2cpp.py +++ b/bitcode2cpp.py @@ -12,7 +12,9 @@ src=str(sys.argv[1]) target = re.sub("builtins/target-", "", src) +target = re.sub(r"builtins\\target-", "", target) target = re.sub("builtins/", "", target) +target = re.sub(r"builtins\\", "", target) target = re.sub("\.ll$", "", target) target = re.sub("\.c$", "", target) target = re.sub("-", "_", target) diff --git a/cbackend.cpp b/cbackend.cpp new file mode 100644 index 00000000000..4119011721d --- /dev/null +++ b/cbackend.cpp @@ -0,0 +1,4342 @@ +//===-- CBackend.cpp - 
Library for converting LLVM code to C --------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This library converts LLVM code to C code, compilable by GCC and other C +// compilers. +// +//===----------------------------------------------------------------------===// + +#ifdef LLVM_2_9 +#warning "The C++ backend isn't supported when building with LLVM 2.9" +#else + +#include "llvm/CallingConv.h" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Module.h" +#include "llvm/Instructions.h" +#include "llvm/Pass.h" +#include "llvm/PassManager.h" +#include "llvm/Intrinsics.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/InlineAsm.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Analysis/ConstantsScanner.h" +#include "llvm/Analysis/FindUsedTypes.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/Verifier.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/IntrinsicLowering.h" +#include "llvm/Target/Mangler.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCObjectFileInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Support/CallSite.h" +#include "llvm/Support/CFG.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FormattedStream.h" +#include "llvm/Support/GetElementPtrTypeIterator.h" +#include "llvm/Support/InstVisitor.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/Host.h" +#include "llvm/Config/config.h" + +#include +#include +#include +#include +#include +// Some ms header decided to define setjmp as _setjmp, undo this for this file. +#ifdef _MSC_VER +#undef setjmp +#endif +using namespace llvm; + +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetData.h" + +// FIXME: +namespace { + /// TypeFinder - Walk over a module, identifying all of the types that are + /// used by the module. + class TypeFinder { + // To avoid walking constant expressions multiple times and other IR + // objects, we keep several helper maps. + DenseSet VisitedConstants; + DenseSet VisitedTypes; + + std::vector &ArrayTypes; + public: + TypeFinder(std::vector &t) + : ArrayTypes(t) {} + + void run(const Module &M) { + // Get types from global variables. + for (Module::const_global_iterator I = M.global_begin(), + E = M.global_end(); I != E; ++I) { + incorporateType(I->getType()); + if (I->hasInitializer()) + incorporateValue(I->getInitializer()); + } + + // Get types from aliases. + for (Module::const_alias_iterator I = M.alias_begin(), + E = M.alias_end(); I != E; ++I) { + incorporateType(I->getType()); + if (const Value *Aliasee = I->getAliasee()) + incorporateValue(Aliasee); + } + + SmallVector, 4> MDForInst; + + // Get types from functions. 
+ for (Module::const_iterator FI = M.begin(), E = M.end(); FI != E; ++FI) { + incorporateType(FI->getType()); + + for (Function::const_iterator BB = FI->begin(), E = FI->end(); + BB != E;++BB) + for (BasicBlock::const_iterator II = BB->begin(), + E = BB->end(); II != E; ++II) { + const Instruction &I = *II; + // Incorporate the type of the instruction and all its operands. + incorporateType(I.getType()); + for (User::const_op_iterator OI = I.op_begin(), OE = I.op_end(); + OI != OE; ++OI) + incorporateValue(*OI); + + // Incorporate types hiding in metadata. + I.getAllMetadataOtherThanDebugLoc(MDForInst); + for (unsigned i = 0, e = MDForInst.size(); i != e; ++i) + incorporateMDNode(MDForInst[i].second); + MDForInst.clear(); + } + } + + for (Module::const_named_metadata_iterator I = M.named_metadata_begin(), + E = M.named_metadata_end(); I != E; ++I) { + const NamedMDNode *NMD = I; + for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) + incorporateMDNode(NMD->getOperand(i)); + } + } + + private: + void incorporateType(Type *Ty) { + // Check to see if we're already visited this type. + if (!VisitedTypes.insert(Ty).second) + return; + + if (ArrayType *ATy = dyn_cast(Ty)) + ArrayTypes.push_back(ATy); + + // Recursively walk all contained types. + for (Type::subtype_iterator I = Ty->subtype_begin(), + E = Ty->subtype_end(); I != E; ++I) + incorporateType(*I); + } + + /// incorporateValue - This method is used to walk operand lists finding + /// types hiding in constant expressions and other operands that won't be + /// walked in other ways. GlobalValues, basic blocks, instructions, and + /// inst operands are all explicitly enumerated. + void incorporateValue(const Value *V) { + if (const MDNode *M = dyn_cast(V)) + return incorporateMDNode(M); + if (!isa(V) || isa(V)) return; + + // Already visited? + if (!VisitedConstants.insert(V).second) + return; + + // Check this type. + incorporateType(V->getType()); + + // Look in operands for types. + const User *U = cast(V); + for (Constant::const_op_iterator I = U->op_begin(), + E = U->op_end(); I != E;++I) + incorporateValue(*I); + } + + void incorporateMDNode(const MDNode *V) { + + // Already visited? + if (!VisitedConstants.insert(V).second) + return; + + // Look in operands for types. + for (unsigned i = 0, e = V->getNumOperands(); i != e; ++i) + if (Value *Op = V->getOperand(i)) + incorporateValue(Op); + } + }; +} // end anonymous namespace + +static void findUsedArrayTypes(const Module *m, std::vector &t) { + TypeFinder(t).run(*m); +} + +namespace { + class CBEMCAsmInfo : public MCAsmInfo { + public: + CBEMCAsmInfo() { + GlobalPrefix = ""; + PrivateGlobalPrefix = ""; + } + }; + + /// CWriter - This class is the main chunk of code that converts an LLVM + /// module to a C translation unit. + class CWriter : public FunctionPass, public InstVisitor { + formatted_raw_ostream &Out; + IntrinsicLowering *IL; + Mangler *Mang; + LoopInfo *LI; + const Module *TheModule; + const MCAsmInfo* TAsm; + const MCRegisterInfo *MRI; + const MCObjectFileInfo *MOFI; + MCContext *TCtx; + const TargetData* TD; + + std::map FPConstantMap; + std::set intrinsicPrototypesAlreadyGenerated; + std::set ByValParams; + unsigned FPCounter; + unsigned OpaqueCounter; + DenseMap AnonValueNumbers; + unsigned NextAnonValueNumber; + + std::string includeName; + + /// UnnamedStructIDs - This contains a unique ID for each struct that is + /// either anonymous or has no name. 
+ DenseMap UnnamedStructIDs; + DenseMap ArrayIDs; + + public: + static char ID; + explicit CWriter(formatted_raw_ostream &o, const char *incname) + : FunctionPass(ID), Out(o), IL(0), Mang(0), LI(0), + TheModule(0), TAsm(0), MRI(0), MOFI(0), TCtx(0), TD(0), + OpaqueCounter(0), NextAnonValueNumber(0), + includeName(incname ? incname : "generic_defs.h") { + initializeLoopInfoPass(*PassRegistry::getPassRegistry()); + FPCounter = 0; + } + + virtual const char *getPassName() const { return "C backend"; } + + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired(); + AU.setPreservesAll(); + } + + virtual bool doInitialization(Module &M); + + bool runOnFunction(Function &F) { + // Do not codegen any 'available_externally' functions at all, they have + // definitions outside the translation unit. + if (F.hasAvailableExternallyLinkage()) + return false; + + LI = &getAnalysis(); + + // Get rid of intrinsics we can't handle. + lowerIntrinsics(F); + + // Output all floating point constants that cannot be printed accurately. + printFloatingPointConstants(F); + + printFunction(F); + return false; + } + + virtual bool doFinalization(Module &M) { + // Free memory... + delete IL; + delete TD; + delete Mang; + delete TCtx; + delete TAsm; + delete MRI; + delete MOFI; + FPConstantMap.clear(); + ByValParams.clear(); + intrinsicPrototypesAlreadyGenerated.clear(); + UnnamedStructIDs.clear(); + ArrayIDs.clear(); + return false; + } + + raw_ostream &printType(raw_ostream &Out, Type *Ty, + bool isSigned = false, + const std::string &VariableName = "", + bool IgnoreName = false, + const AttrListPtr &PAL = AttrListPtr()); + raw_ostream &printSimpleType(raw_ostream &Out, Type *Ty, + bool isSigned, + const std::string &NameSoFar = ""); + + void printStructReturnPointerFunctionType(raw_ostream &Out, + const AttrListPtr &PAL, + PointerType *Ty); + + std::string getStructName(StructType *ST); + std::string getArrayName(ArrayType *AT); + + /// writeOperandDeref - Print the result of dereferencing the specified + /// operand with '*'. This is equivalent to printing '*' then using + /// writeOperand, but avoids excess syntax in some cases. + void writeOperandDeref(Value *Operand) { + if (isAddressExposed(Operand)) { + // Already something with an address exposed. + writeOperandInternal(Operand); + } else { + Out << "*("; + writeOperand(Operand); + Out << ")"; + } + } + + void writeOperand(Value *Operand, bool Static = false); + void writeInstComputationInline(Instruction &I); + void writeOperandInternal(Value *Operand, bool Static = false); + void writeOperandWithCast(Value* Operand, unsigned Opcode); + void writeOperandWithCast(Value* Operand, const ICmpInst &I); + bool writeInstructionCast(const Instruction &I); + + void writeMemoryAccess(Value *Operand, Type *OperandType, + bool IsVolatile, unsigned Alignment); + + private : + std::string InterpretASMConstraint(InlineAsm::ConstraintInfo& c); + + void lowerIntrinsics(Function &F); + /// Prints the definition of the intrinsic function F. Supports the + /// intrinsics which need to be explicitly defined in the CBackend. 
+ void printIntrinsicDefinition(const Function &F, raw_ostream &Out); + + void printModuleTypes(); + void printContainedStructs(Type *Ty, SmallPtrSet &); + void printContainedArrays(ArrayType *ATy, SmallPtrSet &); + void printFloatingPointConstants(Function &F); + void printFloatingPointConstants(const Constant *C); + void printFunctionSignature(const Function *F, bool Prototype); + + void printFunction(Function &); + void printBasicBlock(BasicBlock *BB); + void printLoop(Loop *L); + + bool printCast(unsigned opcode, Type *SrcTy, Type *DstTy); + void printConstant(Constant *CPV, bool Static); + void printConstantWithCast(Constant *CPV, unsigned Opcode); + bool printConstExprCast(const ConstantExpr *CE, bool Static); + void printConstantArray(ConstantArray *CPA, bool Static); + void printConstantVector(ConstantVector *CV, bool Static); + + /// isAddressExposed - Return true if the specified value's name needs to + /// have its address taken in order to get a C value of the correct type. + /// This happens for global variables, byval parameters, and direct allocas. + bool isAddressExposed(const Value *V) const { + if (const Argument *A = dyn_cast(V)) + return ByValParams.count(A); + return isa(V) || isDirectAlloca(V); + } + + // isInlinableInst - Attempt to inline instructions into their uses to build + // trees as much as possible. To do this, we have to consistently decide + // what is acceptable to inline, so that variable declarations don't get + // printed and an extra copy of the expr is not emitted. + // + static bool isInlinableInst(const Instruction &I) { + // Always inline cmp instructions, even if they are shared by multiple + // expressions. GCC generates horrible code if we don't. + if (isa(I)) + return true; + + // Must be an expression, must be used exactly once. If it is dead, we + // emit it inline where it would go. + if (I.getType() == Type::getVoidTy(I.getContext()) || !I.hasOneUse() || + isa(I) || isa(I) || isa(I) || + isa(I) || isa(I) || isa(I) || + isa(I) || isa(I)) + // Don't inline a load across a store or other bad things! + return false; + + // Must not be used in inline asm, extractelement, or shufflevector. + if (I.hasOneUse()) { + const Instruction &User = cast(*I.use_back()); + if (isInlineAsm(User) || isa(User) || + isa(User) || isa(User) || + isa(User)) + return false; + } + + // Only inline instruction it if it's use is in the same BB as the inst. + return I.getParent() == cast(I.use_back())->getParent(); + } + + // isDirectAlloca - Define fixed sized allocas in the entry block as direct + // variables which are accessed with the & operator. This causes GCC to + // generate significantly better code than to emit alloca calls directly. + // + static const AllocaInst *isDirectAlloca(const Value *V) { + const AllocaInst *AI = dyn_cast(V); + if (!AI) return 0; + if (AI->isArrayAllocation()) + return 0; // FIXME: we can also inline fixed size array allocas! + if (AI->getParent() != &AI->getParent()->getParent()->getEntryBlock()) + return 0; + return AI; + } + + // isInlineAsm - Check if the instruction is a call to an inline asm chunk. 
+ static bool isInlineAsm(const Instruction& I) { + if (const CallInst *CI = dyn_cast(&I)) + return isa(CI->getCalledValue()); + return false; + } + + // Instruction visitation functions + friend class InstVisitor; + + void visitReturnInst(ReturnInst &I); + void visitBranchInst(BranchInst &I); + void visitSwitchInst(SwitchInst &I); + void visitIndirectBrInst(IndirectBrInst &I); + void visitInvokeInst(InvokeInst &I) { + llvm_unreachable("Lowerinvoke pass didn't work!"); + } + void visitUnwindInst(UnwindInst &I) { + llvm_unreachable("Lowerinvoke pass didn't work!"); + } + void visitResumeInst(ResumeInst &I) { + llvm_unreachable("DwarfEHPrepare pass didn't work!"); + } + void visitUnreachableInst(UnreachableInst &I); + + void visitPHINode(PHINode &I); + void visitBinaryOperator(Instruction &I); + void visitICmpInst(ICmpInst &I); + void visitFCmpInst(FCmpInst &I); + + void visitCastInst (CastInst &I); + void visitSelectInst(SelectInst &I); + void visitCallInst (CallInst &I); + void visitInlineAsm(CallInst &I); + bool visitBuiltinCall(CallInst &I, Intrinsic::ID ID, bool &WroteCallee); + + void visitAllocaInst(AllocaInst &I); + void visitLoadInst (LoadInst &I); + void visitStoreInst (StoreInst &I); + void visitGetElementPtrInst(GetElementPtrInst &I); + void visitVAArgInst (VAArgInst &I); + + void visitInsertElementInst(InsertElementInst &I); + void visitExtractElementInst(ExtractElementInst &I); + void visitShuffleVectorInst(ShuffleVectorInst &SVI); + + void visitInsertValueInst(InsertValueInst &I); + void visitExtractValueInst(ExtractValueInst &I); + + void visitAtomicRMWInst(AtomicRMWInst &I); + void visitAtomicCmpXchgInst(AtomicCmpXchgInst &I); + + void visitInstruction(Instruction &I) { +#ifndef NDEBUG + errs() << "C Writer does not know about " << I; +#endif + llvm_unreachable(0); + } + + void outputLValue(Instruction *I) { + Out << " " << GetValueName(I) << " = "; + } + + bool isGotoCodeNecessary(BasicBlock *From, BasicBlock *To); + void printPHICopiesForSuccessor(BasicBlock *CurBlock, + BasicBlock *Successor, unsigned Indent); + void printBranchToBlock(BasicBlock *CurBlock, BasicBlock *SuccBlock, + unsigned Indent); + void printGEPExpression(Value *Ptr, gep_type_iterator I, + gep_type_iterator E, bool Static); + + std::string GetValueName(const Value *Operand); + }; +} + +char CWriter::ID = 0; + + + +static std::string CBEMangle(const std::string &S) { + std::string Result; + + for (unsigned i = 0, e = S.size(); i != e; ++i) + if (isalnum(S[i]) || S[i] == '_') { + Result += S[i]; + } else { + Result += '_'; + Result += 'A'+(S[i]&15); + Result += 'A'+((S[i]>>4)&15); + Result += '_'; + } + return Result; +} + +std::string CWriter::getStructName(StructType *ST) { + if (!ST->isLiteral() && !ST->getName().empty()) + return CBEMangle("l_"+ST->getName().str()); + + return "l_unnamed_" + utostr(UnnamedStructIDs[ST]); +} + +std::string CWriter::getArrayName(ArrayType *AT) { + return "l_array_" + utostr(ArrayIDs[AT]); +} + + +/// printStructReturnPointerFunctionType - This is like printType for a struct +/// return type, except, instead of printing the type as void (*)(Struct*, ...) +/// print it as "Struct (*)(...)", for struct return functions. 
+void CWriter::printStructReturnPointerFunctionType(raw_ostream &Out, + const AttrListPtr &PAL, + PointerType *TheTy) { + FunctionType *FTy = cast(TheTy->getElementType()); + std::string tstr; + raw_string_ostream FunctionInnards(tstr); + FunctionInnards << " (*) ("; + bool PrintedType = false; + + FunctionType::param_iterator I = FTy->param_begin(), E = FTy->param_end(); + Type *RetTy = cast(*I)->getElementType(); + unsigned Idx = 1; + for (++I, ++Idx; I != E; ++I, ++Idx) { + if (PrintedType) + FunctionInnards << ", "; + Type *ArgTy = *I; + if (PAL.paramHasAttr(Idx, Attribute::ByVal)) { + assert(ArgTy->isPointerTy()); + ArgTy = cast(ArgTy)->getElementType(); + } + printType(FunctionInnards, ArgTy, + /*isSigned=*/PAL.paramHasAttr(Idx, Attribute::SExt), ""); + PrintedType = true; + } + if (FTy->isVarArg()) { + if (!PrintedType) + FunctionInnards << " int"; //dummy argument for empty vararg functs + FunctionInnards << ", ..."; + } else if (!PrintedType) { + FunctionInnards << "void"; + } + FunctionInnards << ')'; + printType(Out, RetTy, + /*isSigned=*/PAL.paramHasAttr(0, Attribute::SExt), FunctionInnards.str()); +} + +raw_ostream & +CWriter::printSimpleType(raw_ostream &Out, Type *Ty, bool isSigned, + const std::string &NameSoFar) { + assert((Ty->isPrimitiveType() || Ty->isIntegerTy() || Ty->isVectorTy()) && + "Invalid type for printSimpleType"); + switch (Ty->getTypeID()) { + case Type::VoidTyID: return Out << "void " << NameSoFar; + case Type::IntegerTyID: { + unsigned NumBits = cast(Ty)->getBitWidth(); + if (NumBits == 1) + return Out << "bool " << NameSoFar; + else if (NumBits <= 8) + return Out << (isSigned?"":"u") << "int8_t " << NameSoFar; + else if (NumBits <= 16) + return Out << (isSigned?"":"u") << "int16_t " << NameSoFar; + else if (NumBits <= 32) + return Out << (isSigned?"":"u") << "int32_t " << NameSoFar; + else if (NumBits <= 64) + return Out << (isSigned?"":"u") << "int64_t "<< NameSoFar; + else { + assert(NumBits <= 128 && "Bit widths > 128 not implemented yet"); + return Out << (isSigned?"llvmInt128":"llvmUInt128") << " " << NameSoFar; + } + } + case Type::FloatTyID: return Out << "float " << NameSoFar; + case Type::DoubleTyID: return Out << "double " << NameSoFar; + // Lacking emulation of FP80 on PPC, etc., we assume whichever of these is + // present matches host 'long double'. 
+ case Type::X86_FP80TyID: + case Type::PPC_FP128TyID: + case Type::FP128TyID: return Out << "long double " << NameSoFar; + + case Type::X86_MMXTyID: + return printSimpleType(Out, Type::getInt32Ty(Ty->getContext()), isSigned, + " __attribute__((vector_size(64))) " + NameSoFar); + + case Type::VectorTyID: { + VectorType *VTy = cast(Ty); +#if 1 + const char *suffix = NULL; + const Type *eltTy = VTy->getElementType(); + if (eltTy->isFloatTy()) + suffix = "f"; + else if (eltTy->isDoubleTy()) + suffix = "d"; + else { + assert(eltTy->isIntegerTy()); + switch (eltTy->getPrimitiveSizeInBits()) { + case 1: + suffix = "i1"; + break; + case 8: + suffix = "i8"; + break; + case 16: + suffix = "i16"; + break; + case 32: + suffix = "i32"; + break; + case 64: + suffix = "i64"; + break; + default: + report_fatal_error("Only integer types of size 8/16/32/64 are " + "supported by the C++ backend."); + } + } + + return Out << "__vec" << VTy->getNumElements() << "_" << suffix << " " << + NameSoFar; +#else + return printSimpleType(Out, VTy->getElementType(), isSigned, + " __attribute__((vector_size(" + + utostr(TD->getTypeAllocSize(VTy)) + " ))) " + NameSoFar); +#endif + } + + default: +#ifndef NDEBUG + errs() << "Unknown primitive type: " << *Ty << "\n"; +#endif + llvm_unreachable(0); + } +} + +// Pass the Type* and the variable name and this prints out the variable +// declaration. +// +raw_ostream &CWriter::printType(raw_ostream &Out, Type *Ty, + bool isSigned, const std::string &NameSoFar, + bool IgnoreName, const AttrListPtr &PAL) { + if (Ty->isPrimitiveType() || Ty->isIntegerTy() || Ty->isVectorTy()) { + printSimpleType(Out, Ty, isSigned, NameSoFar); + return Out; + } + + switch (Ty->getTypeID()) { + case Type::FunctionTyID: { + FunctionType *FTy = cast(Ty); + std::string tstr; + raw_string_ostream FunctionInnards(tstr); + FunctionInnards << " (" << NameSoFar << ") ("; + unsigned Idx = 1; + for (FunctionType::param_iterator I = FTy->param_begin(), + E = FTy->param_end(); I != E; ++I) { + Type *ArgTy = *I; + if (PAL.paramHasAttr(Idx, Attribute::ByVal)) { + assert(ArgTy->isPointerTy()); + ArgTy = cast(ArgTy)->getElementType(); + } + if (I != FTy->param_begin()) + FunctionInnards << ", "; + printType(FunctionInnards, ArgTy, + /*isSigned=*/PAL.paramHasAttr(Idx, Attribute::SExt), ""); + ++Idx; + } + if (FTy->isVarArg()) { + if (!FTy->getNumParams()) + FunctionInnards << " int"; //dummy argument for empty vaarg functs + FunctionInnards << ", ..."; + } else if (!FTy->getNumParams()) { + FunctionInnards << "void"; + } + FunctionInnards << ')'; + printType(Out, FTy->getReturnType(), + /*isSigned=*/PAL.paramHasAttr(0, Attribute::SExt), FunctionInnards.str()); + return Out; + } + case Type::StructTyID: { + StructType *STy = cast(Ty); + + // Check to see if the type is named. 
+ if (!IgnoreName) + return Out << getStructName(STy) << ' ' << NameSoFar; + + Out << "struct " << NameSoFar << " {\n"; + + // print initialization func + if (STy->getNumElements() > 0) { + Out << " static " << NameSoFar << " init("; + unsigned Idx = 0; + for (StructType::element_iterator I = STy->element_begin(), + E = STy->element_end(); I != E; ++I, ++Idx) { + char buf[64]; + sprintf(buf, "v%d", Idx); + printType(Out, *I, false, buf); + if (Idx + 1 < STy->getNumElements()) + Out << ", "; + } + Out << ") {\n"; + Out << " " << NameSoFar << " ret;\n"; + for (Idx = 0; Idx < STy->getNumElements(); ++Idx) + Out << " ret.field" << Idx << " = v" << Idx << ";\n"; + Out << " return ret;\n"; + Out << " }\n"; + } + + unsigned Idx = 0; + for (StructType::element_iterator I = STy->element_begin(), + E = STy->element_end(); I != E; ++I) { + Out << " "; + printType(Out, *I, false, "field" + utostr(Idx++)); + Out << ";\n"; + } + Out << '}'; + if (STy->isPacked()) + Out << " __attribute__ ((packed))"; + return Out; + } + + case Type::PointerTyID: { + PointerType *PTy = cast(Ty); + std::string ptrName = "*" + NameSoFar; + + if (PTy->getElementType()->isArrayTy() || + PTy->getElementType()->isVectorTy()) + ptrName = "(" + ptrName + ")"; + + if (!PAL.isEmpty()) + // Must be a function ptr cast! + return printType(Out, PTy->getElementType(), false, ptrName, true, PAL); + return printType(Out, PTy->getElementType(), false, ptrName); + } + + case Type::ArrayTyID: { + ArrayType *ATy = cast(Ty); + + // Check to see if the type is named. + if (!IgnoreName) + return Out << getArrayName(ATy) << ' ' << NameSoFar; + + unsigned NumElements = (unsigned)ATy->getNumElements(); + if (NumElements == 0) NumElements = 1; + // Arrays are wrapped in structs to allow them to have normal + // value semantics (avoiding the array "decay"). + Out << "struct " << NameSoFar << " {\n"; + // init func + Out << " static " << NameSoFar << " init("; + for (unsigned Idx = 0; Idx < NumElements; ++Idx) { + char buf[64]; + sprintf(buf, "v%d", Idx); + printType(Out, ATy->getElementType(), false, buf); + if (Idx + 1 < NumElements) + Out << ", "; + } + Out << ") {\n"; + Out << " " << NameSoFar << " ret;\n"; + for (unsigned Idx = 0; Idx < NumElements; ++Idx) + Out << " ret.array[" << Idx << "] = v" << Idx << ";\n"; + Out << " return ret;\n"; + Out << " }\n "; + + printType(Out, ATy->getElementType(), false, + "array[" + utostr(NumElements) + "]"); + return Out << ";\n} "; + } + + default: + llvm_unreachable("Unhandled case in getTypeProps!"); + } + + return Out; +} + +void CWriter::printConstantArray(ConstantArray *CPA, bool Static) { + + // As a special case, print the array as a string if it is an array of + // ubytes or an array of sbytes with positive values. + // + Type *ETy = CPA->getType()->getElementType(); + // MMP: this looks like a bug: both sides of the || are the same + bool isString = (ETy == Type::getInt8Ty(CPA->getContext()) || + ETy == Type::getInt8Ty(CPA->getContext())); + + // Make sure the last character is a null char, as automatically added by C + if (isString && (CPA->getNumOperands() == 0 || + !cast(*(CPA->op_end()-1))->isNullValue())) + isString = false; + + if (isString) { + Out << '\"'; + // Keep track of whether the last number was a hexadecimal escape. 
+ bool LastWasHex = false; + + // Do not include the last character, which we know is null + for (unsigned i = 0, e = CPA->getNumOperands()-1; i != e; ++i) { + unsigned char C = (unsigned char)(cast(CPA->getOperand(i))->getZExtValue()); + + // Print it out literally if it is a printable character. The only thing + // to be careful about is when the last letter output was a hex escape + // code, in which case we have to be careful not to print out hex digits + // explicitly (the C compiler thinks it is a continuation of the previous + // character, sheesh...) + // + if (isprint(C) && (!LastWasHex || !isxdigit(C))) { + LastWasHex = false; + if (C == '"' || C == '\\') + Out << "\\" << (char)C; + else + Out << (char)C; + } else { + LastWasHex = false; + switch (C) { + case '\n': Out << "\\n"; break; + case '\t': Out << "\\t"; break; + case '\r': Out << "\\r"; break; + case '\v': Out << "\\v"; break; + case '\a': Out << "\\a"; break; + case '\"': Out << "\\\""; break; + case '\'': Out << "\\\'"; break; + default: + Out << "\\x"; + Out << (char)(( C/16 < 10) ? ( C/16 +'0') : ( C/16 -10+'A')); + Out << (char)(((C&15) < 10) ? ((C&15)+'0') : ((C&15)-10+'A')); + LastWasHex = true; + break; + } + } + } + Out << '\"'; + } else { + Out << '{'; + if (CPA->getNumOperands()) { + Out << ' '; + printConstant(cast(CPA->getOperand(0)), Static); + for (unsigned i = 1, e = CPA->getNumOperands(); i != e; ++i) { + Out << ", "; + printConstant(cast(CPA->getOperand(i)), Static); + } + } + Out << " }"; + } +} + +void CWriter::printConstantVector(ConstantVector *CP, bool Static) { + if (CP->getNumOperands()) { + Out << ' '; + printConstant(cast(CP->getOperand(0)), Static); + for (unsigned i = 1, e = CP->getNumOperands(); i != e; ++i) { + Out << ", "; + printConstant(cast(CP->getOperand(i)), Static); + } + } +} + +// isFPCSafeToPrint - Returns true if we may assume that CFP may be written out +// textually as a double (rather than as a reference to a stack-allocated +// variable). We decide this by converting CFP to a string and back into a +// double, and then checking whether the conversion results in a bit-equal +// double to the original value of CFP. This depends on us and the target C +// compiler agreeing on the conversion process (which is pretty likely since we +// only deal in IEEE FP). +// +static bool isFPCSafeToPrint(const ConstantFP *CFP) { + bool ignored; + // Do long doubles in hex for now. + if (CFP->getType() != Type::getFloatTy(CFP->getContext()) && + CFP->getType() != Type::getDoubleTy(CFP->getContext())) + return false; + APFloat APF = APFloat(CFP->getValueAPF()); // copy + if (CFP->getType() == Type::getFloatTy(CFP->getContext())) + APF.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &ignored); +#if HAVE_PRINTF_A && ENABLE_CBE_PRINTF_A + char Buffer[100]; + sprintf(Buffer, "%a", APF.convertToDouble()); + if (!strncmp(Buffer, "0x", 2) || + !strncmp(Buffer, "-0x", 3) || + !strncmp(Buffer, "+0x", 3)) + return APF.bitwiseIsEqual(APFloat(atof(Buffer))); + return false; +#else + std::string StrVal = ftostr(APF); + + while (StrVal[0] == ' ') + StrVal.erase(StrVal.begin()); + + // Check to make sure that the stringized number is not some string like "Inf" + // or NaN. Check that the string matches the "[-+]?[0-9]" regex. + if ((StrVal[0] >= '0' && StrVal[0] <= '9') || + ((StrVal[0] == '-' || StrVal[0] == '+') && + (StrVal[1] >= '0' && StrVal[1] <= '9'))) + // Reparse stringized version! 
+ return APF.bitwiseIsEqual(APFloat(atof(StrVal.c_str()))); + return false; +#endif +} + +/// Print out the casting for a cast operation. This does the double casting +/// necessary for conversion to the destination type, if necessary. +/// Return value indicates whether a closing paren is needed. +/// @brief Print a cast +bool CWriter::printCast(unsigned opc, Type *SrcTy, Type *DstTy) { + if (isa(DstTy)) { + assert(isa(SrcTy)); + switch (opc) { + case Instruction::UIToFP: Out << "__cast_uitofp("; break; + case Instruction::SIToFP: Out << "__cast_sitofp("; break; + case Instruction::IntToPtr: llvm_unreachable("Invalid vector cast"); + case Instruction::Trunc: Out << "__cast_trunc("; break; + case Instruction::BitCast: Out << "__cast_bits("; break; + case Instruction::FPExt: Out << "__cast_fpext("; break; + case Instruction::FPTrunc: Out << "__cast_fptrunc("; break; + case Instruction::ZExt: Out << "__cast_zext("; break; + case Instruction::PtrToInt: llvm_unreachable("Invalid vector cast"); + case Instruction::FPToUI: Out << "__cast_fptoui("; break; + case Instruction::SExt: Out << "__cast_sext("; break; + case Instruction::FPToSI: Out << "__cast_fptosi("; break; + default: + llvm_unreachable("Invalid cast opcode"); + } + + // print a call to the constructor for the destination type for the + // first arg; this bogus first parameter is only used to convey the + // desired return type to the callee. + printType(Out, DstTy); + Out << "(), "; + + return true; + } + + // Print the destination type cast + switch (opc) { + case Instruction::BitCast: { + if (DstTy->isPointerTy()) { + Out << '('; + printType(Out, DstTy); + Out << ')'; + break; + } + else { + Out << "__cast_bits(("; + printType(Out, DstTy); + Out << ")0, "; + return true; + } + } + case Instruction::UIToFP: + case Instruction::SIToFP: + case Instruction::IntToPtr: + case Instruction::Trunc: + case Instruction::FPExt: + case Instruction::FPTrunc: // For these the DstTy sign doesn't matter + Out << '('; + printType(Out, DstTy); + Out << ')'; + break; + case Instruction::ZExt: + case Instruction::PtrToInt: + case Instruction::FPToUI: // For these, make sure we get an unsigned dest + Out << '('; + printSimpleType(Out, DstTy, false); + Out << ')'; + break; + case Instruction::SExt: + case Instruction::FPToSI: // For these, make sure we get a signed dest + Out << '('; + printSimpleType(Out, DstTy, true); + Out << ')'; + break; + default: + llvm_unreachable("Invalid cast opcode"); + } + + // Print the source type cast + switch (opc) { + case Instruction::UIToFP: + case Instruction::ZExt: + Out << '('; + printSimpleType(Out, SrcTy, false); + Out << ')'; + break; + case Instruction::SIToFP: + case Instruction::SExt: + Out << '('; + printSimpleType(Out, SrcTy, true); + Out << ')'; + break; + case Instruction::IntToPtr: + case Instruction::PtrToInt: + // Avoid "cast to pointer from integer of different size" warnings + Out << "(unsigned long)"; + break; + case Instruction::Trunc: + case Instruction::BitCast: + case Instruction::FPExt: + case Instruction::FPTrunc: + case Instruction::FPToSI: + case Instruction::FPToUI: + break; // These don't need a source cast. + default: + llvm_unreachable("Invalid cast opcode"); + break; + } + return false; +} + +// printConstant - The LLVM Constant to C Constant converter. 
+void CWriter::printConstant(Constant *CPV, bool Static) { + if (const ConstantExpr *CE = dyn_cast(CPV)) { + if (isa(CPV->getType())) { + assert(CE->getOpcode() == Instruction::BitCast); + ConstantExpr *Op = dyn_cast(CE->getOperand(0)); + assert(Op && Op->getOpcode() == Instruction::BitCast); + assert(isa(Op->getOperand(0)->getType())); + + Out << "(__cast_bits("; + printType(Out, CE->getType()); + Out << "(), "; + printConstant(Op->getOperand(0), Static); + Out << "))"; + return; + } + switch (CE->getOpcode()) { + case Instruction::Trunc: + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPTrunc: + case Instruction::FPExt: + case Instruction::UIToFP: + case Instruction::SIToFP: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::BitCast: { + if (CE->getOpcode() == Instruction::BitCast && + CE->getType()->isPointerTy() == false) { + Out << "__cast_bits(("; + printType(Out, CE->getType()); + Out << ")0, "; + printConstant(CE->getOperand(0), Static); + Out << ")"; + return; + } + + Out << "("; + bool closeParen = printCast(CE->getOpcode(), CE->getOperand(0)->getType(), + CE->getType()); + if (CE->getOpcode() == Instruction::SExt && + CE->getOperand(0)->getType() == Type::getInt1Ty(CPV->getContext())) { + // Make sure we really sext from bool here by subtracting from 0 + Out << "0-"; + } + printConstant(CE->getOperand(0), Static); + if (CE->getType() == Type::getInt1Ty(CPV->getContext()) && + (CE->getOpcode() == Instruction::Trunc || + CE->getOpcode() == Instruction::FPToUI || + CE->getOpcode() == Instruction::FPToSI || + CE->getOpcode() == Instruction::PtrToInt)) { + // Make sure we really truncate to bool here by anding with 1 + Out << "&1u"; + } + Out << ')'; + if (closeParen) + Out << ')'; + return; + } + case Instruction::GetElementPtr: + assert(!isa(CPV->getType())); + Out << "("; + printGEPExpression(CE->getOperand(0), gep_type_begin(CPV), + gep_type_end(CPV), Static); + Out << ")"; + return; + case Instruction::Select: + assert(!isa(CPV->getType())); + Out << '('; + printConstant(CE->getOperand(0), Static); + Out << '?'; + printConstant(CE->getOperand(1), Static); + Out << ':'; + printConstant(CE->getOperand(2), Static); + Out << ')'; + return; + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::SDiv: + case Instruction::UDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + case Instruction::ICmp: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + { + assert(!isa(CPV->getType())); + Out << '('; + bool NeedsClosingParens = printConstExprCast(CE, Static); + printConstantWithCast(CE->getOperand(0), CE->getOpcode()); + switch (CE->getOpcode()) { + case Instruction::Add: + case Instruction::FAdd: Out << " + "; break; + case Instruction::Sub: + case Instruction::FSub: Out << " - "; break; + case Instruction::Mul: + case Instruction::FMul: Out << " * "; break; + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: Out << " % "; break; + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: Out << " / "; break; + case Instruction::And: Out << " & "; break; + case Instruction::Or: Out << " | "; break; + case Instruction::Xor: Out << " ^ "; break; + case 
Instruction::Shl: Out << " << "; break; + case Instruction::LShr: + case Instruction::AShr: Out << " >> "; break; + case Instruction::ICmp: + switch (CE->getPredicate()) { + case ICmpInst::ICMP_EQ: Out << " == "; break; + case ICmpInst::ICMP_NE: Out << " != "; break; + case ICmpInst::ICMP_SLT: + case ICmpInst::ICMP_ULT: Out << " < "; break; + case ICmpInst::ICMP_SLE: + case ICmpInst::ICMP_ULE: Out << " <= "; break; + case ICmpInst::ICMP_SGT: + case ICmpInst::ICMP_UGT: Out << " > "; break; + case ICmpInst::ICMP_SGE: + case ICmpInst::ICMP_UGE: Out << " >= "; break; + default: llvm_unreachable("Illegal ICmp predicate"); + } + break; + default: llvm_unreachable("Illegal opcode here!"); + } + printConstantWithCast(CE->getOperand(1), CE->getOpcode()); + if (NeedsClosingParens) + Out << "))"; + Out << ')'; + return; + } + case Instruction::FCmp: { + assert(!isa(CPV->getType())); + Out << '('; + bool NeedsClosingParens = printConstExprCast(CE, Static); + if (CE->getPredicate() == FCmpInst::FCMP_FALSE) + Out << "0"; + else if (CE->getPredicate() == FCmpInst::FCMP_TRUE) + Out << "1"; + else { + const char* op = 0; + switch (CE->getPredicate()) { + default: llvm_unreachable("Illegal FCmp predicate"); + case FCmpInst::FCMP_ORD: op = "ord"; break; + case FCmpInst::FCMP_UNO: op = "uno"; break; + case FCmpInst::FCMP_UEQ: op = "ueq"; break; + case FCmpInst::FCMP_UNE: op = "une"; break; + case FCmpInst::FCMP_ULT: op = "ult"; break; + case FCmpInst::FCMP_ULE: op = "ule"; break; + case FCmpInst::FCMP_UGT: op = "ugt"; break; + case FCmpInst::FCMP_UGE: op = "uge"; break; + case FCmpInst::FCMP_OEQ: op = "oeq"; break; + case FCmpInst::FCMP_ONE: op = "one"; break; + case FCmpInst::FCMP_OLT: op = "olt"; break; + case FCmpInst::FCMP_OLE: op = "ole"; break; + case FCmpInst::FCMP_OGT: op = "ogt"; break; + case FCmpInst::FCMP_OGE: op = "oge"; break; + } + Out << "llvm_fcmp_" << op << "("; + printConstantWithCast(CE->getOperand(0), CE->getOpcode()); + Out << ", "; + printConstantWithCast(CE->getOperand(1), CE->getOpcode()); + Out << ")"; + } + if (NeedsClosingParens) + Out << "))"; + Out << ')'; + return; + } + default: +#ifndef NDEBUG + errs() << "CWriter Error: Unhandled constant expression: " + << *CE << "\n"; +#endif + llvm_unreachable(0); + } + } else if (isa(CPV) && CPV->getType()->isSingleValueType()) { + if (CPV->getType()->isVectorTy()) { + printType(Out, CPV->getType()); + Out << "( /* UNDEF */)"; + return; + } + + Out << "(("; + printType(Out, CPV->getType()); // sign doesn't matter + Out << ")/*UNDEF*/"; + Out << "0)"; + return; + } + + if (ConstantInt *CI = dyn_cast(CPV)) { + Type* Ty = CI->getType(); + if (Ty == Type::getInt1Ty(CPV->getContext())) + Out << (CI->getZExtValue() ? '1' : '0'); + else if (Ty == Type::getInt32Ty(CPV->getContext())) + Out << CI->getZExtValue() << 'u'; + else if (Ty->getPrimitiveSizeInBits() > 32) { + assert(Ty->getPrimitiveSizeInBits() == 64); + Out << CI->getZExtValue() << "ull"; + } + else { + Out << "(("; + printSimpleType(Out, Ty, false) << ')'; + if (CI->isMinValue(true)) + Out << CI->getZExtValue() << 'u'; + else + Out << CI->getSExtValue(); + Out << ')'; + } + return; + } + + switch (CPV->getType()->getTypeID()) { + case Type::FloatTyID: + case Type::DoubleTyID: + case Type::X86_FP80TyID: + case Type::PPC_FP128TyID: + case Type::FP128TyID: { + ConstantFP *FPC = cast(CPV); + std::map::iterator I = FPConstantMap.find(FPC); + if (I != FPConstantMap.end()) { + // Because of FP precision problems we must load from a stack allocated + // value that holds the value in hex. 
+ Out << "(*(" << (FPC->getType() == Type::getFloatTy(CPV->getContext()) ? + "float" : + FPC->getType() == Type::getDoubleTy(CPV->getContext()) ? + "double" : + "long double") + << "*)&FPConstant" << I->second << ')'; + } else { + double V; + if (FPC->getType() == Type::getFloatTy(CPV->getContext())) + V = FPC->getValueAPF().convertToFloat(); + else if (FPC->getType() == Type::getDoubleTy(CPV->getContext())) + V = FPC->getValueAPF().convertToDouble(); + else { + // Long double. Convert the number to double, discarding precision. + // This is not awesome, but it at least makes the CBE output somewhat + // useful. + APFloat Tmp = FPC->getValueAPF(); + bool LosesInfo; + Tmp.convert(APFloat::IEEEdouble, APFloat::rmTowardZero, &LosesInfo); + V = Tmp.convertToDouble(); + } + + if (IsNAN(V)) { + // The value is NaN + + // FIXME the actual NaN bits should be emitted. + // The prefix for a quiet NaN is 0x7FF8. For a signalling NaN, + // it's 0x7ff4. + const unsigned long QuietNaN = 0x7ff8UL; + //const unsigned long SignalNaN = 0x7ff4UL; + + // We need to grab the first part of the FP # + char Buffer[100]; + + uint64_t ll = DoubleToBits(V); + sprintf(Buffer, "0x%llx", static_cast(ll)); + + std::string Num(&Buffer[0], &Buffer[6]); + unsigned long Val = strtoul(Num.c_str(), 0, 16); + + if (FPC->getType() == Type::getFloatTy(FPC->getContext())) + Out << "LLVM_NAN" << (Val == QuietNaN ? "" : "S") << "F(\"" + << Buffer << "\") /*nan*/ "; + else + Out << "LLVM_NAN" << (Val == QuietNaN ? "" : "S") << "(\"" + << Buffer << "\") /*nan*/ "; + } else if (IsInf(V)) { + // The value is Inf + if (V < 0) Out << '-'; + Out << "LLVM_INF" << + (FPC->getType() == Type::getFloatTy(FPC->getContext()) ? "F" : "") + << " /*inf*/ "; + } else { + std::string Num; +#if HAVE_PRINTF_A && ENABLE_CBE_PRINTF_A + // Print out the constant as a floating point number. + char Buffer[100]; + sprintf(Buffer, "%a", V); + Num = Buffer; +#else + Num = ftostr(FPC->getValueAPF()); +#endif + Out << Num; + } + } + break; + } + + case Type::ArrayTyID: + if (Static) + // arrays are wrapped in structs... + Out << "{ "; + else { + // call init func of the struct it's wrapped in... + printType(Out, CPV->getType()); + Out << "::init("; + } + if (ConstantArray *CA = dyn_cast(CPV)) { + printConstantArray(CA, Static); + } else { + assert(isa(CPV) || isa(CPV)); + ArrayType *AT = cast(CPV->getType()); + if (AT->getNumElements()) { + Out << ' '; + Constant *CZ = Constant::getNullValue(AT->getElementType()); + printConstant(CZ, Static); + for (unsigned i = 1, e = (unsigned)AT->getNumElements(); i != e; ++i) { + Out << ", "; + printConstant(CZ, Static); + } + } + } + if (Static) + Out << " }"; + else + Out << ")"; + break; + + case Type::VectorTyID: + printType(Out, CPV->getType()); + Out << "("; + + if (ConstantVector *CV = dyn_cast(CPV)) { + printConstantVector(CV, Static); + } else { + assert(isa(CPV) || isa(CPV)); + VectorType *VT = cast(CPV->getType()); + Constant *CZ = Constant::getNullValue(VT->getElementType()); + printConstant(CZ, Static); + for (unsigned i = 1, e = VT->getNumElements(); i != e; ++i) { + Out << ", "; + printConstant(CZ, Static); + } + } + Out << ")"; + break; + + case Type::StructTyID: + if (!Static) { + // call init func... 
+ printType(Out, CPV->getType()); + Out << "::init"; + } + if (isa(CPV) || isa(CPV)) { + StructType *ST = cast(CPV->getType()); + Out << '('; + if (ST->getNumElements()) { + Out << ' '; + printConstant(Constant::getNullValue(ST->getElementType(0)), Static); + for (unsigned i = 1, e = ST->getNumElements(); i != e; ++i) { + Out << ", "; + printConstant(Constant::getNullValue(ST->getElementType(i)), Static); + } + } + Out << ')'; + } else { + Out << '('; + if (CPV->getNumOperands()) { + Out << ' '; + printConstant(cast(CPV->getOperand(0)), Static); + for (unsigned i = 1, e = CPV->getNumOperands(); i != e; ++i) { + Out << ", "; + printConstant(cast(CPV->getOperand(i)), Static); + } + } + Out << ')'; + } + break; + + case Type::PointerTyID: + if (isa(CPV)) { + Out << "(("; + printType(Out, CPV->getType()); // sign doesn't matter + Out << ")/*NULL*/0)"; + break; + } else if (GlobalValue *GV = dyn_cast(CPV)) { + writeOperand(GV, Static); + break; + } + // FALL THROUGH + default: +#ifndef NDEBUG + errs() << "Unknown constant type: " << *CPV << "\n"; +#endif + llvm_unreachable(0); + } +} + +// Some constant expressions need to be casted back to the original types +// because their operands were casted to the expected type. This function takes +// care of detecting that case and printing the cast for the ConstantExpr. +bool CWriter::printConstExprCast(const ConstantExpr* CE, bool Static) { + bool NeedsExplicitCast = false; + Type *Ty = CE->getOperand(0)->getType(); + bool TypeIsSigned = false; + switch (CE->getOpcode()) { + case Instruction::Add: + case Instruction::Sub: + case Instruction::Mul: + // We need to cast integer arithmetic so that it is always performed + // as unsigned, to avoid undefined behavior on overflow. + case Instruction::LShr: + case Instruction::URem: + case Instruction::UDiv: NeedsExplicitCast = true; break; + case Instruction::AShr: + case Instruction::SRem: + case Instruction::SDiv: NeedsExplicitCast = true; TypeIsSigned = true; break; + case Instruction::SExt: + Ty = CE->getType(); + NeedsExplicitCast = true; + TypeIsSigned = true; + break; + case Instruction::ZExt: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::FPExt: + case Instruction::UIToFP: + case Instruction::SIToFP: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::BitCast: + Ty = CE->getType(); + NeedsExplicitCast = true; + break; + default: break; + } + if (NeedsExplicitCast) { + Out << "(("; + if (Ty->isIntegerTy() && Ty != Type::getInt1Ty(Ty->getContext())) + printSimpleType(Out, Ty, TypeIsSigned); + else + printType(Out, Ty); // not integer, sign doesn't matter + Out << ")("; + } + return NeedsExplicitCast; +} + +// Print a constant assuming that it is the operand for a given Opcode. The +// opcodes that care about sign need to cast their operands to the expected +// type before the operation proceeds. This function does the casting. +void CWriter::printConstantWithCast(Constant* CPV, unsigned Opcode) { + + // Extract the operand's type, we'll need it. + Type* OpTy = CPV->getType(); + + // Indicate whether to do the cast or not. + bool shouldCast = false; + bool typeIsSigned = false; + + // Based on the Opcode for which this Constant is being written, determine + // the new type to which the operand should be casted by setting the value + // of OpTy. If we change OpTy, also set shouldCast to true so it gets + // casted below. 
+ switch (Opcode) { + default: + // for most instructions, it doesn't matter + break; + case Instruction::Add: + case Instruction::Sub: + case Instruction::Mul: + // We need to cast integer arithmetic so that it is always performed + // as unsigned, to avoid undefined behavior on overflow. + case Instruction::LShr: + case Instruction::UDiv: + case Instruction::URem: + shouldCast = true; + break; + case Instruction::AShr: + case Instruction::SDiv: + case Instruction::SRem: + shouldCast = true; + typeIsSigned = true; + break; + } + + // Write out the casted constant if we should, otherwise just write the + // operand. + if (shouldCast) { + Out << "(("; + printSimpleType(Out, OpTy, typeIsSigned); + Out << ")"; + printConstant(CPV, false); + Out << ")"; + } else + printConstant(CPV, false); +} + +std::string CWriter::GetValueName(const Value *Operand) { + + // Resolve potential alias. + if (const GlobalAlias *GA = dyn_cast(Operand)) { + if (const Value *V = GA->resolveAliasedGlobal(false)) + Operand = V; + } + + // Mangle globals with the standard mangler interface for LLC compatibility. + if (const GlobalValue *GV = dyn_cast(Operand)) { + SmallString<128> Str; + Mang->getNameWithPrefix(Str, GV, false); + return CBEMangle(Str.str().str()); + } + + std::string Name = Operand->getName(); + + if (Name.empty()) { // Assign unique names to local temporaries. + unsigned &No = AnonValueNumbers[Operand]; + if (No == 0) + No = ++NextAnonValueNumber; + Name = "tmp__" + utostr(No); + } + + std::string VarName; + VarName.reserve(Name.capacity()); + + for (std::string::iterator I = Name.begin(), E = Name.end(); + I != E; ++I) { + char ch = *I; + + if (!((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || + (ch >= '0' && ch <= '9') || ch == '_')) { + char buffer[5]; + sprintf(buffer, "_%x_", ch); + VarName += buffer; + } else + VarName += ch; + } + + return VarName + "_llvm_cbe"; +} + +/// writeInstComputationInline - Emit the computation for the specified +/// instruction inline, with no destination provided. +void CWriter::writeInstComputationInline(Instruction &I) { + // We can't currently support integer types other than 1, 8, 16, 32, 64. + // Validate this. + Type *Ty = I.getType(); + if (Ty->isIntegerTy() && (Ty!=Type::getInt1Ty(I.getContext()) && + Ty!=Type::getInt8Ty(I.getContext()) && + Ty!=Type::getInt16Ty(I.getContext()) && + Ty!=Type::getInt32Ty(I.getContext()) && + Ty!=Type::getInt64Ty(I.getContext()))) { + report_fatal_error("The C backend does not currently support integer " + "types of widths other than 1, 8, 16, 32, 64.\n" + "This is being tracked as PR 4158."); + } + + // If this is a non-trivial bool computation, make sure to truncate down to + // a 1 bit value. This is important because we want "add i1 x, y" to return + // "0" when x and y are true, not "2" for example. + bool NeedBoolTrunc = false; + if (I.getType() == Type::getInt1Ty(I.getContext()) && + !isa(I) && !isa(I)) + NeedBoolTrunc = true; + + if (NeedBoolTrunc) + Out << "(("; + + visit(I); + + if (NeedBoolTrunc) + Out << ")&1)"; +} + + +void CWriter::writeOperandInternal(Value *Operand, bool Static) { + if (Instruction *I = dyn_cast(Operand)) + // Should we inline this instruction to build a tree? 
+ if (isInlinableInst(*I) && !isDirectAlloca(I)) { + Out << '('; + writeInstComputationInline(*I); + Out << ')'; + return; + } + + Constant* CPV = dyn_cast(Operand); + + if (CPV && !isa(CPV)) + printConstant(CPV, Static); + else + Out << GetValueName(Operand); +} + +void CWriter::writeOperand(Value *Operand, bool Static) { + bool isAddressImplicit = isAddressExposed(Operand); + if (isAddressImplicit) + Out << "(&"; // Global variables are referenced as their addresses by llvm + + writeOperandInternal(Operand, Static); + + if (isAddressImplicit) + Out << ')'; +} + +// Some instructions need to have their result value casted back to the +// original types because their operands were casted to the expected type. +// This function takes care of detecting that case and printing the cast +// for the Instruction. +bool CWriter::writeInstructionCast(const Instruction &I) { + Type *Ty = I.getOperand(0)->getType(); + switch (I.getOpcode()) { + case Instruction::Add: + case Instruction::Sub: + case Instruction::Mul: + // We need to cast integer arithmetic so that it is always performed + // as unsigned, to avoid undefined behavior on overflow. + case Instruction::LShr: + case Instruction::URem: + case Instruction::UDiv: + Out << "(("; + printSimpleType(Out, Ty, false); + Out << ")("; + return true; + case Instruction::AShr: + case Instruction::SRem: + case Instruction::SDiv: + Out << "(("; + printSimpleType(Out, Ty, true); + Out << ")("; + return true; + default: break; + } + return false; +} + +// Write the operand with a cast to another type based on the Opcode being used. +// This will be used in cases where an instruction has specific type +// requirements (usually signedness) for its operands. +void CWriter::writeOperandWithCast(Value* Operand, unsigned Opcode) { + + // Extract the operand's type, we'll need it. + Type* OpTy = Operand->getType(); + + // Indicate whether to do the cast or not. + bool shouldCast = false; + + // Indicate whether the cast should be to a signed type or not. + bool castIsSigned = false; + + // Based on the Opcode for which this Operand is being written, determine + // the new type to which the operand should be casted by setting the value + // of OpTy. If we change OpTy, also set shouldCast to true. + switch (Opcode) { + default: + // for most instructions, it doesn't matter + break; + case Instruction::Add: + case Instruction::Sub: + case Instruction::Mul: + // We need to cast integer arithmetic so that it is always performed + // as unsigned, to avoid undefined behavior on overflow. + case Instruction::LShr: + case Instruction::UDiv: + case Instruction::URem: // Cast to unsigned first + shouldCast = true; + castIsSigned = false; + break; + case Instruction::GetElementPtr: + case Instruction::AShr: + case Instruction::SDiv: + case Instruction::SRem: // Cast to signed first + shouldCast = true; + castIsSigned = true; + break; + } + + // Write out the casted operand if we should, otherwise just write the + // operand. + if (shouldCast) { + Out << "(("; + printSimpleType(Out, OpTy, castIsSigned); + Out << ")"; + writeOperand(Operand); + Out << ")"; + } else + writeOperand(Operand); +} + +// Write the operand with a cast to another type based on the icmp predicate +// being used. +void CWriter::writeOperandWithCast(Value* Operand, const ICmpInst &Cmp) { + // This has to do a cast to ensure the operand has the right signedness. 
+ // Also, if the operand is a pointer, we make sure to cast to an integer when + // doing the comparison both for signedness and so that the C compiler doesn't + // optimize things like "p < NULL" to false (p may contain an integer value + // f.e.). + bool shouldCast = Cmp.isRelational(); + + // Write out the casted operand if we should, otherwise just write the + // operand. + if (!shouldCast) { + writeOperand(Operand); + return; + } + + // Should this be a signed comparison? If so, convert to signed. + bool castIsSigned = Cmp.isSigned(); + + // If the operand was a pointer, convert to a large integer type. + Type* OpTy = Operand->getType(); + if (OpTy->isPointerTy()) + OpTy = TD->getIntPtrType(Operand->getContext()); + + Out << "(("; + printSimpleType(Out, OpTy, castIsSigned); + Out << ")"; + writeOperand(Operand); + Out << ")"; +} + +// generateCompilerSpecificCode - This is where we add conditional compilation +// directives to cater to specific compilers as need be. +// +static void generateCompilerSpecificCode(formatted_raw_ostream& Out, + const TargetData *TD) { + // Alloca, ... + Out << "#include \n" + << "#include \n" + << "/* get a declaration for alloca */\n" + << "#ifdef _MSC_VER\n" + << "#include \n" + << "#define alloca _alloca\n" + << "#else\n" + << "#include \n" + << "#endif\n\n"; + + // We output GCC specific attributes to preserve 'linkonce'ness on globals. + // If we aren't being compiled with GCC, just drop these attributes. + Out << "#ifndef __GNUC__ /* Can only support \"linkonce\" vars with GCC */\n" + << "#define __attribute__(X)\n" + << "#endif\n\n"; + + // On Mac OS X, "external weak" is spelled "__attribute__((weak_import))". + Out << "#if defined(__GNUC__) && defined(__APPLE_CC__)\n" + << "#define __EXTERNAL_WEAK__ __attribute__((weak_import))\n" + << "#elif defined(__GNUC__)\n" + << "#define __EXTERNAL_WEAK__ __attribute__((weak))\n" + << "#else\n" + << "#define __EXTERNAL_WEAK__\n" + << "#endif\n\n"; + + // For now, turn off the weak linkage attribute on Mac OS X. (See above.) + Out << "#if defined(__GNUC__) && defined(__APPLE_CC__)\n" + << "#define __ATTRIBUTE_WEAK__\n" + << "#elif defined(__GNUC__)\n" + << "#define __ATTRIBUTE_WEAK__ __attribute__((weak))\n" + << "#else\n" + << "#define __ATTRIBUTE_WEAK__\n" + << "#endif\n\n"; + + // Add hidden visibility support. FIXME: APPLE_CC? + Out << "#if defined(__GNUC__)\n" + << "#define __HIDDEN__ __attribute__((visibility(\"hidden\")))\n" + << "#endif\n\n"; + + // Define NaN and Inf as GCC builtins if using GCC, as 0 otherwise + // From the GCC documentation: + // + // double __builtin_nan (const char *str) + // + // This is an implementation of the ISO C99 function nan. + // + // Since ISO C99 defines this function in terms of strtod, which we do + // not implement, a description of the parsing is in order. The string is + // parsed as by strtol; that is, the base is recognized by leading 0 or + // 0x prefixes. The number parsed is placed in the significand such that + // the least significant bit of the number is at the least significant + // bit of the significand. The number is truncated to fit the significand + // field provided. The significand is forced to be a quiet NaN. + // + // This function, if given a string literal, is evaluated early enough + // that it is considered a compile-time constant. + // + // float __builtin_nanf (const char *str) + // + // Similar to __builtin_nan, except the return type is float. 
+  //
+  // double __builtin_inf (void)
+  //
+  // Similar to __builtin_huge_val, except a warning is generated if the
+  // target floating-point format does not support infinities. This
+  // function is suitable for implementing the ISO C99 macro INFINITY.
+  //
+  // float __builtin_inff (void)
+  //
+  // Similar to __builtin_inf, except the return type is float.
+  Out << "#if (defined(__GNUC__) || defined(__clang__)) && !defined(__INTEL_COMPILER)\n"
+      << "#define LLVM_NAN(NanStr) __builtin_nan(NanStr) /* Double */\n"
+      << "#define LLVM_NANF(NanStr) __builtin_nanf(NanStr) /* Float */\n"
+      << "#define LLVM_NANS(NanStr) __builtin_nans(NanStr) /* Double */\n"
+      << "#define LLVM_NANSF(NanStr) __builtin_nansf(NanStr) /* Float */\n"
+      << "#define LLVM_INF __builtin_inf() /* Double */\n"
+      << "#define LLVM_INFF __builtin_inff() /* Float */\n"
+      << "//#define LLVM_PREFETCH(addr,rw,locality) "
+         "__builtin_prefetch(addr,rw,locality)\n"
+      << "//#define __ATTRIBUTE_CTOR__ __attribute__((constructor))\n"
+      << "//#define __ATTRIBUTE_DTOR__ __attribute__((destructor))\n"
+      << "//#define LLVM_ASM __asm__\n"
+      << "#elif defined(_MSC_VER) || defined(__INTEL_COMPILER)\n"
+      << "#include <limits>\n"
+      << "#define LLVM_NAN(NanStr) std::numeric_limits<double>::quiet_NaN()\n"
+      << "#define LLVM_NANF(NanStr) std::numeric_limits<float>::quiet_NaN()\n"
+      << "#define LLVM_NANS(NanStr) std::numeric_limits<double>::signaling_NaN()\n"
+      << "#define LLVM_NANSF(NanStr) std::numeric_limits<float>::signaling_NaN()\n"
+      << "#define LLVM_INF std::numeric_limits<double>::infinity()\n"
+      << "#define LLVM_INFF std::numeric_limits<float>::infinity()\n"
+      << "//#define LLVM_PREFETCH(addr,rw,locality) /* PREFETCH */\n"
+      << "//#define __ATTRIBUTE_CTOR__\n"
+      << "//#define __ATTRIBUTE_DTOR__\n"
+      << "//#define LLVM_ASM(X)\n"
+      << "#else\n"
+      << "#error \"Not MSVC, clang, or g++?\"\n"
+      << "#endif\n\n";
+
+  Out << "#if defined(__clang__) || defined(__INTEL_COMPILER) || "
+         "(__GNUC__ < 4) /* Old GCCs, or compilers not GCC */ \n"
+      << "#define __builtin_stack_save() 0 /* not implemented */\n"
+      << "#define __builtin_stack_restore(X) /* noop */\n"
+      << "#endif\n\n";
+
+#if 0
+  // Output typedefs for 128-bit integers. If these are needed with a
+  // 32-bit target or with a C compiler that doesn't support mode(TI),
+  // more drastic measures will be needed.
+  Out << "#if __GNUC__ && __LP64__ /* 128-bit integer types */\n"
+      << "typedef int __attribute__((mode(TI))) llvmInt128;\n"
+      << "typedef unsigned __attribute__((mode(TI))) llvmUInt128;\n"
+      << "#endif\n\n";
+#endif
+
+  // Output target-specific code that should be inserted into main.
+  Out << "#define CODE_FOR_MAIN() /* Any target-specific code for main()*/\n";
+}
+
+/// FindStaticTors - Given a static ctor/dtor list, unpack its contents into
+/// the StaticTors set.
+static void FindStaticTors(GlobalVariable *GV, std::set<Function*> &StaticTors){
+  ConstantArray *InitList = dyn_cast<ConstantArray>(GV->getInitializer());
+  if (!InitList) return;
+
+  for (unsigned i = 0, e = InitList->getNumOperands(); i != e; ++i)
+    if (ConstantStruct *CS = dyn_cast<ConstantStruct>(InitList->getOperand(i))){
+      if (CS->getNumOperands() != 2) return;  // Not array of 2-element structs.
+
+      if (CS->getOperand(1)->isNullValue())
+        return;  // Found a null terminator, exit printing.
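+      // Otherwise operand 1 is the ctor/dtor function itself, possibly behind a cast.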
+ Constant *FP = CS->getOperand(1); + if (ConstantExpr *CE = dyn_cast(FP)) + if (CE->isCast()) + FP = CE->getOperand(0); + if (Function *F = dyn_cast(FP)) + StaticTors.insert(F); + } +} + +enum SpecialGlobalClass { + NotSpecial = 0, + GlobalCtors, GlobalDtors, + NotPrinted +}; + +/// getGlobalVariableClass - If this is a global that is specially recognized +/// by LLVM, return a code that indicates how we should handle it. +static SpecialGlobalClass getGlobalVariableClass(const GlobalVariable *GV) { + // If this is a global ctors/dtors list, handle it now. + if (GV->hasAppendingLinkage() && GV->use_empty()) { + if (GV->getName() == "llvm.global_ctors") + return GlobalCtors; + else if (GV->getName() == "llvm.global_dtors") + return GlobalDtors; + } + + // Otherwise, if it is other metadata, don't print it. This catches things + // like debug information. + if (GV->getSection() == "llvm.metadata") + return NotPrinted; + + return NotSpecial; +} + +// PrintEscapedString - Print each character of the specified string, escaping +// it if it is not printable or if it is an escape char. +static void PrintEscapedString(const char *Str, unsigned Length, + raw_ostream &Out) { + for (unsigned i = 0; i != Length; ++i) { + unsigned char C = Str[i]; + if (isprint(C) && C != '\\' && C != '"') + Out << C; + else if (C == '\\') + Out << "\\\\"; + else if (C == '\"') + Out << "\\\""; + else if (C == '\t') + Out << "\\t"; + else + Out << "\\x" << hexdigit(C >> 4) << hexdigit(C & 0x0F); + } +} + +// PrintEscapedString - Print each character of the specified string, escaping +// it if it is not printable or if it is an escape char. +static void PrintEscapedString(const std::string &Str, raw_ostream &Out) { + PrintEscapedString(Str.c_str(), Str.size(), Out); +} + +bool CWriter::doInitialization(Module &M) { + FunctionPass::doInitialization(M); + + // Initialize + TheModule = &M; + + TD = new TargetData(&M); + IL = new IntrinsicLowering(*TD); + IL->AddPrototypes(M); + +#if 0 + std::string Triple = TheModule->getTargetTriple(); + if (Triple.empty()) + Triple = llvm::sys::getDefaultTargetTriple(); + + std::string E; + if (const Target *Match = TargetRegistry::lookupTarget(Triple, E)) + TAsm = Match->createMCAsmInfo(Triple); +#endif + TAsm = new CBEMCAsmInfo(); + MRI = new MCRegisterInfo(); + TCtx = new MCContext(*TAsm, *MRI, NULL); + Mang = new Mangler(*TCtx, *TD); + + // Keep track of which functions are static ctors/dtors so they can have + // an attribute added to their prototypes. + std::set StaticCtors, StaticDtors; + for (Module::global_iterator I = M.global_begin(), E = M.global_end(); + I != E; ++I) { + switch (getGlobalVariableClass(I)) { + default: break; + case GlobalCtors: + FindStaticTors(I, StaticCtors); + break; + case GlobalDtors: + FindStaticTors(I, StaticDtors); + break; + } + } + + Out << "/*******************************************************************\n"; + Out << " This file has been automatically generated by ispc\n"; + Out << " DO NOT EDIT THIS FILE DIRECTLY\n"; + Out << " *******************************************************************/\n\n"; + + // get declaration for alloca + Out << "/* Provide Declarations */\n"; + Out << "#include \n"; // Varargs support + Out << "#include \n"; // Unwind support + Out << "#include \n"; // With overflow intrinsics support. 
+ Out << "#include \n"; + Out << "#include \n"; + Out << "#ifdef _MSC_VER\n"; + Out << " #define NOMINMAX\n"; + Out << " #include \n"; + Out << "#endif // _MSC_VER\n"; + + Out << "#include \"" << includeName << "\"\n"; + + generateCompilerSpecificCode(Out, TD); + + // Provide a definition for `bool' if not compiling with a C++ compiler. + Out << "\n" + << "#ifndef __cplusplus\ntypedef unsigned char bool;\n#endif\n" + + << "\n\n/* Support for floating point constants */\n" + << "typedef uint64_t ConstantDoubleTy;\n" + << "typedef uint32_t ConstantFloatTy;\n" + << "typedef struct { unsigned long long f1; unsigned short f2; " + "unsigned short pad[3]; } ConstantFP80Ty;\n" + // This is used for both kinds of 128-bit long double; meaning differs. + << "typedef struct { uint64_t f1, f2; } ConstantFP128Ty;\n" + << "\n\n/* Global Declarations */\n\n"; + + // First output all the declarations for the program, because C requires + // Functions & globals to be declared before they are used. + // + if (!M.getModuleInlineAsm().empty()) { + Out << "/* Module asm statements */\n" + << "asm("; + + // Split the string into lines, to make it easier to read the .ll file. + std::string Asm = M.getModuleInlineAsm(); + size_t CurPos = 0; + size_t NewLine = Asm.find_first_of('\n', CurPos); + while (NewLine != std::string::npos) { + // We found a newline, print the portion of the asm string from the + // last newline up to this newline. + Out << "\""; + PrintEscapedString(std::string(Asm.begin()+CurPos, Asm.begin()+NewLine), + Out); + Out << "\\n\"\n"; + CurPos = NewLine+1; + NewLine = Asm.find_first_of('\n', CurPos); + } + Out << "\""; + PrintEscapedString(std::string(Asm.begin()+CurPos, Asm.end()), Out); + Out << "\");\n" + << "/* End Module asm statements */\n"; + } + + // Loop over the symbol table, emitting all named constants. + printModuleTypes(); + + // Global variable declarations... + if (!M.global_empty()) { + Out << "\n/* External Global Variable Declarations */\n"; + for (Module::global_iterator I = M.global_begin(), E = M.global_end(); + I != E; ++I) { + + if (I->hasExternalLinkage() || I->hasExternalWeakLinkage() || + I->hasCommonLinkage()) + Out << "extern "; + else if (I->hasDLLImportLinkage()) + Out << "__declspec(dllimport) "; + else + continue; // Internal Global + + // Thread Local Storage + if (I->isThreadLocal()) + Out << "__thread "; + + printType(Out, I->getType()->getElementType(), false, GetValueName(I)); + + if (I->hasExternalWeakLinkage()) + Out << " __EXTERNAL_WEAK__"; + Out << ";\n"; + } + } + + // Function declarations + Out << "\n/* Function Declarations */\n"; + Out << "extern \"C\" {\n"; + Out << "int puts(unsigned char *);\n"; + Out << "unsigned int putchar(unsigned int);\n"; + Out << "int fflush(void *);\n"; + Out << "int printf(const unsigned char *, ...);\n"; + + // Store the intrinsics which will be declared/defined below. + SmallVector intrinsicsToDefine; + + for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { + // Don't print declarations for intrinsic functions. + // Store the used intrinsics, which need to be explicitly defined. 
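+    // (at present only llvm.uadd.with.overflow and llvm.sadd.with.overflow get definitions emitted).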
+ if (I->isIntrinsic()) { + switch (I->getIntrinsicID()) { + default: + break; + case Intrinsic::uadd_with_overflow: + case Intrinsic::sadd_with_overflow: + intrinsicsToDefine.push_back(I); + break; + } + continue; + } + + if (I->getName() == "setjmp" || I->getName() == "abort" || + I->getName() == "longjmp" || I->getName() == "_setjmp" || + I->getName() == "memset" || I->getName() == "memset_pattern16" || + I->getName() == "puts" || + I->getName() == "printf" || I->getName() == "putchar" || + I->getName() == "fflush") + continue; + + // Don't redeclare ispc's own intrinsics + std::string name = I->getName(); + if (name.size() > 2 && name[0] == '_' && name[1] == '_') + continue; + + if (I->hasExternalWeakLinkage()) + Out << "extern "; + printFunctionSignature(I, true); + if (I->hasWeakLinkage() || I->hasLinkOnceLinkage()) + Out << " __ATTRIBUTE_WEAK__"; + if (I->hasExternalWeakLinkage()) + Out << " __EXTERNAL_WEAK__"; + if (StaticCtors.count(I)) + Out << " __ATTRIBUTE_CTOR__"; + if (StaticDtors.count(I)) + Out << " __ATTRIBUTE_DTOR__"; + if (I->hasHiddenVisibility()) + Out << " __HIDDEN__"; + + if (I->hasName() && I->getName()[0] == 1) + Out << " LLVM_ASM(\"" << I->getName().substr(1) << "\")"; + + Out << ";\n"; + } + Out << "}\n"; + + // Output the global variable declarations + if (!M.global_empty()) { + Out << "\n\n/* Global Variable Declarations */\n"; + for (Module::global_iterator I = M.global_begin(), E = M.global_end(); + I != E; ++I) + if (!I->isDeclaration()) { + // Ignore special globals, such as debug info. + if (getGlobalVariableClass(I)) + continue; + + if (I->hasLocalLinkage()) + continue; + else + Out << "extern "; + + // Thread Local Storage + if (I->isThreadLocal()) + Out << "__thread "; + + printType(Out, I->getType()->getElementType(), false, + GetValueName(I)); + + if (I->hasLinkOnceLinkage()) + Out << " __attribute__((common))"; + else if (I->hasCommonLinkage()) // FIXME is this right? + Out << " __ATTRIBUTE_WEAK__"; + else if (I->hasWeakLinkage()) + Out << " __ATTRIBUTE_WEAK__"; + else if (I->hasExternalWeakLinkage()) + Out << " __EXTERNAL_WEAK__"; + if (I->hasHiddenVisibility()) + Out << " __HIDDEN__"; + Out << ";\n"; + } + } + + // Output the global variable definitions and contents... + if (!M.global_empty()) { + Out << "\n\n/* Global Variable Definitions and Initialization */\n"; + for (Module::global_iterator I = M.global_begin(), E = M.global_end(); + I != E; ++I) + if (!I->isDeclaration()) { + // Ignore special globals, such as debug info. + if (getGlobalVariableClass(I)) + continue; + + if (I->hasLocalLinkage()) + Out << "static "; + else if (I->hasDLLImportLinkage()) + Out << "__declspec(dllimport) "; + else if (I->hasDLLExportLinkage()) + Out << "__declspec(dllexport) "; + + // Thread Local Storage + if (I->isThreadLocal()) + Out << "__thread "; + + printType(Out, I->getType()->getElementType(), false, + GetValueName(I)); + if (I->hasLinkOnceLinkage()) + Out << " __attribute__((common))"; + else if (I->hasWeakLinkage()) + Out << " __ATTRIBUTE_WEAK__"; + else if (I->hasCommonLinkage()) + Out << " __ATTRIBUTE_WEAK__"; + + if (I->hasHiddenVisibility()) + Out << " __HIDDEN__"; + + // If the initializer is not null, emit the initializer. If it is null, + // we try to avoid emitting large amounts of zeros. The problem with + // this, however, occurs when the variable has weak linkage. In this + // case, the assembler will complain about the variable being both weak + // and common, so we disable this optimization. 
+ // FIXME common linkage should avoid this problem. + if (!I->getInitializer()->isNullValue()) { + Out << " = " ; + writeOperand(I->getInitializer(), true); + } else if (I->hasWeakLinkage()) { + // We have to specify an initializer, but it doesn't have to be + // complete. If the value is an aggregate, print out { 0 }, and let + // the compiler figure out the rest of the zeros. + Out << " = " ; + if (I->getInitializer()->getType()->isStructTy() || + I->getInitializer()->getType()->isVectorTy()) { + Out << "{ 0 }"; + } else if (I->getInitializer()->getType()->isArrayTy()) { + // As with structs and vectors, but with an extra set of braces + // because arrays are wrapped in structs. + Out << "{ { 0 } }"; + } else { + // Just print it out normally. + writeOperand(I->getInitializer(), true); + } + } + Out << ";\n"; + } + } + + if (!M.empty()) + Out << "\n\n/* Function Bodies */\n"; + + // Emit some helper functions for dealing with FCMP instruction's + // predicates + Out << "template static inline int llvm_fcmp_ord(A X, B Y) { "; + Out << "return X == X && Y == Y; }\n"; + Out << "template static inline int llvm_fcmp_uno(A X, B Y) { "; + Out << "return X != X || Y != Y; }\n"; + Out << "template static inline int llvm_fcmp_ueq(A X, B Y) { "; + Out << "return X == Y || llvm_fcmp_uno(X, Y); }\n"; + Out << "template static inline int llvm_fcmp_une(A X, B Y) { "; + Out << "return X != Y; }\n"; + Out << "template static inline int llvm_fcmp_ult(A X, B Y) { "; + Out << "return X < Y || llvm_fcmp_uno(X, Y); }\n"; + Out << "template static inline int llvm_fcmp_ugt(A X, B Y) { "; + Out << "return X > Y || llvm_fcmp_uno(X, Y); }\n"; + Out << "template static inline int llvm_fcmp_ule(A X, B Y) { "; + Out << "return X <= Y || llvm_fcmp_uno(X, Y); }\n"; + Out << "template static inline int llvm_fcmp_uge(A X, B Y) { "; + Out << "return X >= Y || llvm_fcmp_uno(X, Y); }\n"; + Out << "template static inline int llvm_fcmp_oeq(A X, B Y) { "; + Out << "return X == Y ; }\n"; + Out << "template static inline int llvm_fcmp_one(A X, B Y) { "; + Out << "return X != Y && llvm_fcmp_ord(X, Y); }\n"; + Out << "template static inline int llvm_fcmp_olt(A X, B Y) { "; + Out << "return X < Y ; }\n"; + Out << "template static inline int llvm_fcmp_ogt(A X, B Y) { "; + Out << "return X > Y ; }\n"; + Out << "template static inline int llvm_fcmp_ole(A X, B Y) { "; + Out << "return X <= Y ; }\n"; + Out << "template static inline int llvm_fcmp_oge(A X, B Y) { "; + Out << "return X >= Y ; }\n"; + Out << "template A *Memset(A *ptr, int count, size_t len) { "; + Out << "return (A *)memset(ptr, count, len); }\n"; + + // Emit definitions of the intrinsics. + for (SmallVector::const_iterator + I = intrinsicsToDefine.begin(), + E = intrinsicsToDefine.end(); I != E; ++I) { + printIntrinsicDefinition(**I, Out); + } + + return false; +} + + +/// Output all floating point constants that cannot be printed accurately... +void CWriter::printFloatingPointConstants(Function &F) { + // Scan the module for floating point constants. If any FP constant is used + // in the function, we want to redirect it here so that we do not depend on + // the precision of the printed form, unless the printed form preserves + // precision. + // + for (constant_iterator I = constant_begin(&F), E = constant_end(&F); + I != E; ++I) + printFloatingPointConstants(*I); + + Out << '\n'; +} + +void CWriter::printFloatingPointConstants(const Constant *C) { + // If this is a constant expression, recursively check for constant fp values. 
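+  // (a constant expression's operands may themselves contain FP constants).  Any
+  // constant that does need hoisting is emitted as its raw bit pattern, roughly:
+  //   static const ConstantFloatTy FPConstant0 = 0x3F800000U;  /* the bits of 1.0f */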
+ if (const ConstantExpr *CE = dyn_cast(C)) { + for (unsigned i = 0, e = CE->getNumOperands(); i != e; ++i) + printFloatingPointConstants(CE->getOperand(i)); + return; + } + + // Otherwise, check for a FP constant that we need to print. + const ConstantFP *FPC = dyn_cast(C); + if (FPC == 0 || + // Do not put in FPConstantMap if safe. + isFPCSafeToPrint(FPC) || + // Already printed this constant? + FPConstantMap.count(FPC)) + return; + + FPConstantMap[FPC] = FPCounter; // Number the FP constants + + if (FPC->getType() == Type::getDoubleTy(FPC->getContext())) { + double Val = FPC->getValueAPF().convertToDouble(); + uint64_t i = FPC->getValueAPF().bitcastToAPInt().getZExtValue(); + Out << "static const ConstantDoubleTy FPConstant" << FPCounter++ + << " = 0x" << utohexstr(i) + << "ULL; /* " << Val << " */\n"; + } else if (FPC->getType() == Type::getFloatTy(FPC->getContext())) { + float Val = FPC->getValueAPF().convertToFloat(); + uint32_t i = (uint32_t)FPC->getValueAPF().bitcastToAPInt(). + getZExtValue(); + Out << "static const ConstantFloatTy FPConstant" << FPCounter++ + << " = 0x" << utohexstr(i) + << "U; /* " << Val << " */\n"; + } else if (FPC->getType() == Type::getX86_FP80Ty(FPC->getContext())) { + // api needed to prevent premature destruction + APInt api = FPC->getValueAPF().bitcastToAPInt(); + const uint64_t *p = api.getRawData(); + Out << "static const ConstantFP80Ty FPConstant" << FPCounter++ + << " = { 0x" << utohexstr(p[0]) + << "ULL, 0x" << utohexstr((uint16_t)p[1]) << ",{0,0,0}" + << "}; /* Long double constant */\n"; + } else if (FPC->getType() == Type::getPPC_FP128Ty(FPC->getContext()) || + FPC->getType() == Type::getFP128Ty(FPC->getContext())) { + APInt api = FPC->getValueAPF().bitcastToAPInt(); + const uint64_t *p = api.getRawData(); + Out << "static const ConstantFP128Ty FPConstant" << FPCounter++ + << " = { 0x" + << utohexstr(p[0]) << ", 0x" << utohexstr(p[1]) + << "}; /* Long double constant */\n"; + + } else { + llvm_unreachable("Unknown float type!"); + } +} + + +/// printSymbolTable - Run through symbol table looking for type names. If a +/// type name is found, emit its declaration... +/// +void CWriter::printModuleTypes() { + Out << "\n/* Helper union for bitcasts */\n"; + Out << "typedef union {\n"; + Out << " unsigned int Int32;\n"; + Out << " unsigned long long Int64;\n"; + Out << " float Float;\n"; + Out << " double Double;\n"; + Out << "} llvmBitCastUnion;\n"; + + // Get all of the struct types used in the module. + std::vector StructTypes; + TheModule->findUsedStructTypes(StructTypes); + + // Get all of the array types used in the module + std::vector ArrayTypes; + findUsedArrayTypes(TheModule, ArrayTypes); + + if (StructTypes.empty() && ArrayTypes.empty()) + return; + + Out << "/* Structure and array forward declarations */\n"; + + unsigned NextTypeID = 0; + + // If any of them are missing names, add a unique ID to UnnamedStructIDs. + // Print out forward declarations for structure types. + for (unsigned i = 0, e = StructTypes.size(); i != e; ++i) { + StructType *ST = StructTypes[i]; + + if (ST->isLiteral() || ST->getName().empty()) + UnnamedStructIDs[ST] = NextTypeID++; + + std::string Name = getStructName(ST); + + Out << "struct " << Name << ";\n"; + } + + for (unsigned i = 0, e = ArrayTypes.size(); i != e; ++i) { + ArrayType *AT = ArrayTypes[i]; + ArrayIDs[AT] = NextTypeID++; + std::string Name = getArrayName(AT); + Out << "struct " << Name << ";\n"; + } + Out << '\n'; + + // Keep track of which types have been printed so far. 
+ SmallPtrSet StructArrayPrinted; + + // Loop over all structures then push them into the stack so they are + // printed in the correct order. + // + Out << "/* Structure and array contents */\n"; + for (unsigned i = 0, e = StructTypes.size(); i != e; ++i) { + if (StructTypes[i]->isStructTy()) + // Only print out used types! + printContainedStructs(StructTypes[i], StructArrayPrinted); + } + + for (unsigned i = 0, e = ArrayTypes.size(); i != e; ++i) + printContainedArrays(ArrayTypes[i], StructArrayPrinted); + + Out << '\n'; +} + +// Push the struct onto the stack and recursively push all structs +// this one depends on. +// +// TODO: Make this work properly with vector types +// +void CWriter::printContainedStructs(Type *Ty, + SmallPtrSet &Printed) { + // Don't walk through pointers. + if (Ty->isPointerTy() || Ty->isPrimitiveType() || Ty->isIntegerTy()) + return; + + // Print all contained types first. + for (Type::subtype_iterator I = Ty->subtype_begin(), + E = Ty->subtype_end(); I != E; ++I) + printContainedStructs(*I, Printed); + + if (StructType *ST = dyn_cast(Ty)) { + // Check to see if we have already printed this struct. + if (!Printed.insert(Ty)) return; + + // Print structure type out. + printType(Out, ST, false, getStructName(ST), true); + Out << ";\n\n"; + } + if (ArrayType *AT = dyn_cast(Ty)) { + if (!Printed.insert(Ty)) return; + + printType(Out, AT, false, getArrayName(AT), true); + Out << ";\n\n"; + } +} + +void CWriter::printContainedArrays(ArrayType *ATy, + SmallPtrSet &Printed) { + if (!Printed.insert(ATy)) + return; + + ArrayType *ChildTy = dyn_cast(ATy->getElementType()); + if (ChildTy != NULL) + printContainedArrays(ChildTy, Printed); + + printType(Out, ATy, false, getArrayName(ATy), true); + Out << ";\n\n"; +} + +void CWriter::printFunctionSignature(const Function *F, bool Prototype) { + /// isStructReturn - Should this function actually return a struct by-value? + bool isStructReturn = F->hasStructRetAttr(); + + if (F->hasLocalLinkage()) Out << "static "; + if (F->hasDLLImportLinkage()) Out << "__declspec(dllimport) "; + if (F->hasDLLExportLinkage()) Out << "__declspec(dllexport) "; + switch (F->getCallingConv()) { + case CallingConv::X86_StdCall: + Out << "__attribute__((stdcall)) "; + break; + case CallingConv::X86_FastCall: + Out << "__attribute__((fastcall)) "; + break; + case CallingConv::X86_ThisCall: + Out << "__attribute__((thiscall)) "; + break; + default: + break; + } + + // Loop over the arguments, printing them... + FunctionType *FT = cast(F->getFunctionType()); + const AttrListPtr &PAL = F->getAttributes(); + + std::string tstr; + raw_string_ostream FunctionInnards(tstr); + + // Print out the name... + FunctionInnards << GetValueName(F) << '('; + + bool PrintedArg = false; + if (!F->isDeclaration()) { + if (!F->arg_empty()) { + Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); + unsigned Idx = 1; + + // If this is a struct-return function, don't print the hidden + // struct-return argument. 
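+      // (its pointee type is printed as the C return type instead; see below).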
+ if (isStructReturn) { + assert(I != E && "Invalid struct return function!"); + ++I; + ++Idx; + } + + std::string ArgName; + for (; I != E; ++I) { + if (PrintedArg) FunctionInnards << ", "; + if (I->hasName() || !Prototype) + ArgName = GetValueName(I); + else + ArgName = ""; + Type *ArgTy = I->getType(); + if (PAL.paramHasAttr(Idx, Attribute::ByVal)) { + ArgTy = cast(ArgTy)->getElementType(); + ByValParams.insert(I); + } + printType(FunctionInnards, ArgTy, + /*isSigned=*/PAL.paramHasAttr(Idx, Attribute::SExt), + ArgName); + PrintedArg = true; + ++Idx; + } + } + } else { + // Loop over the arguments, printing them. + FunctionType::param_iterator I = FT->param_begin(), E = FT->param_end(); + unsigned Idx = 1; + + // If this is a struct-return function, don't print the hidden + // struct-return argument. + if (isStructReturn) { + assert(I != E && "Invalid struct return function!"); + ++I; + ++Idx; + } + + for (; I != E; ++I) { + if (PrintedArg) FunctionInnards << ", "; + Type *ArgTy = *I; + if (PAL.paramHasAttr(Idx, Attribute::ByVal)) { + assert(ArgTy->isPointerTy()); + ArgTy = cast(ArgTy)->getElementType(); + } + printType(FunctionInnards, ArgTy, + /*isSigned=*/PAL.paramHasAttr(Idx, Attribute::SExt)); + PrintedArg = true; + ++Idx; + } + } + + if (!PrintedArg && FT->isVarArg()) { + FunctionInnards << "int vararg_dummy_arg"; + PrintedArg = true; + } + + // Finish printing arguments... if this is a vararg function, print the ..., + // unless there are no known types, in which case, we just emit (). + // + if (FT->isVarArg() && PrintedArg) { + FunctionInnards << ",..."; // Output varargs portion of signature! + } else if (!FT->isVarArg() && !PrintedArg) { + FunctionInnards << "void"; // ret() -> ret(void) in C. + } + FunctionInnards << ')'; + + // Get the return tpe for the function. + Type *RetTy; + if (!isStructReturn) + RetTy = F->getReturnType(); + else { + // If this is a struct-return function, print the struct-return type. + RetTy = cast(FT->getParamType(0))->getElementType(); + } + + // Print out the return type and the signature built above. + printType(Out, RetTy, + /*isSigned=*/PAL.paramHasAttr(0, Attribute::SExt), + FunctionInnards.str()); +} + +static inline bool isFPIntBitCast(const Instruction &I) { + if (!isa(I)) + return false; + Type *SrcTy = I.getOperand(0)->getType(); + Type *DstTy = I.getType(); + return (SrcTy->isFloatingPointTy() && DstTy->isIntegerTy()) || + (DstTy->isFloatingPointTy() && SrcTy->isIntegerTy()); +} + +void CWriter::printFunction(Function &F) { + /// isStructReturn - Should this function actually return a struct by-value? + bool isStructReturn = F.hasStructRetAttr(); + + printFunctionSignature(&F, false); + Out << " {\n"; + + // If this is a struct return function, handle the result with magic. 
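+  // (declare a local "StructReturn" temporary and point the hidden sret argument at it).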
+ if (isStructReturn) { + Type *StructTy = + cast(F.arg_begin()->getType())->getElementType(); + Out << " "; + printType(Out, StructTy, false, "StructReturn"); + Out << "; /* Struct return temporary */\n"; + + Out << " "; + printType(Out, F.arg_begin()->getType(), false, + GetValueName(F.arg_begin())); + Out << " = &StructReturn;\n"; + } + + bool PrintedVar = false; + + // print local variable information for the function + for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ++I) { + if (const AllocaInst *AI = isDirectAlloca(&*I)) { + Out << " "; + printType(Out, AI->getAllocatedType(), false, GetValueName(AI)); + Out << "; /* Address-exposed local */\n"; + PrintedVar = true; + } else if (I->getType() != Type::getVoidTy(F.getContext()) && + !isInlinableInst(*I)) { + Out << " "; + printType(Out, I->getType(), false, GetValueName(&*I)); + Out << ";\n"; + + if (isa(*I)) { // Print out PHI node temporaries as well... + Out << " "; + printType(Out, I->getType(), false, + GetValueName(&*I)+"__PHI"); + Out << ";\n"; + } + PrintedVar = true; + } + // We need a temporary for the BitCast to use so it can pluck a value out + // of a union to do the BitCast. This is separate from the need for a + // variable to hold the result of the BitCast. + if (isFPIntBitCast(*I)) { + Out << " llvmBitCastUnion " << GetValueName(&*I) + << "__BITCAST_TEMPORARY;\n"; + PrintedVar = true; + } + } + + if (PrintedVar) + Out << '\n'; + + if (F.hasExternalLinkage() && F.getName() == "main") + Out << " CODE_FOR_MAIN();\n"; + + // print the basic blocks + for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { + if (Loop *L = LI->getLoopFor(BB)) { + if (L->getHeader() == BB && L->getParentLoop() == 0) + printLoop(L); + } else { + printBasicBlock(BB); + } + } + + Out << "}\n\n"; +} + +void CWriter::printLoop(Loop *L) { + Out << " do { /* Syntactic loop '" << L->getHeader()->getName() + << "' to make GCC happy */\n"; + for (unsigned i = 0, e = L->getBlocks().size(); i != e; ++i) { + BasicBlock *BB = L->getBlocks()[i]; + Loop *BBLoop = LI->getLoopFor(BB); + if (BBLoop == L) + printBasicBlock(BB); + else if (BB == BBLoop->getHeader() && BBLoop->getParentLoop() == L) + printLoop(BBLoop); + } + Out << " } while (1); /* end of syntactic loop '" + << L->getHeader()->getName() << "' */\n"; +} + +void CWriter::printBasicBlock(BasicBlock *BB) { + + // Don't print the label for the basic block if there are no uses, or if + // the only terminator use is the predecessor basic block's terminator. + // We have to scan the use list because PHI nodes use basic blocks too but + // do not require a label to be generated. + // + bool NeedsLabel = false; + for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) + if (isGotoCodeNecessary(*PI, BB)) { + NeedsLabel = true; + break; + } + + if (NeedsLabel) Out << GetValueName(BB) << ": {\n"; + + // Output all of the instructions in the basic block... + for (BasicBlock::iterator II = BB->begin(), E = --BB->end(); II != E; + ++II) { + if (!isInlinableInst(*II) && !isDirectAlloca(II)) { + if (II->getType() != Type::getVoidTy(BB->getContext()) && + !isInlineAsm(*II)) + outputLValue(II); + else + Out << " "; + writeInstComputationInline(*II); + Out << ";\n"; + } + } + + // Don't emit prefix or suffix for the terminator. + visit(*BB->getTerminator()); + if (NeedsLabel) Out << "}\n"; // workaround g++ bug +} + + +// Specific Instruction type classes... note that all of the casts are +// necessary because we use the instruction classes as opaque types... 
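+// Each visit method below prints the C (or, for vector types, generic vector C++)
+// equivalent of a single LLVM instruction.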
+// +void CWriter::visitReturnInst(ReturnInst &I) { + // If this is a struct return function, return the temporary struct. + bool isStructReturn = I.getParent()->getParent()->hasStructRetAttr(); + + if (isStructReturn) { + Out << " return StructReturn;\n"; + return; + } + + // Don't output a void return if this is the last basic block in the function + if (I.getNumOperands() == 0 && + &*--I.getParent()->getParent()->end() == I.getParent() && + !I.getParent()->size() == 1) { + return; + } + + Out << " return"; + if (I.getNumOperands()) { + Out << ' '; + writeOperand(I.getOperand(0)); + } + Out << ";\n"; +} + +void CWriter::visitSwitchInst(SwitchInst &SI) { + + Value* Cond = SI.getCondition(); + + Out << " switch ("; + writeOperand(Cond); + Out << ") {\n default:\n"; + printPHICopiesForSuccessor (SI.getParent(), SI.getDefaultDest(), 2); + printBranchToBlock(SI.getParent(), SI.getDefaultDest(), 2); + Out << ";\n"; + + unsigned NumCases = SI.getNumCases(); + // Skip the first item since that's the default case. + for (unsigned i = 1; i < NumCases; ++i) { + ConstantInt* CaseVal = SI.getCaseValue(i); + BasicBlock* Succ = SI.getSuccessor(i); + Out << " case "; + writeOperand(CaseVal); + Out << ":\n"; + printPHICopiesForSuccessor (SI.getParent(), Succ, 2); + printBranchToBlock(SI.getParent(), Succ, 2); + if (Function::iterator(Succ) == llvm::next(Function::iterator(SI.getParent()))) + Out << " break;\n"; + } + + Out << " }\n"; +} + +void CWriter::visitIndirectBrInst(IndirectBrInst &IBI) { + Out << " goto *(void*)("; + writeOperand(IBI.getOperand(0)); + Out << ");\n"; +} + +void CWriter::visitUnreachableInst(UnreachableInst &I) { + Out << " /*UNREACHABLE*/;\n"; +} + +bool CWriter::isGotoCodeNecessary(BasicBlock *From, BasicBlock *To) { + /// FIXME: This should be reenabled, but loop reordering safe!! + return true; + + if (llvm::next(Function::iterator(From)) != Function::iterator(To)) + return true; // Not the direct successor, we need a goto. + + //isa(From->getTerminator()) + + if (LI->getLoopFor(From) != LI->getLoopFor(To)) + return true; + return false; +} + +void CWriter::printPHICopiesForSuccessor (BasicBlock *CurBlock, + BasicBlock *Successor, + unsigned Indent) { + for (BasicBlock::iterator I = Successor->begin(); isa(I); ++I) { + PHINode *PN = cast(I); + // Now we have to do the printing. + Value *IV = PN->getIncomingValueForBlock(CurBlock); + if (!isa(IV)) { + Out << std::string(Indent, ' '); + Out << " " << GetValueName(I) << "__PHI = "; + writeOperand(IV); + Out << "; /* for PHI node */\n"; + } + } +} + +void CWriter::printBranchToBlock(BasicBlock *CurBB, BasicBlock *Succ, + unsigned Indent) { + if (isGotoCodeNecessary(CurBB, Succ)) { + Out << std::string(Indent, ' ') << " goto "; + writeOperand(Succ); + Out << ";\n"; + } +} + +// Branch instruction printing - Avoid printing out a branch to a basic block +// that immediately succeeds the current one. +// +void CWriter::visitBranchInst(BranchInst &I) { + + if (I.isConditional()) { + if (isGotoCodeNecessary(I.getParent(), I.getSuccessor(0))) { + Out << " if ("; + writeOperand(I.getCondition()); + Out << ") {\n"; + + printPHICopiesForSuccessor (I.getParent(), I.getSuccessor(0), 2); + printBranchToBlock(I.getParent(), I.getSuccessor(0), 2); + + if (isGotoCodeNecessary(I.getParent(), I.getSuccessor(1))) { + Out << " } else {\n"; + printPHICopiesForSuccessor (I.getParent(), I.getSuccessor(1), 2); + printBranchToBlock(I.getParent(), I.getSuccessor(1), 2); + } + } else { + // First goto not necessary, assume second one is... 
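+      // (emit the test negated so only the second successor needs a goto; the first falls through).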
+ Out << " if (!"; + writeOperand(I.getCondition()); + Out << ") {\n"; + + printPHICopiesForSuccessor (I.getParent(), I.getSuccessor(1), 2); + printBranchToBlock(I.getParent(), I.getSuccessor(1), 2); + } + + Out << " }\n"; + } else { + printPHICopiesForSuccessor (I.getParent(), I.getSuccessor(0), 0); + printBranchToBlock(I.getParent(), I.getSuccessor(0), 0); + } + Out << "\n"; +} + +// PHI nodes get copied into temporary values at the end of predecessor basic +// blocks. We now need to copy these temporary values into the REAL value for +// the PHI. +void CWriter::visitPHINode(PHINode &I) { + writeOperand(&I); + Out << "__PHI"; +} + + +void CWriter::visitBinaryOperator(Instruction &I) { + // binary instructions, shift instructions, setCond instructions. + assert(!I.getType()->isPointerTy()); + + if (isa(I.getOperand(0)->getType())) { + const char *intrinsic = NULL; + switch (I.getOpcode()) { + case Instruction::Add: intrinsic = "__add"; break; + case Instruction::FAdd: intrinsic = "__add"; break; + case Instruction::Sub: intrinsic = "__sub"; break; + case Instruction::FSub: intrinsic = "__sub"; break; + case Instruction::Mul: intrinsic = "__mul"; break; + case Instruction::FMul: intrinsic = "__mul"; break; + case Instruction::URem: intrinsic = "__urem"; break; + case Instruction::SRem: intrinsic = "__srem"; break; + case Instruction::FRem: intrinsic = "__frem"; break; + case Instruction::UDiv: intrinsic = "__udiv"; break; + case Instruction::SDiv: intrinsic = "__sdiv"; break; + case Instruction::FDiv: intrinsic = "__div"; break; + case Instruction::And: intrinsic = "__and"; break; + case Instruction::Or: intrinsic = "__or"; break; + case Instruction::Xor: intrinsic = "__xor"; break; + case Instruction::Shl : intrinsic = "__shl"; break; + case Instruction::LShr: intrinsic = "__lshr"; break; + case Instruction::AShr: intrinsic = "__ashr"; break; + default: +#ifndef NDEBUG + errs() << "Invalid operator type!" << I; +#endif + llvm_unreachable(0); + } + Out << intrinsic; + Out << "("; + writeOperand(I.getOperand(0)); + Out << ", "; + writeOperand(I.getOperand(1)); + Out << ")"; + return; + } + + // We must cast the results of binary operations which might be promoted. + bool needsCast = false; + if ((I.getType() == Type::getInt8Ty(I.getContext())) || + (I.getType() == Type::getInt16Ty(I.getContext())) + || (I.getType() == Type::getFloatTy(I.getContext()))) { + needsCast = true; + Out << "(("; + printType(Out, I.getType(), false); + Out << ")("; + } + + // If this is a negation operation, print it out as such. For FP, we don't + // want to print "-0.0 - X". + if (BinaryOperator::isNeg(&I)) { + Out << "-("; + writeOperand(BinaryOperator::getNegArgument(cast(&I))); + Out << ")"; + } else if (BinaryOperator::isFNeg(&I)) { + Out << "-("; + writeOperand(BinaryOperator::getFNegArgument(cast(&I))); + Out << ")"; + } else if (I.getOpcode() == Instruction::FRem) { + // Output a call to fmod/fmodf instead of emitting a%b + if (I.getType() == Type::getFloatTy(I.getContext())) + Out << "fmodf("; + else if (I.getType() == Type::getDoubleTy(I.getContext())) + Out << "fmod("; + else // all 3 flavors of long double + Out << "fmodl("; + writeOperand(I.getOperand(0)); + Out << ", "; + writeOperand(I.getOperand(1)); + Out << ")"; + } else { + + // Write out the cast of the instruction's value back to the proper type + // if necessary. 
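+    // (signed overflow is undefined in C, so e.g. a signed "add" is evaluated in the
+    // matching unsigned type and cast back).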
+ bool NeedsClosingParens = writeInstructionCast(I); + + // Certain instructions require the operand to be forced to a specific type + // so we use writeOperandWithCast here instead of writeOperand. Similarly + // below for operand 1 + writeOperandWithCast(I.getOperand(0), I.getOpcode()); + + switch (I.getOpcode()) { + case Instruction::Add: + case Instruction::FAdd: Out << " + "; break; + case Instruction::Sub: + case Instruction::FSub: Out << " - "; break; + case Instruction::Mul: + case Instruction::FMul: Out << " * "; break; + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: Out << " % "; break; + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: Out << " / "; break; + case Instruction::And: Out << " & "; break; + case Instruction::Or: Out << " | "; break; + case Instruction::Xor: Out << " ^ "; break; + case Instruction::Shl : Out << " << "; break; + case Instruction::LShr: + case Instruction::AShr: Out << " >> "; break; + default: +#ifndef NDEBUG + errs() << "Invalid operator type!" << I; +#endif + llvm_unreachable(0); + } + + writeOperandWithCast(I.getOperand(1), I.getOpcode()); + if (NeedsClosingParens) + Out << "))"; + } + + if (needsCast) { + Out << "))"; + } +} + +void CWriter::visitICmpInst(ICmpInst &I) { + bool isVector = isa(I.getOperand(0)->getType()); + + if (isVector) { + switch (I.getPredicate()) { + case ICmpInst::ICMP_EQ: Out << "__equal"; break; + case ICmpInst::ICMP_NE: Out << "__not_equal"; break; + case ICmpInst::ICMP_ULE: Out << "__unsigned_less_equal"; break; + case ICmpInst::ICMP_SLE: Out << "__signed_less_equal"; break; + case ICmpInst::ICMP_UGE: Out << "__unsigned_greater_equal"; break; + case ICmpInst::ICMP_SGE: Out << "__signed_greater_equal"; break; + case ICmpInst::ICMP_ULT: Out << "__unsigned_less_than"; break; + case ICmpInst::ICMP_SLT: Out << "__signed_less_than"; break; + case ICmpInst::ICMP_UGT: Out << "__unsigned_greater_than"; break; + case ICmpInst::ICMP_SGT: Out << "__signed_greater_than"; break; + default: llvm_unreachable(0); + } + Out << "("; + writeOperand(I.getOperand(0)); + Out << ", "; + writeOperand(I.getOperand(1)); + Out << ")"; + return; + } + + // Write out the cast of the instruction's value back to the proper type + // if necessary. + bool NeedsClosingParens = writeInstructionCast(I); + + // Certain icmp predicate require the operand to be forced to a specific type + // so we use writeOperandWithCast here instead of writeOperand. Similarly + // below for operand 1 + writeOperandWithCast(I.getOperand(0), I); + + switch (I.getPredicate()) { + case ICmpInst::ICMP_EQ: Out << " == "; break; + case ICmpInst::ICMP_NE: Out << " != "; break; + case ICmpInst::ICMP_ULE: + case ICmpInst::ICMP_SLE: Out << " <= "; break; + case ICmpInst::ICMP_UGE: + case ICmpInst::ICMP_SGE: Out << " >= "; break; + case ICmpInst::ICMP_ULT: + case ICmpInst::ICMP_SLT: Out << " < "; break; + case ICmpInst::ICMP_UGT: + case ICmpInst::ICMP_SGT: Out << " > "; break; + default: +#ifndef NDEBUG + errs() << "Invalid icmp predicate!" 
<< I; +#endif + llvm_unreachable(0); + } + + writeOperandWithCast(I.getOperand(1), I); + if (NeedsClosingParens) + Out << "))"; +} + +void CWriter::visitFCmpInst(FCmpInst &I) { + bool isVector = isa(I.getOperand(0)->getType()); + + if (I.getPredicate() == FCmpInst::FCMP_FALSE) { + if (isVector) + report_fatal_error("FIXME: vector FCMP_FALSE"); + else + Out << "0"; + return; + } + if (I.getPredicate() == FCmpInst::FCMP_TRUE) { + if (isVector) + report_fatal_error("FIXME: vector FCMP_TRUE"); + else + Out << "1"; + return; + } + + if (isVector) { + switch (I.getPredicate()) { + default: llvm_unreachable("Illegal FCmp predicate"); + case FCmpInst::FCMP_ORD: Out << "__ordered("; break; + case FCmpInst::FCMP_UNO: Out << "__cmpunord("; break; + case FCmpInst::FCMP_UEQ: Out << "__ucomeq("; break; + case FCmpInst::FCMP_UNE: Out << "__ucomneq("; break; + case FCmpInst::FCMP_ULT: Out << "__ucomlt("; break; + case FCmpInst::FCMP_ULE: Out << "__ucomle("; break; + case FCmpInst::FCMP_UGT: Out << "__ucomgt("; break; + case FCmpInst::FCMP_UGE: Out << "__ucomge("; break; + case FCmpInst::FCMP_OEQ: Out << "__equal("; break; + case FCmpInst::FCMP_ONE: Out << "__not_equal("; break; + case FCmpInst::FCMP_OLT: Out << "__less_than("; break; + case FCmpInst::FCMP_OLE: Out << "__less_equal("; break; + case FCmpInst::FCMP_OGT: Out << "__greater_than("; break; + case FCmpInst::FCMP_OGE: Out << "__greater_equal("; break; + } + } + else { + const char* op = 0; + switch (I.getPredicate()) { + default: llvm_unreachable("Illegal FCmp predicate"); + case FCmpInst::FCMP_ORD: op = "ord"; break; + case FCmpInst::FCMP_UNO: op = "uno"; break; + case FCmpInst::FCMP_UEQ: op = "ueq"; break; + case FCmpInst::FCMP_UNE: op = "une"; break; + case FCmpInst::FCMP_ULT: op = "ult"; break; + case FCmpInst::FCMP_ULE: op = "ule"; break; + case FCmpInst::FCMP_UGT: op = "ugt"; break; + case FCmpInst::FCMP_UGE: op = "uge"; break; + case FCmpInst::FCMP_OEQ: op = "oeq"; break; + case FCmpInst::FCMP_ONE: op = "one"; break; + case FCmpInst::FCMP_OLT: op = "olt"; break; + case FCmpInst::FCMP_OLE: op = "ole"; break; + case FCmpInst::FCMP_OGT: op = "ogt"; break; + case FCmpInst::FCMP_OGE: op = "oge"; break; + } + + Out << "llvm_fcmp_" << op << "("; + } + + // Write the first operand + writeOperand(I.getOperand(0)); + Out << ", "; + // Write the second operand + writeOperand(I.getOperand(1)); + Out << ")"; +} + +static const char * getFloatBitCastField(Type *Ty) { + switch (Ty->getTypeID()) { + default: llvm_unreachable("Invalid Type"); + case Type::FloatTyID: return "Float"; + case Type::DoubleTyID: return "Double"; + case Type::IntegerTyID: { + unsigned NumBits = cast(Ty)->getBitWidth(); + if (NumBits <= 32) + return "Int32"; + else + return "Int64"; + } + } +} + +void CWriter::visitCastInst(CastInst &I) { + Type *DstTy = I.getType(); + Type *SrcTy = I.getOperand(0)->getType(); + if (isFPIntBitCast(I)) { + Out << '('; + // These int<->float and long<->double casts need to be handled specially + Out << GetValueName(&I) << "__BITCAST_TEMPORARY." + << getFloatBitCastField(I.getOperand(0)->getType()) << " = "; + writeOperand(I.getOperand(0)); + Out << ", " << GetValueName(&I) << "__BITCAST_TEMPORARY." + << getFloatBitCastField(I.getType()); + Out << ')'; + return; + } + + Out << '('; + bool closeParen = printCast(I.getOpcode(), SrcTy, DstTy); + + // Make a sext from i1 work by subtracting the i1 from 0 (an int). 
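+  // (0 - 1 == -1, i.e. all bits set, which is what "sext i1 true" produces).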
+ if (SrcTy == Type::getInt1Ty(I.getContext()) && + I.getOpcode() == Instruction::SExt) + Out << "0-"; + + writeOperand(I.getOperand(0)); + + if (DstTy == Type::getInt1Ty(I.getContext()) && + (I.getOpcode() == Instruction::Trunc || + I.getOpcode() == Instruction::FPToUI || + I.getOpcode() == Instruction::FPToSI || + I.getOpcode() == Instruction::PtrToInt)) { + // Make sure we really get a trunc to bool by anding the operand with 1 + Out << "&1u"; + } + Out << ')'; + if (closeParen) + Out << ')'; +} + +void CWriter::visitSelectInst(SelectInst &I) { + if (llvm::isa(I.getType())) { + Out << "__select("; + writeOperand(I.getCondition()); + Out << ", "; + writeOperand(I.getTrueValue()); + Out << ", "; + writeOperand(I.getFalseValue()); + Out << ")"; + return; + } + + Out << "(("; + writeOperand(I.getCondition()); + Out << ") ? ("; + writeOperand(I.getTrueValue()); + Out << ") : ("; + writeOperand(I.getFalseValue()); + Out << "))"; +} + +// Returns the macro name or value of the max or min of an integer type +// (as defined in limits.h). +static void printLimitValue(IntegerType &Ty, bool isSigned, bool isMax, + raw_ostream &Out) { + const char* type; + const char* sprefix = ""; + + unsigned NumBits = Ty.getBitWidth(); + if (NumBits <= 8) { + type = "CHAR"; + sprefix = "S"; + } else if (NumBits <= 16) { + type = "SHRT"; + } else if (NumBits <= 32) { + type = "INT"; + } else if (NumBits <= 64) { + type = "LLONG"; + } else { + llvm_unreachable("Bit widths > 64 not implemented yet"); + } + + if (isSigned) + Out << sprefix << type << (isMax ? "_MAX" : "_MIN"); + else + Out << "U" << type << (isMax ? "_MAX" : "0"); +} + +#ifndef NDEBUG +static bool isSupportedIntegerSize(IntegerType &T) { + return T.getBitWidth() == 8 || T.getBitWidth() == 16 || + T.getBitWidth() == 32 || T.getBitWidth() == 64; +} +#endif + +void CWriter::printIntrinsicDefinition(const Function &F, raw_ostream &Out) { + FunctionType *funT = F.getFunctionType(); + Type *retT = F.getReturnType(); + IntegerType *elemT = cast(funT->getParamType(1)); + + assert(isSupportedIntegerSize(*elemT) && + "CBackend does not support arbitrary size integers."); + assert(cast(retT)->getElementType(0) == elemT && + elemT == funT->getParamType(0) && funT->getNumParams() == 2); + + switch (F.getIntrinsicID()) { + default: + llvm_unreachable("Unsupported Intrinsic."); + case Intrinsic::uadd_with_overflow: + // static inline Rty uadd_ixx(unsigned ixx a, unsigned ixx b) { + // Rty r; + // r.field0 = a + b; + // r.field1 = (r.field0 < a); + // return r; + // } + Out << "static inline "; + printType(Out, retT); + Out << GetValueName(&F); + Out << "("; + printSimpleType(Out, elemT, false); + Out << "a,"; + printSimpleType(Out, elemT, false); + Out << "b) {\n "; + printType(Out, retT); + Out << "r;\n"; + Out << " r.field0 = a + b;\n"; + Out << " r.field1 = (r.field0 < a);\n"; + Out << " return r;\n}\n"; + break; + + case Intrinsic::sadd_with_overflow: + // static inline Rty sadd_ixx(ixx a, ixx b) { + // Rty r; + // r.field1 = (b > 0 && a > XX_MAX - b) || + // (b < 0 && a < XX_MIN - b); + // r.field0 = r.field1 ? 
0 : a + b; + // return r; + // } + Out << "static "; + printType(Out, retT); + Out << GetValueName(&F); + Out << "("; + printSimpleType(Out, elemT, true); + Out << "a,"; + printSimpleType(Out, elemT, true); + Out << "b) {\n "; + printType(Out, retT); + Out << "r;\n"; + Out << " r.field1 = (b > 0 && a > "; + printLimitValue(*elemT, true, true, Out); + Out << " - b) || (b < 0 && a < "; + printLimitValue(*elemT, true, false, Out); + Out << " - b);\n"; + Out << " r.field0 = r.field1 ? 0 : a + b;\n"; + Out << " return r;\n}\n"; + break; + } +} + +void CWriter::lowerIntrinsics(Function &F) { + // This is used to keep track of intrinsics that get generated to a lowered + // function. We must generate the prototypes before the function body which + // will only be expanded on first use (by the loop below). + std::vector prototypesToGen; + + // Examine all the instructions in this function to find the intrinsics that + // need to be lowered. + for (Function::iterator BB = F.begin(), EE = F.end(); BB != EE; ++BB) + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) + if (CallInst *CI = dyn_cast(I++)) + if (Function *F = CI->getCalledFunction()) + switch (F->getIntrinsicID()) { + case Intrinsic::not_intrinsic: + case Intrinsic::vastart: + case Intrinsic::vacopy: + case Intrinsic::vaend: + case Intrinsic::returnaddress: + case Intrinsic::frameaddress: + case Intrinsic::setjmp: + case Intrinsic::longjmp: + case Intrinsic::memset: + case Intrinsic::prefetch: + case Intrinsic::powi: + case Intrinsic::x86_sse_cmp_ss: + case Intrinsic::x86_sse_cmp_ps: + case Intrinsic::x86_sse2_cmp_sd: + case Intrinsic::x86_sse2_cmp_pd: + case Intrinsic::ppc_altivec_lvsl: + case Intrinsic::uadd_with_overflow: + case Intrinsic::sadd_with_overflow: + // We directly implement these intrinsics + break; + default: + // If this is an intrinsic that directly corresponds to a GCC + // builtin, we handle it. + const char *BuiltinName = ""; +#define GET_GCC_BUILTIN_NAME +#include "llvm/Intrinsics.gen" +#undef GET_GCC_BUILTIN_NAME + // If we handle it, don't lower it. + if (BuiltinName[0]) break; + + // All other intrinsic calls we must lower. + Instruction *Before = 0; + if (CI != &BB->front()) + Before = prior(BasicBlock::iterator(CI)); + + IL->LowerIntrinsicCall(CI); + if (Before) { // Move iterator to instruction after call + I = Before; ++I; + } else { + I = BB->begin(); + } + // If the intrinsic got lowered to another call, and that call has + // a definition then we need to make sure its prototype is emitted + // before any calls to it. + if (CallInst *Call = dyn_cast(I)) + if (Function *NewF = Call->getCalledFunction()) + if (!NewF->isDeclaration()) + prototypesToGen.push_back(NewF); + + break; + } + + // We may have collected some prototypes to emit in the loop above. + // Emit them now, before the function that uses them is emitted. But, + // be careful not to emit them twice. + std::vector::iterator I = prototypesToGen.begin(); + std::vector::iterator E = prototypesToGen.end(); + for ( ; I != E; ++I) { + if (intrinsicPrototypesAlreadyGenerated.insert(*I).second) { + Out << '\n'; + printFunctionSignature(*I, true); + Out << ";\n"; + } + } +} + +void CWriter::visitCallInst(CallInst &I) { + if (isa(I.getCalledValue())) + return visitInlineAsm(I); + + bool WroteCallee = false; + + // Handle intrinsic function calls first... 
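+  // (visitBuiltinCall either prints the entire call itself, or just the callee name and sets WroteCallee).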
+ if (Function *F = I.getCalledFunction()) + if (Intrinsic::ID ID = (Intrinsic::ID)F->getIntrinsicID()) + if (visitBuiltinCall(I, ID, WroteCallee)) + return; + + Value *Callee = I.getCalledValue(); + + PointerType *PTy = cast(Callee->getType()); + FunctionType *FTy = cast(PTy->getElementType()); + + // If this is a call to a struct-return function, assign to the first + // parameter instead of passing it to the call. + const AttrListPtr &PAL = I.getAttributes(); + bool hasByVal = I.hasByValArgument(); + bool isStructRet = I.hasStructRetAttr(); + if (isStructRet) { + writeOperandDeref(I.getArgOperand(0)); + Out << " = "; + } + + if (I.isTailCall()) Out << " /*tail*/ "; + + if (!WroteCallee) { + // If this is an indirect call to a struct return function, we need to cast + // the pointer. Ditto for indirect calls with byval arguments. + bool NeedsCast = (hasByVal || isStructRet) && !isa(Callee); + + // GCC is a real PITA. It does not permit codegening casts of functions to + // function pointers if they are in a call (it generates a trap instruction + // instead!). We work around this by inserting a cast to void* in between + // the function and the function pointer cast. Unfortunately, we can't just + // form the constant expression here, because the folder will immediately + // nuke it. + // + // Note finally, that this is completely unsafe. ANSI C does not guarantee + // that void* and function pointers have the same size. :( To deal with this + // in the common case, we handle casts where the number of arguments passed + // match exactly. + // + if (ConstantExpr *CE = dyn_cast(Callee)) + if (CE->isCast()) + if (Function *RF = dyn_cast(CE->getOperand(0))) { + NeedsCast = true; + Callee = RF; + } + + if (NeedsCast) { + // Ok, just cast the pointer type. + Out << "(("; + if (isStructRet) + printStructReturnPointerFunctionType(Out, PAL, + cast(I.getCalledValue()->getType())); + else if (hasByVal) + printType(Out, I.getCalledValue()->getType(), false, "", true, PAL); + else + printType(Out, I.getCalledValue()->getType()); + Out << ")(void*)"; + } + writeOperand(Callee); + if (NeedsCast) Out << ')'; + } + + Out << '('; + + bool PrintedArg = false; + if(FTy->isVarArg() && !FTy->getNumParams()) { + Out << "0 /*dummy arg*/"; + PrintedArg = true; + } + + unsigned NumDeclaredParams = FTy->getNumParams(); + CallSite CS(&I); + CallSite::arg_iterator AI = CS.arg_begin(), AE = CS.arg_end(); + unsigned ArgNo = 0; + if (isStructRet) { // Skip struct return argument. + ++AI; + ++ArgNo; + } + + + for (; AI != AE; ++AI, ++ArgNo) { + if (PrintedArg) Out << ", "; + if (ArgNo < NumDeclaredParams && + (*AI)->getType() != FTy->getParamType(ArgNo)) { + Out << '('; + printType(Out, FTy->getParamType(ArgNo), + /*isSigned=*/PAL.paramHasAttr(ArgNo+1, Attribute::SExt)); + Out << ')'; + } + // Check if the argument is expected to be passed by value. + if (I.paramHasAttr(ArgNo+1, Attribute::ByVal)) + writeOperandDeref(*AI); + else + writeOperand(*AI); + PrintedArg = true; + } + Out << ')'; +} + +/// visitBuiltinCall - Handle the call to the specified builtin. Returns true +/// if the entire call is handled, return false if it wasn't handled, and +/// optionally set 'WroteCallee' if the callee has already been printed out. +bool CWriter::visitBuiltinCall(CallInst &I, Intrinsic::ID ID, + bool &WroteCallee) { + switch (ID) { + default: { + // If this is an intrinsic that directly corresponds to a GCC + // builtin, we emit it here. 
+ const char *BuiltinName = ""; + Function *F = I.getCalledFunction(); +#define GET_GCC_BUILTIN_NAME +#include "llvm/Intrinsics.gen" +#undef GET_GCC_BUILTIN_NAME + assert(BuiltinName[0] && "Unknown LLVM intrinsic!"); + + Out << BuiltinName; + WroteCallee = true; + return false; + } + case Intrinsic::vastart: + Out << "0; "; + + Out << "va_start(*(va_list*)"; + writeOperand(I.getArgOperand(0)); + Out << ", "; + // Output the last argument to the enclosing function. + if (I.getParent()->getParent()->arg_empty()) + Out << "vararg_dummy_arg"; + else + writeOperand(--I.getParent()->getParent()->arg_end()); + Out << ')'; + return true; + case Intrinsic::vaend: + if (!isa(I.getArgOperand(0))) { + Out << "0; va_end(*(va_list*)"; + writeOperand(I.getArgOperand(0)); + Out << ')'; + } else { + Out << "va_end(*(va_list*)0)"; + } + return true; + case Intrinsic::vacopy: + Out << "0; "; + Out << "va_copy(*(va_list*)"; + writeOperand(I.getArgOperand(0)); + Out << ", *(va_list*)"; + writeOperand(I.getArgOperand(1)); + Out << ')'; + return true; + case Intrinsic::returnaddress: + Out << "__builtin_return_address("; + writeOperand(I.getArgOperand(0)); + Out << ')'; + return true; + case Intrinsic::frameaddress: + Out << "__builtin_frame_address("; + writeOperand(I.getArgOperand(0)); + Out << ')'; + return true; + case Intrinsic::powi: + Out << "__builtin_powi("; + writeOperand(I.getArgOperand(0)); + Out << ", "; + writeOperand(I.getArgOperand(1)); + Out << ')'; + return true; + case Intrinsic::setjmp: + Out << "setjmp(*(jmp_buf*)"; + writeOperand(I.getArgOperand(0)); + Out << ')'; + return true; + case Intrinsic::longjmp: + Out << "longjmp(*(jmp_buf*)"; + writeOperand(I.getArgOperand(0)); + Out << ", "; + writeOperand(I.getArgOperand(1)); + Out << ')'; + return true; + case Intrinsic::memset: + Out << "Memset("; + writeOperand(I.getArgOperand(0)); + Out << ", "; + writeOperand(I.getArgOperand(1)); + Out << ", "; + writeOperand(I.getArgOperand(2)); + Out << ')'; + return true; + case Intrinsic::prefetch: + Out << "LLVM_PREFETCH((const void *)"; + writeOperand(I.getArgOperand(0)); + Out << ", "; + writeOperand(I.getArgOperand(1)); + Out << ", "; + writeOperand(I.getArgOperand(2)); + Out << ")"; + return true; + case Intrinsic::stacksave: + // Emit this as: Val = 0; *((void**)&Val) = __builtin_stack_save() + // to work around GCC bugs (see PR1809). + Out << "0; *((void**)&" << GetValueName(&I) + << ") = __builtin_stack_save()"; + return true; + case Intrinsic::x86_sse_cmp_ss: + case Intrinsic::x86_sse_cmp_ps: + case Intrinsic::x86_sse2_cmp_sd: + case Intrinsic::x86_sse2_cmp_pd: + Out << '('; + printType(Out, I.getType()); + Out << ')'; + // Multiple GCC builtins multiplex onto this intrinsic. 
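+    // (the immediate third operand selects the predicate, e.g. 0 turns llvm.x86.sse.cmp.ps
+    // into __builtin_ia32_cmpeqps).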
+ switch (cast(I.getArgOperand(2))->getZExtValue()) { + default: llvm_unreachable("Invalid llvm.x86.sse.cmp!"); + case 0: Out << "__builtin_ia32_cmpeq"; break; + case 1: Out << "__builtin_ia32_cmplt"; break; + case 2: Out << "__builtin_ia32_cmple"; break; + case 3: Out << "__builtin_ia32_cmpunord"; break; + case 4: Out << "__builtin_ia32_cmpneq"; break; + case 5: Out << "__builtin_ia32_cmpnlt"; break; + case 6: Out << "__builtin_ia32_cmpnle"; break; + case 7: Out << "__builtin_ia32_cmpord"; break; + } + if (ID == Intrinsic::x86_sse_cmp_ps || ID == Intrinsic::x86_sse2_cmp_pd) + Out << 'p'; + else + Out << 's'; + if (ID == Intrinsic::x86_sse_cmp_ss || ID == Intrinsic::x86_sse_cmp_ps) + Out << 's'; + else + Out << 'd'; + + Out << "("; + writeOperand(I.getArgOperand(0)); + Out << ", "; + writeOperand(I.getArgOperand(1)); + Out << ")"; + return true; + case Intrinsic::ppc_altivec_lvsl: + Out << '('; + printType(Out, I.getType()); + Out << ')'; + Out << "__builtin_altivec_lvsl(0, (void*)"; + writeOperand(I.getArgOperand(0)); + Out << ")"; + return true; + case Intrinsic::uadd_with_overflow: + case Intrinsic::sadd_with_overflow: + Out << GetValueName(I.getCalledFunction()) << "("; + writeOperand(I.getArgOperand(0)); + Out << ", "; + writeOperand(I.getArgOperand(1)); + Out << ")"; + return true; + } +} + +//This converts the llvm constraint string to something gcc is expecting. +//TODO: work out platform independent constraints and factor those out +// of the per target tables +// handle multiple constraint codes +std::string CWriter::InterpretASMConstraint(InlineAsm::ConstraintInfo& c) { + assert(c.Codes.size() == 1 && "Too many asm constraint codes to handle"); + + // Grab the translation table from MCAsmInfo if it exists. + const MCAsmInfo *TargetAsm; + std::string Triple = TheModule->getTargetTriple(); + if (Triple.empty()) +#if defined(LLVM_3_1) || defined(LLVM_3_1svn) + Triple = llvm::sys::getDefaultTargetTriple(); +#else + Triple = llvm::sys::getHostTriple(); +#endif + + std::string E; + if (const Target *Match = TargetRegistry::lookupTarget(Triple, E)) + TargetAsm = Match->createMCAsmInfo(Triple); + else + return c.Codes[0]; + + const char *const *table = TargetAsm->getAsmCBE(); + + // Search the translation table if it exists. + for (int i = 0; table && table[i]; i += 2) + if (c.Codes[0] == table[i]) { + delete TargetAsm; + return table[i+1]; + } + + // Default is identity. 
+ delete TargetAsm; + return c.Codes[0]; +} + +//TODO: import logic from AsmPrinter.cpp +static std::string gccifyAsm(std::string asmstr) { + for (std::string::size_type i = 0; i != asmstr.size(); ++i) + if (asmstr[i] == '\n') + asmstr.replace(i, 1, "\\n"); + else if (asmstr[i] == '\t') + asmstr.replace(i, 1, "\\t"); + else if (asmstr[i] == '$') { + if (asmstr[i + 1] == '{') { + std::string::size_type a = asmstr.find_first_of(':', i + 1); + std::string::size_type b = asmstr.find_first_of('}', i + 1); + std::string n = "%" + + asmstr.substr(a + 1, b - a - 1) + + asmstr.substr(i + 2, a - i - 2); + asmstr.replace(i, b - i + 1, n); + i += n.size() - 1; + } else + asmstr.replace(i, 1, "%"); + } + else if (asmstr[i] == '%')//grr + { asmstr.replace(i, 1, "%%"); ++i;} + + return asmstr; +} + +//TODO: assumptions about what consume arguments from the call are likely wrong +// handle communitivity +void CWriter::visitInlineAsm(CallInst &CI) { + InlineAsm* as = cast(CI.getCalledValue()); + InlineAsm::ConstraintInfoVector Constraints = as->ParseConstraints(); + + std::vector > ResultVals; + if (CI.getType() == Type::getVoidTy(CI.getContext())) + ; + else if (StructType *ST = dyn_cast(CI.getType())) { + for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) + ResultVals.push_back(std::make_pair(&CI, (int)i)); + } else { + ResultVals.push_back(std::make_pair(&CI, -1)); + } + + // Fix up the asm string for gcc and emit it. + Out << "__asm__ volatile (\"" << gccifyAsm(as->getAsmString()) << "\"\n"; + Out << " :"; + + unsigned ValueCount = 0; + bool IsFirst = true; + + // Convert over all the output constraints. + for (InlineAsm::ConstraintInfoVector::iterator I = Constraints.begin(), + E = Constraints.end(); I != E; ++I) { + + if (I->Type != InlineAsm::isOutput) { + ++ValueCount; + continue; // Ignore non-output constraints. + } + + assert(I->Codes.size() == 1 && "Too many asm constraint codes to handle"); + std::string C = InterpretASMConstraint(*I); + if (C.empty()) continue; + + if (!IsFirst) { + Out << ", "; + IsFirst = false; + } + + // Unpack the dest. + Value *DestVal; + int DestValNo = -1; + + if (ValueCount < ResultVals.size()) { + DestVal = ResultVals[ValueCount].first; + DestValNo = ResultVals[ValueCount].second; + } else + DestVal = CI.getArgOperand(ValueCount-ResultVals.size()); + + if (I->isEarlyClobber) + C = "&"+C; + + Out << "\"=" << C << "\"(" << GetValueName(DestVal); + if (DestValNo != -1) + Out << ".field" << DestValNo; // Multiple retvals. + Out << ")"; + ++ValueCount; + } + + + // Convert over all the input constraints. + Out << "\n :"; + IsFirst = true; + ValueCount = 0; + for (InlineAsm::ConstraintInfoVector::iterator I = Constraints.begin(), + E = Constraints.end(); I != E; ++I) { + if (I->Type != InlineAsm::isInput) { + ++ValueCount; + continue; // Ignore non-input constraints. + } + + assert(I->Codes.size() == 1 && "Too many asm constraint codes to handle"); + std::string C = InterpretASMConstraint(*I); + if (C.empty()) continue; + + if (!IsFirst) { + Out << ", "; + IsFirst = false; + } + + assert(ValueCount >= ResultVals.size() && "Input can't refer to result"); + Value *SrcVal = CI.getArgOperand(ValueCount-ResultVals.size()); + + Out << "\"" << C << "\"("; + if (!I->isIndirect) + writeOperand(SrcVal); + else + writeOperandDeref(SrcVal); + Out << ")"; + } + + // Convert over the clobber constraints. 
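+  // Each clobber is emitted as a quoted constraint string, comma-separated,
+  // after the output and input operand lists written above.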
+ IsFirst = true; + for (InlineAsm::ConstraintInfoVector::iterator I = Constraints.begin(), + E = Constraints.end(); I != E; ++I) { + if (I->Type != InlineAsm::isClobber) + continue; // Ignore non-input constraints. + + assert(I->Codes.size() == 1 && "Too many asm constraint codes to handle"); + std::string C = InterpretASMConstraint(*I); + if (C.empty()) continue; + + if (!IsFirst) { + Out << ", "; + IsFirst = false; + } + + Out << '\"' << C << '"'; + } + + Out << ")"; +} + +void CWriter::visitAllocaInst(AllocaInst &I) { + Out << '('; + printType(Out, I.getType()); + Out << ") alloca(sizeof("; + printType(Out, I.getType()->getElementType()); + Out << ')'; + if (I.isArrayAllocation()) { + Out << " * " ; + writeOperand(I.getOperand(0)); + } + Out << ')'; +} + +void CWriter::printGEPExpression(Value *Ptr, gep_type_iterator I, + gep_type_iterator E, bool Static) { + + // If there are no indices, just print out the pointer. + if (I == E) { + writeOperand(Ptr); + return; + } + + // Find out if the last index is into a vector. If so, we have to print this + // specially. Since vectors can't have elements of indexable type, only the + // last index could possibly be of a vector element. + VectorType *LastIndexIsVector = 0; + { + for (gep_type_iterator TmpI = I; TmpI != E; ++TmpI) + LastIndexIsVector = dyn_cast(*TmpI); + } + + Out << "("; + + // If the last index is into a vector, we can't print it as &a[i][j] because + // we can't index into a vector with j in GCC. Instead, emit this as + // (((float*)&a[i])+j) + if (LastIndexIsVector) { + Out << "(("; + printType(Out, PointerType::getUnqual(LastIndexIsVector->getElementType())); + Out << ")("; + } + + Out << '&'; + + // If the first index is 0 (very typical) we can do a number of + // simplifications to clean up the code. + Value *FirstOp = I.getOperand(); + if (!isa(FirstOp) || !cast(FirstOp)->isNullValue()) { + // First index isn't simple, print it the hard way. + writeOperand(Ptr); + } else { + ++I; // Skip the zero index. + + // Okay, emit the first operand. If Ptr is something that is already address + // exposed, like a global, avoid emitting (&foo)[0], just emit foo instead. + if (isAddressExposed(Ptr)) { + writeOperandInternal(Ptr, Static); + } else if (I != E && (*I)->isStructTy()) { + // If we didn't already emit the first operand, see if we can print it as + // P->f instead of "P[0].f" + writeOperand(Ptr); + Out << "->field" << cast(I.getOperand())->getZExtValue(); + ++I; // eat the struct index as well. + } else { + // Instead of emitting P[0][1], emit (*P)[1], which is more idiomatic. + Out << "(*"; + writeOperand(Ptr); + Out << ")"; + } + } + + for (; I != E; ++I) { + if ((*I)->isStructTy()) { + Out << ".field" << cast(I.getOperand())->getZExtValue(); + } else if ((*I)->isArrayTy()) { + Out << ".array["; + writeOperandWithCast(I.getOperand(), Instruction::GetElementPtr); + Out << ']'; + } else if (!(*I)->isVectorTy()) { + Out << '['; + writeOperandWithCast(I.getOperand(), Instruction::GetElementPtr); + Out << ']'; + } else { + // If the last index is into a vector, then print it out as "+j)". This + // works with the 'LastIndexIsVector' code above. + if (isa(I.getOperand()) && + cast(I.getOperand())->isNullValue()) { + Out << "))"; // avoid "+0". 
+ } else { + Out << ")+("; + writeOperandWithCast(I.getOperand(), Instruction::GetElementPtr); + Out << "))"; + } + } + } + Out << ")"; +} + +void CWriter::writeMemoryAccess(Value *Operand, Type *OperandType, + bool IsVolatile, unsigned Alignment) { + assert(!isa(OperandType)); + bool IsUnaligned = Alignment && + Alignment < TD->getABITypeAlignment(OperandType); + + if (!IsUnaligned) + Out << '*'; + if (IsVolatile || IsUnaligned) { + Out << "(("; + if (IsUnaligned) + Out << "struct __attribute__ ((packed, aligned(" << Alignment << "))) {"; + printType(Out, OperandType, false, IsUnaligned ? "data" : "volatile*"); + if (IsUnaligned) { + Out << "; } "; + if (IsVolatile) Out << "volatile "; + Out << "*"; + } + Out << ")"; + } + + writeOperand(Operand); + + if (IsVolatile || IsUnaligned) { + Out << ')'; + if (IsUnaligned) + Out << "->data"; + } +} + +void CWriter::visitLoadInst(LoadInst &I) { + VectorType *VT = dyn_cast(I.getType()); + if (VT != NULL) { + Out << "__load("; + writeOperand(I.getOperand(0)); + Out << ", " << I.getAlignment(); + Out << ")"; + return; + } + + writeMemoryAccess(I.getOperand(0), I.getType(), I.isVolatile(), + I.getAlignment()); +} + +void CWriter::visitStoreInst(StoreInst &I) { + VectorType *VT = dyn_cast(I.getOperand(0)->getType()); + if (VT != NULL) { + Out << "__store("; + writeOperand(I.getOperand(1)); + Out << ", "; + writeOperand(I.getOperand(0)); + Out << ", " << I.getAlignment() << ")"; + return; + } + + writeMemoryAccess(I.getPointerOperand(), I.getOperand(0)->getType(), + I.isVolatile(), I.getAlignment()); + Out << " = "; + Value *Operand = I.getOperand(0); + Constant *BitMask = 0; + if (IntegerType* ITy = dyn_cast(Operand->getType())) + if (!ITy->isPowerOf2ByteWidth()) + // We have a bit width that doesn't match an even power-of-2 byte + // size. Consequently we must & the value with the type's bit mask + BitMask = ConstantInt::get(ITy, ITy->getBitMask()); + if (BitMask) + Out << "(("; + writeOperand(Operand); + if (BitMask) { + Out << ") & "; + printConstant(BitMask, false); + Out << ")"; + } +} + +void CWriter::visitGetElementPtrInst(GetElementPtrInst &I) { + printGEPExpression(I.getPointerOperand(), gep_type_begin(I), + gep_type_end(I), false); +} + +void CWriter::visitVAArgInst(VAArgInst &I) { + Out << "va_arg(*(va_list*)"; + writeOperand(I.getOperand(0)); + Out << ", "; + printType(Out, I.getType()); + Out << ");\n "; +} + +void CWriter::visitInsertElementInst(InsertElementInst &I) { +#if 0 + Type *EltTy = I.getType()->getElementType(); + writeOperand(I.getOperand(0)); + Out << ";\n "; + Out << "(("; + printType(Out, PointerType::getUnqual(EltTy)); + Out << ")(&" << GetValueName(&I) << "))["; + writeOperand(I.getOperand(2)); + Out << "] = ("; + writeOperand(I.getOperand(1)); + Out << ")"; +#else + writeOperand(I.getOperand(0)); + Out << ";\n "; + Out << "__insert_element(&" << GetValueName(&I) << ", "; + writeOperand(I.getOperand(2)); + Out << ", "; + writeOperand(I.getOperand(1)); + Out << ")"; +#endif +} + +void CWriter::visitExtractElementInst(ExtractElementInst &I) { + // We know that our operand is not inlined. 
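+  // As with insertelement above, the extract is emitted as a call to the
+  // __extract_element() function that the user-supplied intrinsics header
+  // (e.g. examples/intrinsics/generic-16.h) provides; the #if 0 path below
+  // is the old C-backend approach of casting to a scalar pointer and
+  // indexing into it.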
+#if 0 + Out << "(("; + Type *EltTy = + cast(I.getOperand(0)->getType())->getElementType(); + printType(Out, PointerType::getUnqual(EltTy)); + Out << ")(&" << GetValueName(I.getOperand(0)) << "))["; + writeOperand(I.getOperand(1)); + Out << "]"; +#else + Out << "(__extract_element("; + writeOperand(I.getOperand(0)); + Out << ", "; + writeOperand(I.getOperand(1)); + Out << "))"; +#endif +} + +void CWriter::visitShuffleVectorInst(ShuffleVectorInst &SVI) { + printType(Out, SVI.getType()); + Out << "("; + VectorType *VT = SVI.getType(); + unsigned NumElts = VT->getNumElements(); + Type *EltTy = VT->getElementType(); + VectorType *OpTy = dyn_cast(SVI.getOperand(0)->getType()); + unsigned OpElts = OpTy->getNumElements(); + + for (unsigned i = 0; i != NumElts; ++i) { + if (i) Out << ", "; + int SrcVal = SVI.getMaskValue(i); + if ((unsigned)SrcVal >= 2*OpElts) { + Out << " 0/*undef*/ "; + } else { + Value *Op = SVI.getOperand((unsigned)SrcVal >= OpElts); + SrcVal &= OpElts - 1; + + if (isa(Op)) { + printConstant(cast(Op)->getOperand(SrcVal), + false); + } else if (isa(Op) || isa(Op)) { + Out << "0"; + } + else { + // Do an extractelement of this value from the appropriate input. + Out << "(("; + printType(Out, PointerType::getUnqual(EltTy)); + Out << ")(&" << GetValueName(Op) + << "))[" << SrcVal << "]"; + } + } + } + Out << ")"; +} + +void CWriter::visitInsertValueInst(InsertValueInst &IVI) { + // Start by copying the entire aggregate value into the result variable. + writeOperand(IVI.getOperand(0)); + Out << ";\n "; + + // Then do the insert to update the field. + Out << GetValueName(&IVI); + for (const unsigned *b = IVI.idx_begin(), *i = b, *e = IVI.idx_end(); + i != e; ++i) { + Type *IndexedTy = (b == i) ? IVI.getOperand(0)->getType() : + ExtractValueInst::getIndexedType(IVI.getOperand(0)->getType(), + makeArrayRef(b, i)); + if (IndexedTy->isArrayTy()) + Out << ".array[" << *i << "]"; + else + Out << ".field" << *i; + } + Out << " = "; + writeOperand(IVI.getOperand(1)); +} + +void CWriter::visitExtractValueInst(ExtractValueInst &EVI) { + Out << "("; + if (isa(EVI.getOperand(0))) { + // FIXME: need to handle these--a 0 initializer won't do... + assert(!isa(EVI.getType())); + Out << "("; + printType(Out, EVI.getType()); + Out << ") 0/*UNDEF*/"; + } else { + Out << GetValueName(EVI.getOperand(0)); + for (const unsigned *b = EVI.idx_begin(), *i = b, *e = EVI.idx_end(); + i != e; ++i) { + Type *IndexedTy = (b == i) ? 
EVI.getOperand(0)->getType() : + ExtractValueInst::getIndexedType(EVI.getOperand(0)->getType(), + makeArrayRef(b, i)); + if (IndexedTy->isArrayTy()) + Out << ".array[" << *i << "]"; + else + Out << ".field" << *i; + } + } + Out << ")"; +} + +void CWriter::visitAtomicRMWInst(AtomicRMWInst &AI) { + Out << "("; + Out << "__atomic_"; + switch (AI.getOperation()) { + default: llvm_unreachable("Unhandled case in visitAtomicRMWInst!"); + case AtomicRMWInst::Add: Out << "add"; break; + case AtomicRMWInst::Sub: Out << "sub"; break; + case AtomicRMWInst::Xchg: Out << "xchg"; break; + case AtomicRMWInst::And: Out << "and"; break; + case AtomicRMWInst::Nand: Out << "nand"; break; + case AtomicRMWInst::Or: Out << "or"; break; + case AtomicRMWInst::Xor: Out << "xor"; break; + case AtomicRMWInst::Min: Out << "min"; break; + case AtomicRMWInst::Max: Out << "max"; break; + case AtomicRMWInst::UMin: Out << "umin"; break; + case AtomicRMWInst::UMax: Out << "umax"; break; + } + Out << "("; + writeOperand(AI.getOperand(0)); + Out << ", "; + writeOperand(AI.getOperand(1)); + Out << "))"; +} + +void CWriter::visitAtomicCmpXchgInst(AtomicCmpXchgInst &ACXI) { + Out << "("; + Out << "__atomic_cmpxchg("; + writeOperand(ACXI.getPointerOperand()); + Out << ", "; + writeOperand(ACXI.getCompareOperand()); + Out << ", "; + writeOperand(ACXI.getNewValOperand()); + Out << "))"; +} + +/////////////////////////////////////////////////////////////////////////// +// SmearCleanupPass + +class SmearCleanupPass : public llvm::BasicBlockPass { +public: + SmearCleanupPass(llvm::Module *m, int width) + : BasicBlockPass(ID) { module = m; vectorWidth = width; } + + const char *getPassName() const { return "Smear Cleanup Pass"; } + bool runOnBasicBlock(llvm::BasicBlock &BB); + + static char ID; + llvm::Module *module; + int vectorWidth; +}; + + +char SmearCleanupPass::ID = 0; + + +static int +lChainLength(InsertElementInst *inst) { + int length = 0; + while (inst != NULL) { + ++length; + inst = dyn_cast(inst->getOperand(0)); + } + return length; +} + + +bool +SmearCleanupPass::runOnBasicBlock(llvm::BasicBlock &bb) { + bool modifiedAny = false; + + restart: + for (BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) { + InsertElementInst *insertInst = + dyn_cast(&*iter); + if (insertInst == NULL) + continue; + + // Only do this on the last insert in a chain... + if (lChainLength(insertInst) != vectorWidth) + continue; + + // FIXME: we only want to do this to vectors with width equal to + // the target vector width. But we can't easily get that here, so + // for now we at least avoid one case where we definitely don't + // want to do this. + VectorType *vt = dyn_cast(insertInst->getType()); + if (vt->getNumElements() == 1) + continue; + + Value *toMatch = NULL; + while (insertInst != NULL) { + Value *insertValue = insertInst->getOperand(1); + if (toMatch == NULL) + toMatch = insertValue; + else if (toMatch != insertValue) + goto not_equal; + + insertInst = + dyn_cast(insertInst->getOperand(0)); + } + assert(toMatch != NULL); + + { + // FIXME: generalize this/make it not so hard-coded? 
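+          // Every element of the chain inserts the same scalar value (a
+          // "smear"), so replace the final insertelement with a single call
+          // to the matching __smear_*() function; the now-dead inserts
+          // earlier in the chain are removed by the DCE pass that runs
+          // after this one.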
+ Type *matchType = toMatch->getType(); + const char *smearFuncName = NULL; + + switch (matchType->getTypeID()) { + case Type::FloatTyID: smearFuncName = "__smear_float"; break; + case Type::DoubleTyID: smearFuncName = "__smear_double"; break; + case Type::IntegerTyID: { + switch (cast(matchType)->getBitWidth()) { + case 8: smearFuncName = "__smear_i8"; break; + case 16: smearFuncName = "__smear_i16"; break; + case 32: smearFuncName = "__smear_i32"; break; + case 64: smearFuncName = "__smear_i64"; break; + } + } + default: break; + } + + if (smearFuncName != NULL) { + Function *smearFunc = module->getFunction(smearFuncName); + if (smearFunc == NULL) { + Constant *sf = + module->getOrInsertFunction(smearFuncName, iter->getType(), + matchType, NULL); + smearFunc = dyn_cast(sf); + assert(smearFunc != NULL); + smearFunc->setDoesNotThrow(true); + smearFunc->setDoesNotAccessMemory(true); + } + + assert(smearFunc != NULL); + Value *args[1] = { toMatch }; + ArrayRef argArray(&args[0], &args[1]); + Instruction *smearCall = + CallInst::Create(smearFunc, argArray, "smear", (Instruction *)NULL); + + ReplaceInstWithInst(iter, smearCall); + + modifiedAny = true; + goto restart; + } + } + not_equal: + ; + } + + return modifiedAny; +} + + +/////////////////////////////////////////////////////////////////////////// +// BitcastCleanupPass + +class BitcastCleanupPass : public llvm::BasicBlockPass { +public: + BitcastCleanupPass() + : BasicBlockPass(ID) { } + + const char *getPassName() const { return "Bitcast Cleanup Pass"; } + bool runOnBasicBlock(llvm::BasicBlock &BB); + + static char ID; +}; + +char BitcastCleanupPass::ID = 0; + +bool +BitcastCleanupPass::runOnBasicBlock(llvm::BasicBlock &bb) { + bool modifiedAny = false; + + restart: + for (BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) { + BitCastInst *bc = dyn_cast(&*iter); + if (bc == NULL) + continue; + + // We only care about bitcasts from integer types to vector types + if (!isa(bc->getType())) + continue; + + Value *Op = bc->getOperand(0); + if (isa(Op->getType())) + continue; + + BitCastInst *opBc = dyn_cast(Op); + if (opBc == NULL) Op->dump(); + assert(opBc != NULL); + + assert(isa(opBc->getOperand(0)->getType())); + Instruction *newBitCast = new BitCastInst(opBc->getOperand(0), bc->getType(), + "replacement_bc", (Instruction *)NULL); + ReplaceInstWithInst(iter, newBitCast); + modifiedAny = true; + goto restart; + } + return modifiedAny; +} + + +//===----------------------------------------------------------------------===// +// External Interface declaration +//===----------------------------------------------------------------------===// + +bool +WriteCXXFile(llvm::Module *module, const char *fn, int vectorWidth, + const char *includeName) { + PassManager pm; +#if 0 + if (const llvm::TargetData *td = targetMachine->getTargetData()) + pm.add(new llvm::TargetData(*td)); + else + pm.add(new llvm::TargetData(module)); +#endif + + int flags = 0; + std::string error; + tool_output_file *of = new tool_output_file(fn, error, flags); + if (error.size()) { + fprintf(stderr, "Error opening output file \"%s\".\n", fn); + return false; + } + + formatted_raw_ostream fos(of->os()); + + pm.add(createGCLoweringPass()); + pm.add(createLowerInvokePass()); + pm.add(createCFGSimplificationPass()); // clean up after lower invoke. 
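+    // The next two passes massage the IR into a form that maps more
+    // directly onto the generic vector calls that CWriter emits.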
+    pm.add(new SmearCleanupPass(module, vectorWidth));
+    pm.add(new BitcastCleanupPass);
+    pm.add(createDeadCodeEliminationPass()); // clean up after smear pass
+//CO    pm.add(createPrintModulePass(&fos));
+    pm.add(new CWriter(fos, includeName));
+    pm.add(createGCInfoDeleter());
+//CO    pm.add(createVerifierPass());
+
+    pm.run(*module);
+
+    return true;
+}
+
+#endif // LLVM_2_9
diff --git a/docs/ispc.txt b/docs/ispc.txt
index 7fcbddf3a09..011ec208ca6 100644
--- a/docs/ispc.txt
+++ b/docs/ispc.txt
@@ -56,6 +56,7 @@ Contents:
 + `Basic Command-line Options`_
 + `Selecting The Compilation Target`_
++ `Generating Generic C++ Output`_
 + `Selecting 32 or 64 Bit Addressing`_
 + `The Preprocessor`_
 + `Debugging`_
@@ -432,6 +433,65 @@ Intel® SSE2, use ``--target=sse2``.  (As with the other options in this
 section, see the output of ``ispc --help`` for a full list of supported
 targets.)
+Generating Generic C++ Output
+-----------------------------
+
+In addition to generating object files or assembly output for specific
+targets like SSE2, SSE4, and AVX, ``ispc`` can emit "generic" C++ code
+expressed in terms of vector types and operations that you then implement.
+
+As an example, consider the following simple ``ispc`` program:
+
+::
+
+    int foo(int i, int j) {
+        return (i < 0) ? 0 : i + j;
+    }
+
+If this program is compiled with the following command:
+
+::
+
+    ispc foo.ispc --emit-c++ --target=generic-4 -o foo.cpp
+
+Then ``foo()`` is compiled to the following C++ code (after various
+automatically-generated boilerplate code):
+
+::
+
+    __vec4_i32 foo(__vec4_i32 i_llvm_cbe, __vec4_i32 j_llvm_cbe,
+                   __vec4_i1 __mask_llvm_cbe) {
+        return (__select((__signed_less_than(i_llvm_cbe,
+                                             __vec4_i32 (0u, 0u, 0u, 0u))),
+                         __vec4_i32 (0u, 0u, 0u, 0u),
+                         (__add(i_llvm_cbe, j_llvm_cbe))));
+    }
+
+Note that the original computation has been expressed in terms of a number
+of vector types (e.g. ``__vec4_i32`` for a 4-wide vector of 32-bit integers
+and ``__vec4_i1`` for a 4-wide vector of boolean values) and in terms of
+vector operations on these types, like ``__add()`` and ``__select()``.
+
+You are then free to provide your own implementations of these types and
+functions.  For example, you might want to target a specific vector ISA, or
+you might want to instrument these functions for performance measurements.
+
+The file ``examples/intrinsics/sse4.h`` provides an example implementation
+of the 4-wide variants of the required functions, suitable for use with
+the ``generic-4`` target, and the file ``examples/intrinsics/generic-16.h``
+provides a straightforward C implementation of the 16-wide variants for
+the ``generic-16`` target.  There is not yet comprehensive documentation
+of these types and the functions that must be provided for them when the
+C++ target is used, but a review of those two files should provide the
+basic context.
+
+If you are using C++ source emission, you may also find the
+``--c++-include-file=<filename>`` command-line argument useful; it adds an
+``#include`` statement with the given filename at the top of the emitted
+C++ file, which makes it easy to pull in a particular implementation of
+the vector types and functions.
+
+
 
 Selecting 32 or 64 Bit Addressing
 ---------------------------------
diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h
new file mode 100644
index 00000000000..ea120abb8c0
--- /dev/null
+++ b/examples/intrinsics/generic-16.h
@@ -0,0 +1,1428 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include +#include + +#ifdef _MSC_VER +#define FORCEINLINE __forceinline +#define PRE_ALIGN(x) /*__declspec(align(x))*/ +#define POST_ALIGN(x) +#define roundf(x) (floorf(x + .5f)) +#define round(x) (floor(x + .5)) +#else +#define FORCEINLINE __attribute__((always_inline)) +#define PRE_ALIGN(x) +#define POST_ALIGN(x) __attribute__ ((aligned(x))) +#endif + +typedef float __vec1_f; +typedef double __vec1_d; +typedef int8_t __vec1_i8; +typedef int16_t __vec1_i16; +typedef int32_t __vec1_i32; +typedef int64_t __vec1_i64; + +struct __vec16_i1 { + __vec16_i1() { } + __vec16_i1(uint32_t v0, uint32_t v1, uint32_t v2, uint32_t v3, + uint32_t v4, uint32_t v5, uint32_t v6, uint32_t v7, + uint32_t v8, uint32_t v9, uint32_t v10, uint32_t v11, + uint32_t v12, uint32_t v13, uint32_t v14, uint32_t v15) { + v = ((v0 & 1) | + ((v1 & 1) << 1) | + ((v2 & 1) << 2) | + ((v3 & 1) << 3) | + ((v4 & 1) << 4) | + ((v5 & 1) << 5) | + ((v6 & 1) << 6) | + ((v7 & 1) << 7) | + ((v8 & 1) << 8) | + ((v9 & 1) << 9) | + ((v10 & 1) << 10) | + ((v11 & 1) << 11) | + ((v12 & 1) << 12) | + ((v13 & 1) << 13) | + ((v14 & 1) << 14) | + ((v15 & 1) << 15)); + } + + uint16_t v; +}; + + +template +struct vec16 { + vec16() { } + vec16(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, + T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) { + v[0] = v0; v[1] = v1; v[2] = v2; v[3] = v3; + v[4] = v4; v[5] = v5; v[6] = v6; v[7] = v7; + v[8] = v8; v[9] = v9; v[10] = v10; v[11] = v11; + v[12] = v12; v[13] = v13; v[14] = v14; v[15] = v15; + } + T v[16]; +}; + +PRE_ALIGN(64) struct __vec16_f : public vec16 { + __vec16_f() { } + __vec16_f(float v0, float v1, float v2, float v3, + float v4, float v5, float v6, float v7, + float v8, float v9, float v10, float v11, + float v12, float v13, float v14, float v15) + : vec16(v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15) { } + +} POST_ALIGN(64); + +PRE_ALIGN(128) struct __vec16_d : public vec16 { + __vec16_d() { } + __vec16_d(double v0, double v1, double v2, double v3, + double v4, double v5, double v6, double v7, + double v8, 
double v9, double v10, double v11, + double v12, double v13, double v14, double v15) + : vec16(v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15) { } + +} POST_ALIGN(128); + +PRE_ALIGN(16) struct __vec16_i8 : public vec16 { + __vec16_i8() { } + __vec16_i8(int8_t v0, int8_t v1, int8_t v2, int8_t v3, + int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, + int8_t v12, int8_t v13, int8_t v14, int8_t v15) + : vec16(v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15) { } +} POST_ALIGN(16); + +PRE_ALIGN(32) struct __vec16_i16 : public vec16 { + __vec16_i16() { } + __vec16_i16(int16_t v0, int16_t v1, int16_t v2, int16_t v3, + int16_t v4, int16_t v5, int16_t v6, int16_t v7, + int16_t v8, int16_t v9, int16_t v10, int16_t v11, + int16_t v12, int16_t v13, int16_t v14, int16_t v15) + : vec16(v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15) { } +} POST_ALIGN(32); + +PRE_ALIGN(64) struct __vec16_i32 : public vec16 { + __vec16_i32() { } + __vec16_i32(int32_t v0, int32_t v1, int32_t v2, int32_t v3, + int32_t v4, int32_t v5, int32_t v6, int32_t v7, + int32_t v8, int32_t v9, int32_t v10, int32_t v11, + int32_t v12, int32_t v13, int32_t v14, int32_t v15) + : vec16(v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15) { } +} POST_ALIGN(64); + +static inline int32_t __extract_element(__vec16_i32, int); + +PRE_ALIGN(128) struct __vec16_i64 : public vec16 { + __vec16_i64() { } + __vec16_i64(int64_t v0, int64_t v1, int64_t v2, int64_t v3, + int64_t v4, int64_t v5, int64_t v6, int64_t v7, + int64_t v8, int64_t v9, int64_t v10, int64_t v11, + int64_t v12, int64_t v13, int64_t v14, int64_t v15) + : vec16(v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15) { } +} POST_ALIGN(128); + +/////////////////////////////////////////////////////////////////////////// +// macros... 
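+// These macros stamp out the element-wise implementations of the operations
+// that the emitted C++ code calls.  For example,
+//
+//     BINARY_OP(__vec16_f, __add, +)
+//
+// expands to (roughly):
+//
+//     static FORCEINLINE __vec16_f __add(__vec16_f a, __vec16_f b) {
+//         __vec16_f ret;
+//         for (int i = 0; i < 16; ++i)
+//             ret.v[i] = a.v[i] + b.v[i];
+//         return ret;
+//     }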
+ +#define UNARY_OP(TYPE, NAME, OP) \ +static FORCEINLINE TYPE NAME(TYPE v) { \ + TYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret.v[i] = OP(v.v[i]); \ + return ret; \ +} + +#define BINARY_OP(TYPE, NAME, OP) \ +static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ + TYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret.v[i] = a.v[i] OP b.v[i]; \ + return ret; \ +} + +#define BINARY_OP_CAST(TYPE, CAST, NAME, OP) \ +static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ + TYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret.v[i] = (CAST)(a.v[i]) OP (CAST)(b.v[i]); \ + return ret; \ +} + +#define BINARY_OP_FUNC(TYPE, NAME, FUNC) \ +static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ + TYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret.v[i] = FUNC(a.v[i], b.v[i]); \ + return ret; \ +} + +#define CMP_OP(TYPE, CAST, NAME, OP) \ +static FORCEINLINE __vec16_i1 NAME(TYPE a, TYPE b) { \ + __vec16_i1 ret; \ + ret.v = 0; \ + for (int i = 0; i < 16; ++i) \ + ret.v |= ((CAST)(a.v[i]) OP (CAST)(b.v[i])) << i; \ + return ret; \ +} + +#define INSERT_EXTRACT(VTYPE, STYPE) \ +static FORCEINLINE STYPE __extract_element(VTYPE v, int index) { \ + return ((STYPE *)&v)[index]; \ +} \ +static FORCEINLINE void __insert_element(VTYPE *v, int index, STYPE val) { \ + ((STYPE *)v)[index] = val; \ +} + +#define LOAD_STORE(VTYPE, STYPE) \ +static FORCEINLINE VTYPE __load(VTYPE *p, int align) { \ + STYPE *ptr = (STYPE *)p; \ + VTYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret.v[i] = ptr[i]; \ + return ret; \ +} \ +static FORCEINLINE void __store(VTYPE *p, VTYPE v, int align) { \ + STYPE *ptr = (STYPE *)p; \ + for (int i = 0; i < 16; ++i) \ + ptr[i] = v.v[i]; \ +} + +#define REDUCE_ADD(TYPE, VTYPE, NAME) \ +static FORCEINLINE TYPE NAME(VTYPE v) { \ + TYPE ret = v.v[0]; \ + for (int i = 1; i < 16; ++i) \ + ret = ret + v.v[i]; \ + return ret; \ +} + +#define REDUCE_MINMAX(TYPE, VTYPE, NAME, OP) \ +static FORCEINLINE TYPE NAME(VTYPE v) { \ + TYPE ret = v.v[0]; \ + for (int i = 1; i < 16; ++i) \ + ret = (ret OP (TYPE)v.v[i]) ? 
ret : (TYPE)v.v[i]; \ + return ret; \ +} + +#define SELECT(TYPE) \ +static FORCEINLINE TYPE __select(__vec16_i1 mask, TYPE a, TYPE b) { \ + TYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret.v[i] = (mask.v & (1<v &= ~(1 << index); + else + vec->v |= (1 << index); +} + +static FORCEINLINE __vec16_i1 __load(__vec16_i1 *p, int align) { + uint16_t *ptr = (uint16_t *)p; + __vec16_i1 r; + r.v = *ptr; + return r; +} + +static FORCEINLINE void __store(__vec16_i1 *p, __vec16_i1 v, int align) { + uint16_t *ptr = (uint16_t *)p; + *ptr = v.v; +} + +/////////////////////////////////////////////////////////////////////////// +// int8 + +BINARY_OP(__vec16_i8, __add, +) +BINARY_OP(__vec16_i8, __sub, -) +BINARY_OP(__vec16_i8, __mul, *) + +BINARY_OP(__vec16_i8, __or, |) +BINARY_OP(__vec16_i8, __and, &) +BINARY_OP(__vec16_i8, __xor, ^) +BINARY_OP(__vec16_i8, __shl, <<) + +BINARY_OP_CAST(__vec16_i8, uint8_t, __udiv, /) +BINARY_OP_CAST(__vec16_i8, int8_t, __sdiv, /) + +BINARY_OP_CAST(__vec16_i8, uint8_t, __urem, %) +BINARY_OP_CAST(__vec16_i8, int8_t, __srem, %) +BINARY_OP_CAST(__vec16_i8, uint8_t, __lshr, >>) +BINARY_OP_CAST(__vec16_i8, int8_t, __ashr, >>) + +CMP_OP(__vec16_i8, int8_t, __equal, ==) +CMP_OP(__vec16_i8, int8_t, __not_equal, !=) +CMP_OP(__vec16_i8, uint8_t, __unsigned_less_equal, <=) +CMP_OP(__vec16_i8, int8_t, __signed_less_equal, <=) +CMP_OP(__vec16_i8, uint8_t, __unsigned_greater_equal, >=) +CMP_OP(__vec16_i8, int8_t, __signed_greater_equal, >=) +CMP_OP(__vec16_i8, uint8_t, __unsigned_less_than, <) +CMP_OP(__vec16_i8, int8_t, __signed_less_than, <) +CMP_OP(__vec16_i8, uint8_t, __unsigned_greater_than, >) +CMP_OP(__vec16_i8, int8_t, __signed_greater_than, >) + +SELECT(__vec16_i8) +INSERT_EXTRACT(__vec16_i8, int8_t) +SMEAR(__vec16_i8, i8, int8_t) +BROADCAST(__vec16_i8, i8, int8_t) +ROTATE(__vec16_i8, i8, int8_t) +SHUFFLES(__vec16_i8, i8, int8_t) +LOAD_STORE(__vec16_i8, int8_t) + +/////////////////////////////////////////////////////////////////////////// +// int16 + +BINARY_OP(__vec16_i16, __add, +) +BINARY_OP(__vec16_i16, __sub, -) +BINARY_OP(__vec16_i16, __mul, *) + +BINARY_OP(__vec16_i16, __or, |) +BINARY_OP(__vec16_i16, __and, &) +BINARY_OP(__vec16_i16, __xor, ^) +BINARY_OP(__vec16_i16, __shl, <<) + +BINARY_OP_CAST(__vec16_i16, uint16_t, __udiv, /) +BINARY_OP_CAST(__vec16_i16, int16_t, __sdiv, /) + +BINARY_OP_CAST(__vec16_i16, uint16_t, __urem, %) +BINARY_OP_CAST(__vec16_i16, int16_t, __srem, %) +BINARY_OP_CAST(__vec16_i16, uint16_t, __lshr, >>) +BINARY_OP_CAST(__vec16_i16, int16_t, __ashr, >>) + +CMP_OP(__vec16_i16, int16_t, __equal, ==) +CMP_OP(__vec16_i16, int16_t, __not_equal, !=) +CMP_OP(__vec16_i16, uint16_t, __unsigned_less_equal, <=) +CMP_OP(__vec16_i16, int16_t, __signed_less_equal, <=) +CMP_OP(__vec16_i16, uint16_t, __unsigned_greater_equal, >=) +CMP_OP(__vec16_i16, int16_t, __signed_greater_equal, >=) +CMP_OP(__vec16_i16, uint16_t, __unsigned_less_than, <) +CMP_OP(__vec16_i16, int16_t, __signed_less_than, <) +CMP_OP(__vec16_i16, uint16_t, __unsigned_greater_than, >) +CMP_OP(__vec16_i16, int16_t, __signed_greater_than, >) + +SELECT(__vec16_i16) +INSERT_EXTRACT(__vec16_i16, int16_t) +SMEAR(__vec16_i16, i16, int16_t) +BROADCAST(__vec16_i16, i16, int16_t) +ROTATE(__vec16_i16, i16, int16_t) +SHUFFLES(__vec16_i16, i16, int16_t) +LOAD_STORE(__vec16_i16, int16_t) + +/////////////////////////////////////////////////////////////////////////// +// int32 + +BINARY_OP(__vec16_i32, __add, +) +BINARY_OP(__vec16_i32, __sub, -) +BINARY_OP(__vec16_i32, __mul, *) + +BINARY_OP(__vec16_i32, __or, |) 
+BINARY_OP(__vec16_i32, __and, &) +BINARY_OP(__vec16_i32, __xor, ^) +BINARY_OP(__vec16_i32, __shl, <<) + +BINARY_OP_CAST(__vec16_i32, uint32_t, __udiv, /) +BINARY_OP_CAST(__vec16_i32, int32_t, __sdiv, /) + +BINARY_OP_CAST(__vec16_i32, uint32_t, __urem, %) +BINARY_OP_CAST(__vec16_i32, int32_t, __srem, %) +BINARY_OP_CAST(__vec16_i32, uint32_t, __lshr, >>) +BINARY_OP_CAST(__vec16_i32, int32_t, __ashr, >>) + +CMP_OP(__vec16_i32, int32_t, __equal, ==) +CMP_OP(__vec16_i32, int32_t, __not_equal, !=) +CMP_OP(__vec16_i32, uint32_t, __unsigned_less_equal, <=) +CMP_OP(__vec16_i32, int32_t, __signed_less_equal, <=) +CMP_OP(__vec16_i32, uint32_t, __unsigned_greater_equal, >=) +CMP_OP(__vec16_i32, int32_t, __signed_greater_equal, >=) +CMP_OP(__vec16_i32, uint32_t, __unsigned_less_than, <) +CMP_OP(__vec16_i32, int32_t, __signed_less_than, <) +CMP_OP(__vec16_i32, uint32_t, __unsigned_greater_than, >) +CMP_OP(__vec16_i32, int32_t, __signed_greater_than, >) + +SELECT(__vec16_i32) +INSERT_EXTRACT(__vec16_i32, int32_t) +SMEAR(__vec16_i32, i32, int32_t) +BROADCAST(__vec16_i32, i32, int32_t) +ROTATE(__vec16_i32, i32, int32_t) +SHUFFLES(__vec16_i32, i32, int32_t) +LOAD_STORE(__vec16_i32, int32_t) + +/////////////////////////////////////////////////////////////////////////// +// int64 + +BINARY_OP(__vec16_i64, __add, +) +BINARY_OP(__vec16_i64, __sub, -) +BINARY_OP(__vec16_i64, __mul, *) + +BINARY_OP(__vec16_i64, __or, |) +BINARY_OP(__vec16_i64, __and, &) +BINARY_OP(__vec16_i64, __xor, ^) +BINARY_OP(__vec16_i64, __shl, <<) + +BINARY_OP_CAST(__vec16_i64, uint64_t, __udiv, /) +BINARY_OP_CAST(__vec16_i64, int64_t, __sdiv, /) + +BINARY_OP_CAST(__vec16_i64, uint64_t, __urem, %) +BINARY_OP_CAST(__vec16_i64, int64_t, __srem, %) +BINARY_OP_CAST(__vec16_i64, uint64_t, __lshr, >>) +BINARY_OP_CAST(__vec16_i64, int64_t, __ashr, >>) + +CMP_OP(__vec16_i64, int64_t, __equal, ==) +CMP_OP(__vec16_i64, int64_t, __not_equal, !=) +CMP_OP(__vec16_i64, uint64_t, __unsigned_less_equal, <=) +CMP_OP(__vec16_i64, int64_t, __signed_less_equal, <=) +CMP_OP(__vec16_i64, uint64_t, __unsigned_greater_equal, >=) +CMP_OP(__vec16_i64, int64_t, __signed_greater_equal, >=) +CMP_OP(__vec16_i64, uint64_t, __unsigned_less_than, <) +CMP_OP(__vec16_i64, int64_t, __signed_less_than, <) +CMP_OP(__vec16_i64, uint64_t, __unsigned_greater_than, >) +CMP_OP(__vec16_i64, int64_t, __signed_greater_than, >) + +SELECT(__vec16_i64) +INSERT_EXTRACT(__vec16_i64, int64_t) +SMEAR(__vec16_i64, i64, int64_t) +BROADCAST(__vec16_i64, i64, int64_t) +ROTATE(__vec16_i64, i64, int64_t) +SHUFFLES(__vec16_i64, i64, int64_t) +LOAD_STORE(__vec16_i64, int64_t) + +/////////////////////////////////////////////////////////////////////////// +// float + +BINARY_OP(__vec16_f, __add, +) +BINARY_OP(__vec16_f, __sub, -) +BINARY_OP(__vec16_f, __mul, *) +BINARY_OP(__vec16_f, __div, /) + +CMP_OP(__vec16_f, float, __equal, ==) +CMP_OP(__vec16_f, float, __not_equal, !=) +CMP_OP(__vec16_f, float, __less_than, <) +CMP_OP(__vec16_f, float, __less_equal, <=) +CMP_OP(__vec16_f, float, __greater_than, >) +CMP_OP(__vec16_f, float, __greater_equal, >=) + +static FORCEINLINE __vec16_i1 __ordered(__vec16_f a, __vec16_f b) { + __vec16_i1 ret; + ret.v = 0; + for (int i = 0; i < 16; ++i) + ret.v |= ((a.v[i] == a.v[i]) && (b.v[i] == b.v[i])) ? 
(1 << i) : 0; + return ret; +} + +#if 0 + case Instruction::FRem: intrinsic = "__frem"; break; +#endif + +SELECT(__vec16_f) +INSERT_EXTRACT(__vec16_f, float) +SMEAR(__vec16_f, float, float) +BROADCAST(__vec16_f, float, float) +ROTATE(__vec16_f, float, float) +SHUFFLES(__vec16_f, float, float) +LOAD_STORE(__vec16_f, float) + +/////////////////////////////////////////////////////////////////////////// +// double + +BINARY_OP(__vec16_d, __add, +) +BINARY_OP(__vec16_d, __sub, -) +BINARY_OP(__vec16_d, __mul, *) +BINARY_OP(__vec16_d, __div, /) + +CMP_OP(__vec16_d, double, __equal, ==) +CMP_OP(__vec16_d, double, __not_equal, !=) +CMP_OP(__vec16_d, double, __less_than, <) +CMP_OP(__vec16_d, double, __less_equal, <=) +CMP_OP(__vec16_d, double, __greater_than, >) +CMP_OP(__vec16_d, double, __greater_equal, >=) + +static FORCEINLINE __vec16_i1 __ordered(__vec16_d a, __vec16_d b) { + __vec16_i1 ret; + ret.v = 0; + for (int i = 0; i < 16; ++i) + ret.v |= ((a.v[i] == a.v[i]) && (b.v[i] == b.v[i])) ? (1 << i) : 0; + return ret; +} + +#if 0 + case Instruction::FRem: intrinsic = "__frem"; break; +#endif + +SELECT(__vec16_d) +INSERT_EXTRACT(__vec16_d, double) +SMEAR(__vec16_d, double, double) +BROADCAST(__vec16_d, double, double) +ROTATE(__vec16_d, double, double) +SHUFFLES(__vec16_d, double, double) +LOAD_STORE(__vec16_d, double) + +/////////////////////////////////////////////////////////////////////////// +// casts + + +#define CAST(TO, STO, FROM, SFROM, FUNC) \ +static FORCEINLINE TO FUNC(TO, FROM val) { \ + TO ret; \ + for (int i = 0; i < 16; ++i) \ + ret.v[i] = (STO)((SFROM)(val.v[i])); \ + return ret; \ +} + +// sign extension conversions +CAST(__vec16_i64, int64_t, __vec16_i32, int32_t, __cast_sext) +CAST(__vec16_i64, int64_t, __vec16_i16, int16_t, __cast_sext) +CAST(__vec16_i64, int64_t, __vec16_i8, int8_t, __cast_sext) +CAST(__vec16_i32, int32_t, __vec16_i16, int16_t, __cast_sext) +CAST(__vec16_i32, int32_t, __vec16_i8, int8_t, __cast_sext) +CAST(__vec16_i16, int16_t, __vec16_i8, int8_t, __cast_sext) + +#define CAST_SEXT_I1(TYPE) \ +static FORCEINLINE TYPE __cast_sext(TYPE, __vec16_i1 v) { \ + TYPE ret; \ + for (int i = 0; i < 16; ++i) { \ + ret.v[i] = 0; \ + if (v.v & (1 << i)) \ + ret.v[i] = ~ret.v[i]; \ + } \ + return ret; \ +} + +CAST_SEXT_I1(__vec16_i8) +CAST_SEXT_I1(__vec16_i16) +CAST_SEXT_I1(__vec16_i32) +CAST_SEXT_I1(__vec16_i64) + +// zero extension +CAST(__vec16_i64, uint64_t, __vec16_i32, uint32_t, __cast_zext) +CAST(__vec16_i64, uint64_t, __vec16_i16, uint16_t, __cast_zext) +CAST(__vec16_i64, uint64_t, __vec16_i8, uint8_t, __cast_zext) +CAST(__vec16_i32, uint32_t, __vec16_i16, uint16_t, __cast_zext) +CAST(__vec16_i32, uint32_t, __vec16_i8, uint8_t, __cast_zext) +CAST(__vec16_i16, uint16_t, __vec16_i8, uint8_t, __cast_zext) + +#define CAST_ZEXT_I1(TYPE) \ +static FORCEINLINE TYPE __cast_zext(TYPE, __vec16_i1 v) { \ + TYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret.v[i] = (v.v & (1 << i)) ? 
1 : 0; \ + return ret; \ +} + +CAST_ZEXT_I1(__vec16_i8) +CAST_ZEXT_I1(__vec16_i16) +CAST_ZEXT_I1(__vec16_i32) +CAST_ZEXT_I1(__vec16_i64) + +// truncations +CAST(__vec16_i32, int32_t, __vec16_i64, int64_t, __cast_trunc) +CAST(__vec16_i16, int16_t, __vec16_i64, int64_t, __cast_trunc) +CAST(__vec16_i8, int8_t, __vec16_i64, int64_t, __cast_trunc) +CAST(__vec16_i16, int16_t, __vec16_i32, int32_t, __cast_trunc) +CAST(__vec16_i8, int8_t, __vec16_i32, int32_t, __cast_trunc) +CAST(__vec16_i8, int8_t, __vec16_i16, int16_t, __cast_trunc) + +// signed int to float/double +CAST(__vec16_f, float, __vec16_i8, int8_t, __cast_sitofp) +CAST(__vec16_f, float, __vec16_i16, int16_t, __cast_sitofp) +CAST(__vec16_f, float, __vec16_i32, int32_t, __cast_sitofp) +CAST(__vec16_f, float, __vec16_i64, int64_t, __cast_sitofp) +CAST(__vec16_d, double, __vec16_i8, int8_t, __cast_sitofp) +CAST(__vec16_d, double, __vec16_i16, int16_t, __cast_sitofp) +CAST(__vec16_d, double, __vec16_i32, int32_t, __cast_sitofp) +CAST(__vec16_d, double, __vec16_i64, int64_t, __cast_sitofp) + +// unsigned int to float/double +CAST(__vec16_f, float, __vec16_i8, uint8_t, __cast_uitofp) +CAST(__vec16_f, float, __vec16_i16, uint16_t, __cast_uitofp) +CAST(__vec16_f, float, __vec16_i32, uint32_t, __cast_uitofp) +CAST(__vec16_f, float, __vec16_i64, uint64_t, __cast_uitofp) +CAST(__vec16_d, double, __vec16_i8, uint8_t, __cast_uitofp) +CAST(__vec16_d, double, __vec16_i16, uint16_t, __cast_uitofp) +CAST(__vec16_d, double, __vec16_i32, uint32_t, __cast_uitofp) +CAST(__vec16_d, double, __vec16_i64, uint64_t, __cast_uitofp) + +static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i1 v) { + __vec16_f ret; + for (int i = 0; i < 16; ++i) + ret.v[i] = (v.v & (1 << i)) ? 1. : 0.; + return ret; +} + +// float/double to signed int +CAST(__vec16_i8, int8_t, __vec16_f, float, __cast_fptosi) +CAST(__vec16_i16, int16_t, __vec16_f, float, __cast_fptosi) +CAST(__vec16_i32, int32_t, __vec16_f, float, __cast_fptosi) +CAST(__vec16_i64, int64_t, __vec16_f, float, __cast_fptosi) +CAST(__vec16_i8, int8_t, __vec16_d, double, __cast_fptosi) +CAST(__vec16_i16, int16_t, __vec16_d, double, __cast_fptosi) +CAST(__vec16_i32, int32_t, __vec16_d, double, __cast_fptosi) +CAST(__vec16_i64, int64_t, __vec16_d, double, __cast_fptosi) + +// float/double to unsigned int +CAST(__vec16_i8, uint8_t, __vec16_f, float, __cast_fptoui) +CAST(__vec16_i16, uint16_t, __vec16_f, float, __cast_fptoui) +CAST(__vec16_i32, uint32_t, __vec16_f, float, __cast_fptoui) +CAST(__vec16_i64, uint64_t, __vec16_f, float, __cast_fptoui) +CAST(__vec16_i8, uint8_t, __vec16_d, double, __cast_fptoui) +CAST(__vec16_i16, uint16_t, __vec16_d, double, __cast_fptoui) +CAST(__vec16_i32, uint32_t, __vec16_d, double, __cast_fptoui) +CAST(__vec16_i64, uint64_t, __vec16_d, double, __cast_fptoui) + +// float/double conversions +CAST(__vec16_f, float, __vec16_d, double, __cast_fptrunc) +CAST(__vec16_d, double, __vec16_f, float, __cast_fpext) + +typedef union { + int32_t i32; + float f; + int64_t i64; + double d; +} BitcastUnion; + +#define CAST_BITS(TO, TO_ELT, FROM, FROM_ELT) \ +static FORCEINLINE TO __cast_bits(TO, FROM val) { \ + TO r; \ + for (int i = 0; i < 16; ++i) { \ + BitcastUnion u; \ + u.FROM_ELT = val.v[i]; \ + r.v[i] = u.TO_ELT; \ + } \ + return r; \ +} + +CAST_BITS(__vec16_f, f, __vec16_i32, i32) +CAST_BITS(__vec16_i32, i32, __vec16_f, f) +CAST_BITS(__vec16_d, d, __vec16_i64, i64) +CAST_BITS(__vec16_i64, i64, __vec16_d, d) + +#define CAST_BITS_SCALAR(TO, FROM) \ +static FORCEINLINE TO __cast_bits(TO, FROM v) 
{ \ + union { \ + TO to; \ + FROM from; \ + } u; \ + u.from = v; \ + return u.to; \ +} + +CAST_BITS_SCALAR(uint32_t, float) +CAST_BITS_SCALAR(int32_t, float) +CAST_BITS_SCALAR(float, uint32_t) +CAST_BITS_SCALAR(float, int32_t) +CAST_BITS_SCALAR(uint64_t, double) +CAST_BITS_SCALAR(int64_t, double) +CAST_BITS_SCALAR(double, uint64_t) +CAST_BITS_SCALAR(double, int64_t) + +/////////////////////////////////////////////////////////////////////////// +// various math functions + +static FORCEINLINE void __fastmath() { +} + +static FORCEINLINE float __round_uniform_float(float v) { + return roundf(v); +} + +static FORCEINLINE float __floor_uniform_float(float v) { + return floorf(v); +} + +static FORCEINLINE float __ceil_uniform_float(float v) { + return ceilf(v); +} + +static FORCEINLINE double __round_uniform_double(double v) { + return round(v); +} + +static FORCEINLINE double __floor_uniform_double(double v) { + return floor(v); +} + +static FORCEINLINE double __ceil_uniform_double(double v) { + return ceil(v); +} + +UNARY_OP(__vec16_f, __round_varying_float, roundf) +UNARY_OP(__vec16_f, __floor_varying_float, floorf) +UNARY_OP(__vec16_f, __ceil_varying_float, ceilf) +UNARY_OP(__vec16_d, __round_varying_double, round) +UNARY_OP(__vec16_d, __floor_varying_double, floor) +UNARY_OP(__vec16_d, __ceil_varying_double, ceil) + +// min/max + +static FORCEINLINE float __min_uniform_float(float a, float b) { return (ab) ? a : b; } +static FORCEINLINE double __min_uniform_double(double a, double b) { return (ab) ? a : b; } + +static FORCEINLINE int32_t __min_uniform_int32(int32_t a, int32_t b) { return (ab) ? a : b; } +static FORCEINLINE int32_t __min_uniform_uint32(uint32_t a, uint32_t b) { return (ab) ? a : b; } + +static FORCEINLINE int64_t __min_uniform_int64(int64_t a, int64_t b) { return (ab) ? a : b; } +static FORCEINLINE int64_t __min_uniform_uint64(uint64_t a, uint64_t b) { return (ab) ? 
a : b; } + + +BINARY_OP_FUNC(__vec16_f, __max_varying_float, __max_uniform_float) +BINARY_OP_FUNC(__vec16_f, __min_varying_float, __min_uniform_float) +BINARY_OP_FUNC(__vec16_d, __max_varying_double, __max_uniform_double) +BINARY_OP_FUNC(__vec16_d, __min_varying_double, __min_uniform_double) + +BINARY_OP_FUNC(__vec16_i32, __max_varying_int32, __max_uniform_int32) +BINARY_OP_FUNC(__vec16_i32, __min_varying_int32, __min_uniform_int32) +BINARY_OP_FUNC(__vec16_i32, __max_varying_uint32, __max_uniform_uint32) +BINARY_OP_FUNC(__vec16_i32, __min_varying_uint32, __min_uniform_uint32) + +BINARY_OP_FUNC(__vec16_i64, __max_varying_int64, __max_uniform_int64) +BINARY_OP_FUNC(__vec16_i64, __min_varying_int64, __min_uniform_int64) +BINARY_OP_FUNC(__vec16_i64, __max_varying_uint64, __max_uniform_uint64) +BINARY_OP_FUNC(__vec16_i64, __min_varying_uint64, __min_uniform_uint64) + +// sqrt/rsqrt/rcp + +static FORCEINLINE float __rsqrt_uniform_float(float v) { + return 1.f / sqrtf(v); +} + +static FORCEINLINE float __rcp_uniform_float(float v) { + return 1.f / v; +} + +static FORCEINLINE float __sqrt_uniform_float(float v) { + return sqrtf(v); +} + +static FORCEINLINE double __sqrt_uniform_double(double v) { + return sqrt(v); +} + +UNARY_OP(__vec16_f, __rcp_varying_float, __rcp_uniform_float) +UNARY_OP(__vec16_f, __rsqrt_varying_float, __rsqrt_uniform_float) +UNARY_OP(__vec16_f, __sqrt_varying_float, __sqrt_uniform_float) +UNARY_OP(__vec16_d, __sqrt_varying_double, __sqrt_uniform_double) + +/////////////////////////////////////////////////////////////////////////// +// bit ops + +static FORCEINLINE int32_t __popcnt_int32(uint32_t v) { + int count = 0; + for (; v != 0; v >>= 1) + count += (v & 1); + return count; +} + +static FORCEINLINE int32_t __popcnt_int64(uint64_t v) { + int count = 0; + for (; v != 0; v >>= 1) + count += (v & 1); + return count; +} + +static FORCEINLINE int32_t __count_trailing_zeros_i32(uint32_t v) { + if (v == 0) + return 32; + + int count = 0; + while ((v & 1) == 0) { + ++count; + v >>= 1; + } + return count; +} + +static FORCEINLINE int64_t __count_trailing_zeros_i64(uint64_t v) { + if (v == 0) + return 64; + + int count = 0; + while ((v & 1) == 0) { + ++count; + v >>= 1; + } + return count; +} + +static FORCEINLINE int32_t __count_leading_zeros_i32(uint32_t v) { + if (v == 0) + return 32; + + int count = 0; + while ((v & (1<<31)) == 0) { + ++count; + v <<= 1; + } + return count; +} + +static FORCEINLINE int64_t __count_leading_zeros_i64(uint64_t v) { + if (v == 0) + return 64; + + int count = 0; + while ((v & (1ull<<63)) == 0) { + ++count; + v <<= 1; + } + return count; +} + +/////////////////////////////////////////////////////////////////////////// +// reductions + +REDUCE_ADD(float, __vec16_f, __reduce_add_float) +REDUCE_MINMAX(float, __vec16_f, __reduce_min_float, <) +REDUCE_MINMAX(float, __vec16_f, __reduce_max_float, >) + +REDUCE_ADD(double, __vec16_d, __reduce_add_double) +REDUCE_MINMAX(double, __vec16_d, __reduce_min_double, <) +REDUCE_MINMAX(double, __vec16_d, __reduce_max_double, >) + +REDUCE_ADD(uint32_t, __vec16_i32, __reduce_add_int32) +REDUCE_MINMAX(int32_t, __vec16_i32, __reduce_min_int32, <) +REDUCE_MINMAX(int32_t, __vec16_i32, __reduce_max_int32, >) + +REDUCE_ADD(uint32_t, __vec16_i32, __reduce_add_uint32) +REDUCE_MINMAX(uint32_t, __vec16_i32, __reduce_min_uint32, <) +REDUCE_MINMAX(uint32_t, __vec16_i32, __reduce_max_uint32, >) + +REDUCE_ADD(uint64_t, __vec16_i64, __reduce_add_int64) +REDUCE_MINMAX(int64_t, __vec16_i64, __reduce_min_int64, <) +REDUCE_MINMAX(int64_t, 
__vec16_i64, __reduce_max_int64, >) + +REDUCE_ADD(uint64_t, __vec16_i64, __reduce_add_uint64) +REDUCE_MINMAX(uint64_t, __vec16_i64, __reduce_min_uint64, <) +REDUCE_MINMAX(uint64_t, __vec16_i64, __reduce_max_uint64, >) + +/////////////////////////////////////////////////////////////////////////// +// masked load/store + +static FORCEINLINE __vec16_i8 __masked_load_8(unsigned char *p, + __vec16_i1 mask) { + __vec16_i8 ret; + int8_t *ptr = (int8_t *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ret.v[i] = ptr[i]; + return ret; +} + +static FORCEINLINE __vec16_i16 __masked_load_16(unsigned char *p, + __vec16_i1 mask) { + __vec16_i16 ret; + int16_t *ptr = (int16_t *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ret.v[i] = ptr[i]; + return ret; +} + +static FORCEINLINE __vec16_i32 __masked_load_32(unsigned char *p, + __vec16_i1 mask) { + __vec16_i32 ret; + int32_t *ptr = (int32_t *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ret.v[i] = ptr[i]; + return ret; +} + +static FORCEINLINE __vec16_i64 __masked_load_64(unsigned char *p, + __vec16_i1 mask) { + __vec16_i64 ret; + int64_t *ptr = (int64_t *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ret.v[i] = ptr[i]; + return ret; +} + +static FORCEINLINE void __masked_store_8(unsigned char *p, __vec16_i8 val, + __vec16_i1 mask) { + int8_t *ptr = (int8_t *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val.v[i]; +} + +static FORCEINLINE void __masked_store_16(unsigned char *p, __vec16_i16 val, + __vec16_i1 mask) { + int16_t *ptr = (int16_t *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val.v[i]; +} + +static FORCEINLINE void __masked_store_32(unsigned char *p, __vec16_i32 val, + __vec16_i1 mask) { + int32_t *ptr = (int32_t *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val.v[i]; +} + +static FORCEINLINE void __masked_store_64(unsigned char *p, __vec16_i64 val, + __vec16_i1 mask) { + int64_t *ptr = (int64_t *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val.v[i]; +} + +/////////////////////////////////////////////////////////////////////////// +// gather/scatter + +// offsets * offsetScale is in bytes (for all of these) + +#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ +static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE offsets, uint32_t scale,\ + __vec16_i1 mask) { \ + VTYPE ret; \ + int8_t *base = (int8_t *)b; \ + for (int i = 0; i < 16; ++i) \ + if ((mask.v & (1 << i)) != 0) { \ + STYPE *ptr = (STYPE *)(base + scale * offsets.v[i]); \ + ret.v[i] = *ptr; \ + } \ + return ret; \ +} + + +GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_base_offsets32_i8) +GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8) +GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_base_offsets32_i16) +GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_base_offsets64_i16) +GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __gather_base_offsets32_i32) +GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32) +GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_base_offsets32_i64) +GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_base_offsets64_i64) + +#define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ +static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec16_i1 mask) { \ + VTYPE ret; \ + for (int i = 0; i < 16; ++i) \ + if ((mask.v & 
(1 << i)) != 0) { \ + STYPE *ptr = (STYPE *)ptrs.v[i]; \ + ret.v[i] = *ptr; \ + } \ + return ret; \ +} + +GATHER_GENERAL(__vec16_i8, int8_t, __vec16_i32, __gather32_i8) +GATHER_GENERAL(__vec16_i8, int8_t, __vec16_i64, __gather64_i8) +GATHER_GENERAL(__vec16_i16, int16_t, __vec16_i32, __gather32_i16) +GATHER_GENERAL(__vec16_i16, int16_t, __vec16_i64, __gather64_i16) +GATHER_GENERAL(__vec16_i32, int32_t, __vec16_i32, __gather32_i32) +GATHER_GENERAL(__vec16_i32, int32_t, __vec16_i64, __gather64_i32) +GATHER_GENERAL(__vec16_i64, int64_t, __vec16_i32, __gather32_i64) +GATHER_GENERAL(__vec16_i64, int64_t, __vec16_i64, __gather64_i64) + +// scatter + +#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ +static FORCEINLINE void FUNC(unsigned char *b, OTYPE offsets, uint32_t scale,\ + VTYPE val, __vec16_i1 mask) { \ + int8_t *base = (int8_t *)b; \ + for (int i = 0; i < 16; ++i) \ + if ((mask.v & (1 << i)) != 0) { \ + STYPE *ptr = (STYPE *)(base + scale * offsets.v[i]); \ + *ptr = val.v[i]; \ + } \ +} + + +SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_base_offsets32_i8) +SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_base_offsets64_i8) +SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_base_offsets32_i16) +SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_base_offsets64_i16) +SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __scatter_base_offsets32_i32) +SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_base_offsets64_i32) +SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_base_offsets32_i64) +SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64_i64) + +#define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ +static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) { \ + VTYPE ret; \ + for (int i = 0; i < 16; ++i) \ + if ((mask.v & (1 << i)) != 0) { \ + STYPE *ptr = (STYPE *)ptrs.v[i]; \ + *ptr = val.v[i]; \ + } \ +} + +SCATTER_GENERAL(__vec16_i8, int8_t, __vec16_i32, __scatter32_i8) +SCATTER_GENERAL(__vec16_i8, int8_t, __vec16_i64, __scatter64_i8) +SCATTER_GENERAL(__vec16_i16, int16_t, __vec16_i32, __scatter32_i16) +SCATTER_GENERAL(__vec16_i16, int16_t, __vec16_i64, __scatter64_i16) +SCATTER_GENERAL(__vec16_i32, int32_t, __vec16_i32, __scatter32_i32) +SCATTER_GENERAL(__vec16_i32, int32_t, __vec16_i64, __scatter64_i32) +SCATTER_GENERAL(__vec16_i64, int64_t, __vec16_i32, __scatter32_i64) +SCATTER_GENERAL(__vec16_i64, int64_t, __vec16_i64, __scatter64_i64) + +/////////////////////////////////////////////////////////////////////////// +// packed load/store + +static FORCEINLINE int32_t __packed_load_active(int32_t *ptr, __vec16_i32 *val, + __vec16_i1 mask) { + int count = 0; + for (int i = 0; i < 16; ++i) { + if ((mask.v & (1 << i)) != 0) { + val->v[i] = *ptr++; + ++count; + } + } + return count; +} + + +static FORCEINLINE int32_t __packed_store_active(int32_t *ptr, __vec16_i32 val, + __vec16_i1 mask) { + int count = 0; + for (int i = 0; i < 16; ++i) { + if ((mask.v & (1 << i)) != 0) { + *ptr++ = val.v[i]; + ++count; + } + } + return count; +} + +static FORCEINLINE int32_t __packed_load_active(uint32_t *ptr, + __vec16_i32 *val, + __vec16_i1 mask) { + int count = 0; + for (int i = 0; i < 16; ++i) { + if ((mask.v & (1 << i)) != 0) { + val->v[i] = *ptr++; + ++count; + } + } + return count; +} + + +static FORCEINLINE int32_t __packed_store_active(uint32_t *ptr, + __vec16_i32 val, + __vec16_i1 mask) { + int count = 0; + for (int i = 0; i < 16; ++i) { + if 
((mask.v & (1 << i)) != 0) { + *ptr++ = val.v[i]; + ++count; + } + } + return count; +} + + +/////////////////////////////////////////////////////////////////////////// +// aos/soa + +static FORCEINLINE void __soa_to_aos3_float(__vec16_f v0, __vec16_f v1, __vec16_f v2, + float *ptr) { + for (int i = 0; i < 16; ++i) { + *ptr++ = __extract_element(v0, i); + *ptr++ = __extract_element(v1, i); + *ptr++ = __extract_element(v2, i); + } +} + +static FORCEINLINE void __aos_to_soa3_float(float *ptr, __vec16_f *out0, __vec16_f *out1, + __vec16_f *out2) { + for (int i = 0; i < 16; ++i) { + __insert_element(out0, i, *ptr++); + __insert_element(out1, i, *ptr++); + __insert_element(out2, i, *ptr++); + } +} + +static FORCEINLINE void __soa_to_aos4_float(__vec16_f v0, __vec16_f v1, __vec16_f v2, + __vec16_f v3, float *ptr) { + for (int i = 0; i < 16; ++i) { + *ptr++ = __extract_element(v0, i); + *ptr++ = __extract_element(v1, i); + *ptr++ = __extract_element(v2, i); + *ptr++ = __extract_element(v3, i); + } +} + +static FORCEINLINE void __aos_to_soa4_float(float *ptr, __vec16_f *out0, __vec16_f *out1, + __vec16_f *out2, __vec16_f *out3) { + for (int i = 0; i < 16; ++i) { + __insert_element(out0, i, *ptr++); + __insert_element(out1, i, *ptr++); + __insert_element(out2, i, *ptr++); + __insert_element(out3, i, *ptr++); + } +} + +/////////////////////////////////////////////////////////////////////////// +// prefetch + +static FORCEINLINE void __prefetch_read_uniform_1(unsigned char *) { +} + +static FORCEINLINE void __prefetch_read_uniform_2(unsigned char *) { +} + +static FORCEINLINE void __prefetch_read_uniform_3(unsigned char *) { +} + +static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *) { +} + +/////////////////////////////////////////////////////////////////////////// +// atomics + +static FORCEINLINE uint32_t __atomic_add(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedAdd((LONG volatile *)p, v) - v; +#else + return __sync_fetch_and_add(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_sub(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedAdd((LONG volatile *)p, -v) + v; +#else + return __sync_fetch_and_sub(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_and(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedAnd((LONG volatile *)p, v); +#else + return __sync_fetch_and_and(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_or(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedOr((LONG volatile *)p, v); +#else + return __sync_fetch_and_or(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_xor(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedXor((LONG volatile *)p, v); +#else + return __sync_fetch_and_xor(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_min(uint32_t *p, uint32_t v) { + int32_t old, min; + do { + old = *((volatile int32_t *)p); + min = (old < (int32_t)v) ? old : (int32_t)v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange((LONG volatile *)p, min, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, min) == false); +#endif + return old; +} + +static FORCEINLINE uint32_t __atomic_max(uint32_t *p, uint32_t v) { + int32_t old, max; + do { + old = *((volatile int32_t *)p); + max = (old > (int32_t)v) ? 
old : (int32_t)v;
+#ifdef _MSC_VER
+ } while (InterlockedCompareExchange((LONG volatile *)p, max, old) != old);
+#else
+ } while (__sync_bool_compare_and_swap(p, old, max) == false);
+#endif
+ return old;
+}
+
+static FORCEINLINE uint32_t __atomic_umin(uint32_t *p, uint32_t v) {
+ uint32_t old, min;
+ do {
+ old = *((volatile uint32_t *)p);
+ min = (old < v) ? old : v;
+#ifdef _MSC_VER
+ } while (InterlockedCompareExchange((LONG volatile *)p, min, old) != old);
+#else
+ } while (__sync_bool_compare_and_swap(p, old, min) == false);
+#endif
+ return old;
+}
+
+static FORCEINLINE uint32_t __atomic_umax(uint32_t *p, uint32_t v) {
+ uint32_t old, max;
+ do {
+ old = *((volatile uint32_t *)p);
+ max = (old > v) ? old : v;
+#ifdef _MSC_VER
+ } while (InterlockedCompareExchange((LONG volatile *)p, max, old) != old);
+#else
+ } while (__sync_bool_compare_and_swap(p, old, max) == false);
+#endif
+ return old;
+}
+
+static FORCEINLINE uint32_t __atomic_xchg(uint32_t *p, uint32_t v) {
+#ifdef _MSC_VER
+ return InterlockedExchange((LONG volatile *)p, v);
+#else
+ return __sync_lock_test_and_set(p, v);
+#endif
+}
+
+static FORCEINLINE uint32_t __atomic_cmpxchg(uint32_t *p, uint32_t cmpval,
+ uint32_t newval) {
+#ifdef _MSC_VER
+ return InterlockedCompareExchange((LONG volatile *)p, newval, cmpval);
+#else
+ return __sync_val_compare_and_swap(p, cmpval, newval);
+#endif
+}
+
+static FORCEINLINE uint64_t __atomic_add(uint64_t *p, uint64_t v) {
+#ifdef _MSC_VER
+ return InterlockedAdd64((LONGLONG volatile *)p, v) - v;
+#else
+ return __sync_fetch_and_add(p, v);
+#endif
+}
+
+static FORCEINLINE uint64_t __atomic_sub(uint64_t *p, uint64_t v) {
+#ifdef _MSC_VER
+ return InterlockedAdd64((LONGLONG volatile *)p, -v) + v;
+#else
+ return __sync_fetch_and_sub(p, v);
+#endif
+}
+
+static FORCEINLINE uint64_t __atomic_and(uint64_t *p, uint64_t v) {
+#ifdef _MSC_VER
+ // InterlockedAnd64/Or64/Xor64 return the original value directly, so
+ // (unlike InterlockedAdd64 above) no correction term is needed here.
+ return InterlockedAnd64((LONGLONG volatile *)p, v);
+#else
+ return __sync_fetch_and_and(p, v);
+#endif
+}
+
+static FORCEINLINE uint64_t __atomic_or(uint64_t *p, uint64_t v) {
+#ifdef _MSC_VER
+ return InterlockedOr64((LONGLONG volatile *)p, v);
+#else
+ return __sync_fetch_and_or(p, v);
+#endif
+}
+
+static FORCEINLINE uint64_t __atomic_xor(uint64_t *p, uint64_t v) {
+#ifdef _MSC_VER
+ return InterlockedXor64((LONGLONG volatile *)p, v);
+#else
+ return __sync_fetch_and_xor(p, v);
+#endif
+}
+
+static FORCEINLINE uint64_t __atomic_min(uint64_t *p, uint64_t v) {
+ int64_t old, min;
+ do {
+ old = *((volatile int64_t *)p);
+ min = (old < (int64_t)v) ? old : (int64_t)v;
+#ifdef _MSC_VER
+ } while (InterlockedCompareExchange64((LONGLONG volatile *)p, min, old) != old);
+#else
+ } while (__sync_bool_compare_and_swap(p, old, min) == false);
+#endif
+ return old;
+}
+
+static FORCEINLINE uint64_t __atomic_max(uint64_t *p, uint64_t v) {
+ int64_t old, max;
+ do {
+ old = *((volatile int64_t *)p);
+ max = (old > (int64_t)v) ? old : (int64_t)v;
+#ifdef _MSC_VER
+ } while (InterlockedCompareExchange64((LONGLONG volatile *)p, max, old) != old);
+#else
+ } while (__sync_bool_compare_and_swap(p, old, max) == false);
+#endif
+ return old;
+}
+
+static FORCEINLINE uint64_t __atomic_umin(uint64_t *p, uint64_t v) {
+ uint64_t old, min;
+ do {
+ old = *((volatile uint64_t *)p);
+ min = (old < v) ?
old : v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange64((LONGLONG volatile *)p, min, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, min) == false); +#endif + return old; +} + +static FORCEINLINE uint64_t __atomic_umax(uint64_t *p, uint64_t v) { + uint64_t old, max; + do { + old = *((volatile uint64_t *)p); + max = (old > v) ? old : v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange64((LONGLONG volatile *)p, max, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, max) == false); +#endif + return old; +} + +static FORCEINLINE uint64_t __atomic_xchg(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedExchange64((LONGLONG volatile *)p, v); +#else + return __sync_lock_test_and_set(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, + uint64_t newval) { +#ifdef _MSC_VER + return InterlockedCompareExchange64((LONGLONG volatile *)p, newval, cmpval); +#else + return __sync_val_compare_and_swap(p, cmpval, newval); +#endif +} diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h new file mode 100644 index 00000000000..c9556924aff --- /dev/null +++ b/examples/intrinsics/sse4.h @@ -0,0 +1,3665 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include +#include +#include +#ifdef _MSC_VER +#include +#endif // _MSC_VER + +#include +#include + +#if !defined(__SSE4_2__) && !defined(_MSC_VER) +#error "SSE 4.2 must be enabled in the C++ compiler to use this header." 
+#endif // !__SSE4_2__ && !msvc + +#ifdef _MSC_VER +#define FORCEINLINE __forceinline +#else +#define FORCEINLINE __attribute__((always_inline)) inline +#endif + +//CO#undef FORCEINLINE +//CO#define FORCEINLINE + +typedef float __vec1_f; +typedef double __vec1_d; +typedef int8_t __vec1_i8; +typedef int16_t __vec1_i16; +typedef int32_t __vec1_i32; +typedef int64_t __vec1_i64; + +struct __vec4_i1 { + __vec4_i1() { } + __vec4_i1(__m128 vv) : v(vv) { } + FORCEINLINE __vec4_i1(__m128i vv) : v(_mm_castsi128_ps(vv)) { } + FORCEINLINE __vec4_i1(int a, int b, int c, int d) { + v = _mm_castsi128_ps(_mm_set_epi32(d ? -1 : 0, c ? -1 : 0, + b ? -1 : 0, a ? -1 : 0)); + } + + __m128 v; +}; + +struct __vec4_f { + __vec4_f() { } + __vec4_f(__m128 vv) : v(vv) { } + FORCEINLINE __vec4_f(float a, float b, float c, float d) { + v = _mm_set_ps(d, c, b, a); + } + FORCEINLINE __vec4_f(float *p) { + v = _mm_loadu_ps(p); + } + + FORCEINLINE operator __m128i() const { return _mm_castps_si128(v); } + + __m128 v; +}; + +struct __vec4_i64 { + __vec4_i64() { } + FORCEINLINE __vec4_i64(__m128i a, __m128i b) { v[0] = a; v[1] = b; } + FORCEINLINE __vec4_i64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) { + v[0] = _mm_set_epi32((b >> 32) & 0xffffffff, b & 0xffffffff, + (a >> 32) & 0xffffffff, a & 0xffffffff); + v[1] = _mm_set_epi32((d >> 32) & 0xffffffff, d & 0xffffffff, + (c >> 32) & 0xffffffff, c & 0xffffffff); + } + FORCEINLINE __vec4_i64(uint64_t *p) { + v[0] = _mm_loadu_si128((__m128i *)p); + v[1] = _mm_loadu_si128((__m128i *)(p+2)); + } + FORCEINLINE uint64_t &operator[](int i) { return ((uint64_t *)v)[i]; } + + __m128i v[2]; +}; + +struct __vec4_i32 { + __vec4_i32() { } + FORCEINLINE __vec4_i32(__m128i vv) : v(vv) { } + FORCEINLINE __vec4_i32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { + v = _mm_set_epi32(d, c, b, a); + } + FORCEINLINE __vec4_i32(uint32_t *p) { + v = _mm_loadu_si128((__m128i *)p); + } + + FORCEINLINE operator __m128() const { return _mm_castsi128_ps(v); } + + __m128i v; +}; + +static inline int32_t __extract_element(__vec4_i32 v, int index); + +struct __vec4_i16 { + __vec4_i16() { } + FORCEINLINE __vec4_i16(__m128i vv) : v(vv) { } + FORCEINLINE __vec4_i16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) { + v = _mm_set_epi16(0, 0, 0, 0, d, c, b, a); + } + FORCEINLINE __vec4_i16(uint16_t *p) { + v = _mm_set_epi16(0, 0, 0, 0, p[3], p[2], p[1], p[0]); + } + + __m128i v; +}; + + +struct __vec4_i8 { + __vec4_i8() { } + FORCEINLINE __vec4_i8(__m128i vv) : v(vv) { } + FORCEINLINE __vec4_i8(uint8_t a, uint8_t b, uint8_t c, uint8_t d) { + v = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, d, c, b, a); + + } + FORCEINLINE __vec4_i8(uint8_t *p) { + v = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, p[3], p[2], p[1], p[0]); + } + + __m128i v; +}; + + +struct __vec4_d { + __vec4_d() { } + FORCEINLINE __vec4_d(__m128d a, __m128d b) { v[0] = a; v[1] = b; } + FORCEINLINE __vec4_d(double a, double b, double c, double d) { + v[0] = _mm_set_pd(b, a); + v[1] = _mm_set_pd(d, c); + } + + __m128d v[2]; +}; + + +/////////////////////////////////////////////////////////////////////////// +// SSE helpers / utility functions + +static FORCEINLINE double _mm_extract_pd(__m128d v, int i) { + return ((double *)&v)[i]; +} + +static FORCEINLINE float bits_as_float(uint32_t v) { + union { + uint32_t ui; + float f; + } u; + u.ui = v; + return u.f; +} + +template +static FORCEINLINE T __select(bool test, T a, T b) { + return test ? 
a : b; +} + +#define INSERT_EXTRACT(VTYPE, STYPE) \ + static FORCEINLINE STYPE __extract_element(VTYPE v, int index) { \ + return ((STYPE *)&v)[index]; \ +} \ +static FORCEINLINE void __insert_element(VTYPE *v, int index, STYPE val) { \ + ((STYPE *)v)[index] = val; \ +} + +INSERT_EXTRACT(__vec1_i8, int8_t) +INSERT_EXTRACT(__vec1_i16, int16_t) +INSERT_EXTRACT(__vec1_i32, int32_t) +INSERT_EXTRACT(__vec1_i64, int64_t) +INSERT_EXTRACT(__vec1_f, float) +INSERT_EXTRACT(__vec1_d, double) + +#define CAST_BITS_SCALAR(TO, FROM) \ +static FORCEINLINE TO __cast_bits(TO, FROM v) { \ + union { \ + TO to; \ + FROM from; \ + } u; \ + u.from = v; \ + return u.to; \ +} + +CAST_BITS_SCALAR(uint32_t, float) +CAST_BITS_SCALAR(int32_t, float) +CAST_BITS_SCALAR(float, uint32_t) +CAST_BITS_SCALAR(float, int32_t) +CAST_BITS_SCALAR(uint64_t, double) +CAST_BITS_SCALAR(int64_t, double) +CAST_BITS_SCALAR(double, uint64_t) +CAST_BITS_SCALAR(double, int64_t) + +/////////////////////////////////////////////////////////////////////////// +// mask ops + +static FORCEINLINE uint32_t __movmsk(__vec4_i1 mask) { + return _mm_movemask_ps(mask.v); +} + +static FORCEINLINE __vec4_i1 __and(__vec4_i1 a, __vec4_i1 b) { + return _mm_and_ps(a.v, b.v); +} + +static FORCEINLINE __vec4_i1 __xor(__vec4_i1 a, __vec4_i1 b) { + return _mm_xor_ps(a.v, b.v); +} + +static FORCEINLINE __vec4_i1 __or(__vec4_i1 a, __vec4_i1 b) { + return _mm_or_ps(a.v, b.v); +} + +static FORCEINLINE __vec4_i1 __select(__vec4_i1 mask, __vec4_i1 a, __vec4_i1 b) { + return _mm_blendv_ps(b.v, a.v, mask.v); +} + +static FORCEINLINE bool __extract_element(__vec4_i1 v, int index) { + return ((int32_t *)&v)[index] ? true : false; +} + +static FORCEINLINE void __insert_element(__vec4_i1 *v, int index, bool val) { + ((int32_t *)v)[index] = val ? -1 : 0; +} + +static FORCEINLINE __vec4_i1 __load(__vec4_i1 *v, int align) { + // FIXME: handle align of 16... 
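+ // A possible way to act on that FIXME (a sketch only, assuming the align
+ // argument is a byte count, as the other __load/__store overloads imply);
+ // the same fast path would apply to the other unaligned loads and stores
+ // later in this file:
+ //     if (align >= 16)
+ //         return _mm_load_ps((float *)(&v->v));   // aligned load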
+ return _mm_loadu_ps((float *)(&v->v)); +} + +static FORCEINLINE void __store(__vec4_i1 *p, __vec4_i1 value, int align) { + // FIXME: handle align + _mm_storeu_ps((float *)(&p->v), value.v); +} + +/////////////////////////////////////////////////////////////////////////// +// int8 + +static FORCEINLINE __vec4_i8 __add(__vec4_i8 a, __vec4_i8 b) { + return _mm_add_epi8(a.v, b.v); +} + +static FORCEINLINE __vec4_i8 __sub(__vec4_i8 a, __vec4_i8 b) { + return _mm_sub_epi8(a.v, b.v); +} + +static FORCEINLINE __vec4_i8 __mul(__vec4_i8 a, __vec4_i8 b) { + return __vec4_i8(_mm_extract_epi8(a.v, 0) * _mm_extract_epi8(b.v, 0), + _mm_extract_epi8(a.v, 1) * _mm_extract_epi8(b.v, 1), + _mm_extract_epi8(a.v, 2) * _mm_extract_epi8(b.v, 2), + _mm_extract_epi8(a.v, 3) * _mm_extract_epi8(b.v, 3)); +} + +static FORCEINLINE __vec4_i8 __or(__vec4_i8 a, __vec4_i8 b) { + return _mm_or_si128(a.v, b.v); +} + +static FORCEINLINE __vec4_i8 __and(__vec4_i8 a, __vec4_i8 b) { + return _mm_and_si128(a.v, b.v); +} + +static FORCEINLINE __vec4_i8 __xor(__vec4_i8 a, __vec4_i8 b) { + return _mm_xor_si128(a.v, b.v); +} + +static FORCEINLINE __vec4_i8 __shl(__vec4_i8 a, __vec4_i8 b) { + return __vec4_i8(_mm_extract_epi8(a.v, 0) << _mm_extract_epi8(b.v, 0), + _mm_extract_epi8(a.v, 1) << _mm_extract_epi8(b.v, 1), + _mm_extract_epi8(a.v, 2) << _mm_extract_epi8(b.v, 2), + _mm_extract_epi8(a.v, 3) << _mm_extract_epi8(b.v, 3)); +} + +static FORCEINLINE __vec4_i8 __udiv(__vec4_i8 a, __vec4_i8 b) { + return __vec4_i8((uint8_t)_mm_extract_epi8(a.v, 0) / + (uint8_t)_mm_extract_epi8(b.v, 0), + (uint8_t)_mm_extract_epi8(a.v, 1) / + (uint8_t)_mm_extract_epi8(b.v, 1), + (uint8_t)_mm_extract_epi8(a.v, 2) / + (uint8_t)_mm_extract_epi8(b.v, 2), + (uint8_t)_mm_extract_epi8(a.v, 3) / + (uint8_t)_mm_extract_epi8(b.v, 3)); +} + +static FORCEINLINE __vec4_i8 __sdiv(__vec4_i8 a, __vec4_i8 b) { + return __vec4_i8((int8_t)_mm_extract_epi8(a.v, 0) / + (int8_t)_mm_extract_epi8(b.v, 0), + (int8_t)_mm_extract_epi8(a.v, 1) / + (int8_t)_mm_extract_epi8(b.v, 1), + (int8_t)_mm_extract_epi8(a.v, 2) / + (int8_t)_mm_extract_epi8(b.v, 2), + (int8_t)_mm_extract_epi8(a.v, 3) / + (int8_t)_mm_extract_epi8(b.v, 3)); +} + +static FORCEINLINE __vec4_i8 __urem(__vec4_i8 a, __vec4_i8 b) { + return __vec4_i8((uint8_t)_mm_extract_epi8(a.v, 0) % + (uint8_t)_mm_extract_epi8(b.v, 0), + (uint8_t)_mm_extract_epi8(a.v, 1) % + (uint8_t)_mm_extract_epi8(b.v, 1), + (uint8_t)_mm_extract_epi8(a.v, 2) % + (uint8_t)_mm_extract_epi8(b.v, 2), + (uint8_t)_mm_extract_epi8(a.v, 3) % + (uint8_t)_mm_extract_epi8(b.v, 3)); +} + +static FORCEINLINE __vec4_i8 __srem(__vec4_i8 a, __vec4_i8 b) { + return __vec4_i8((int8_t)_mm_extract_epi8(a.v, 0) % + (int8_t)_mm_extract_epi8(b.v, 0), + (int8_t)_mm_extract_epi8(a.v, 1) % + (int8_t)_mm_extract_epi8(b.v, 1), + (int8_t)_mm_extract_epi8(a.v, 2) % + (int8_t)_mm_extract_epi8(b.v, 2), + (int8_t)_mm_extract_epi8(a.v, 3) % + (int8_t)_mm_extract_epi8(b.v, 3)); +} + +static FORCEINLINE __vec4_i8 __lshr(__vec4_i8 a, __vec4_i8 b) { + return __vec4_i8((uint8_t)_mm_extract_epi8(a.v, 0) >> + (uint8_t)_mm_extract_epi8(b.v, 0), + (uint8_t)_mm_extract_epi8(a.v, 1) >> + (uint8_t)_mm_extract_epi8(b.v, 1), + (uint8_t)_mm_extract_epi8(a.v, 2) >> + (uint8_t)_mm_extract_epi8(b.v, 2), + (uint8_t)_mm_extract_epi8(a.v, 3) >> + (uint8_t)_mm_extract_epi8(b.v, 3)); +} + +static FORCEINLINE __vec4_i8 __ashr(__vec4_i8 a, __vec4_i8 b) { + return __vec4_i8((int8_t)_mm_extract_epi8(a.v, 0) >> + (int8_t)_mm_extract_epi8(b.v, 0), + (int8_t)_mm_extract_epi8(a.v, 1) >> + 
(int8_t)_mm_extract_epi8(b.v, 1), + (int8_t)_mm_extract_epi8(a.v, 2) >> + (int8_t)_mm_extract_epi8(b.v, 2), + (int8_t)_mm_extract_epi8(a.v, 3) >> + (int8_t)_mm_extract_epi8(b.v, 3)); +} + +static FORCEINLINE __vec4_i1 __equal(__vec4_i8 a, __vec4_i8 b) { + __m128i cmp = _mm_cmpeq_epi8(a.v, b.v); + return __vec4_i1(_mm_extract_epi8(cmp, 0), + _mm_extract_epi8(cmp, 1), + _mm_extract_epi8(cmp, 2), + _mm_extract_epi8(cmp, 3)); +} + +static FORCEINLINE __vec4_i1 __not_equal(__vec4_i8 a, __vec4_i8 b) { + return __xor(__equal(a, b), __vec4_i1(1, 1, 1, 1)); +} + +static FORCEINLINE __vec4_i1 __unsigned_less_equal(__vec4_i8 a, __vec4_i8 b) { + return __vec4_i1((uint8_t)_mm_extract_epi8(a.v, 0) <= + (uint8_t)_mm_extract_epi8(b.v, 0), + (uint8_t)_mm_extract_epi8(a.v, 1) <= + (uint8_t)_mm_extract_epi8(b.v, 1), + (uint8_t)_mm_extract_epi8(a.v, 2) <= + (uint8_t)_mm_extract_epi8(b.v, 2), + (uint8_t)_mm_extract_epi8(a.v, 3) <= + (uint8_t)_mm_extract_epi8(b.v, 3)); +} + +static FORCEINLINE __vec4_i1 __unsigned_greater_equal(__vec4_i8 a, __vec4_i8 b) { + return __vec4_i1((uint8_t)_mm_extract_epi8(a.v, 0) >= + (uint8_t)_mm_extract_epi8(b.v, 0), + (uint8_t)_mm_extract_epi8(a.v, 1) >= + (uint8_t)_mm_extract_epi8(b.v, 1), + (uint8_t)_mm_extract_epi8(a.v, 2) >= + (uint8_t)_mm_extract_epi8(b.v, 2), + (uint8_t)_mm_extract_epi8(a.v, 3) >= + (uint8_t)_mm_extract_epi8(b.v, 3)); +} + +static FORCEINLINE __vec4_i1 __unsigned_less_than(__vec4_i8 a, __vec4_i8 b) { + return __vec4_i1((uint8_t)_mm_extract_epi8(a.v, 0) < + (uint8_t)_mm_extract_epi8(b.v, 0), + (uint8_t)_mm_extract_epi8(a.v, 1) < + (uint8_t)_mm_extract_epi8(b.v, 1), + (uint8_t)_mm_extract_epi8(a.v, 2) < + (uint8_t)_mm_extract_epi8(b.v, 2), + (uint8_t)_mm_extract_epi8(a.v, 3) < + (uint8_t)_mm_extract_epi8(b.v, 3)); +} + +static FORCEINLINE __vec4_i1 __unsigned_greater_than(__vec4_i8 a, __vec4_i8 b) { + return __vec4_i1((uint8_t)_mm_extract_epi8(a.v, 0) > + (uint8_t)_mm_extract_epi8(b.v, 0), + (uint8_t)_mm_extract_epi8(a.v, 1) > + (uint8_t)_mm_extract_epi8(b.v, 1), + (uint8_t)_mm_extract_epi8(a.v, 2) > + (uint8_t)_mm_extract_epi8(b.v, 2), + (uint8_t)_mm_extract_epi8(a.v, 3) > + (uint8_t)_mm_extract_epi8(b.v, 3)); +} + +static FORCEINLINE __vec4_i1 __signed_less_than(__vec4_i8 a, __vec4_i8 b) { + __m128i cmp = _mm_cmplt_epi8(a.v, b.v); + return __vec4_i1(_mm_extract_epi8(cmp, 0), + _mm_extract_epi8(cmp, 1), + _mm_extract_epi8(cmp, 2), + _mm_extract_epi8(cmp, 3)); +} + +static FORCEINLINE __vec4_i1 __signed_less_equal(__vec4_i8 a, __vec4_i8 b) { + return __or(__signed_less_than(a, b), __equal(a, b)); +} + +static FORCEINLINE __vec4_i1 __signed_greater_than(__vec4_i8 a, __vec4_i8 b) { + __m128i cmp = _mm_cmpgt_epi8(a.v, b.v); + return __vec4_i1(_mm_extract_epi8(cmp, 0), + _mm_extract_epi8(cmp, 1), + _mm_extract_epi8(cmp, 2), + _mm_extract_epi8(cmp, 3)); +} + +static FORCEINLINE __vec4_i1 __signed_greater_equal(__vec4_i8 a, __vec4_i8 b) { + return __or(__signed_greater_than(a, b), __equal(a, b)); +} + +static FORCEINLINE __vec4_i8 __select(__vec4_i1 mask, __vec4_i8 a, __vec4_i8 b) { + return __vec4_i8((_mm_extract_ps(mask.v, 0) != 0) ? _mm_extract_epi8(a.v, 0) : + _mm_extract_epi8(b.v, 0), + (_mm_extract_ps(mask.v, 1) != 0) ? _mm_extract_epi8(a.v, 1) : + _mm_extract_epi8(b.v, 1), + (_mm_extract_ps(mask.v, 2) != 0) ? _mm_extract_epi8(a.v, 2) : + _mm_extract_epi8(b.v, 2), + (_mm_extract_ps(mask.v, 3) != 0) ? 
_mm_extract_epi8(a.v, 3) : + _mm_extract_epi8(b.v, 3)); +} + +static FORCEINLINE int8_t __extract_element(__vec4_i8 v, int index) { + return ((int8_t *)&v)[index]; +} + +static FORCEINLINE void __insert_element(__vec4_i8 *v, int index, int8_t val) { + ((int8_t *)v)[index] = val; +} + +static FORCEINLINE __vec4_i8 __smear_i8(int8_t v) { + return _mm_set1_epi8(v); +} + +static FORCEINLINE __vec4_i8 __broadcast_i8(__vec4_i8 v, int index) { + return _mm_set1_epi8(__extract_element(v, index)); +} + +static FORCEINLINE __vec4_i8 __rotate_i8(__vec4_i8 v, int delta) { + return __vec4_i8(__extract_element(v, delta & 0x3), + __extract_element(v, (delta+1) & 0x3), + __extract_element(v, (delta+2) & 0x3), + __extract_element(v, (delta+3) & 0x3)); +} + +static FORCEINLINE __vec4_i8 __shuffle_i8(__vec4_i8 v, __vec4_i32 index) { + return __vec4_i8(__extract_element(v, __extract_element(index, 0) & 0x3), + __extract_element(v, __extract_element(index, 1) & 0x3), + __extract_element(v, __extract_element(index, 2) & 0x3), + __extract_element(v, __extract_element(index, 3) & 0x3)); +} + +static FORCEINLINE __vec4_i8 __shuffle2_i8(__vec4_i8 v0, __vec4_i8 v1, + __vec4_i32 index) { + uint8_t r[4]; + for (int i = 0; i < 4; ++i) { + uint32_t elt = __extract_element(index, i) & 0x7; + r[i] = (elt < 4) ? __extract_element(v0, elt) : __extract_element(v1, elt & 0x3); + } + return __vec4_i8(r[0], r[1], r[2], r[3]); +} + +static FORCEINLINE __vec4_i8 __load(__vec4_i8 *v, int align) { + uint8_t *ptr = (uint8_t *)(&v->v); + return __vec4_i8(ptr[0], ptr[1], ptr[2], ptr[3]); +} + +static FORCEINLINE void __store(__vec4_i8 *p, __vec4_i8 value, int align) { + uint8_t *ptr = (uint8_t *)(&p->v); + ptr[0] = _mm_extract_epi8(value.v, 0); + ptr[1] = _mm_extract_epi8(value.v, 1); + ptr[2] = _mm_extract_epi8(value.v, 2); + ptr[3] = _mm_extract_epi8(value.v, 3); +} + +/////////////////////////////////////////////////////////////////////////// +// int16 + +static FORCEINLINE __vec4_i16 __add(__vec4_i16 a, __vec4_i16 b) { + return _mm_add_epi16(a.v, b.v); +} + +static FORCEINLINE __vec4_i16 __sub(__vec4_i16 a, __vec4_i16 b) { + return _mm_sub_epi16(a.v, b.v); +} + +static FORCEINLINE __vec4_i16 __mul(__vec4_i16 a, __vec4_i16 b) { + return _mm_mullo_epi16(a.v, b.v); +} + +static FORCEINLINE __vec4_i16 __or(__vec4_i16 a, __vec4_i16 b) { + return _mm_or_si128(a.v, b.v); +} + +static FORCEINLINE __vec4_i16 __and(__vec4_i16 a, __vec4_i16 b) { + return _mm_and_si128(a.v, b.v); +} + +static FORCEINLINE __vec4_i16 __xor(__vec4_i16 a, __vec4_i16 b) { + return _mm_xor_si128(a.v, b.v); +} + +static FORCEINLINE __vec4_i16 __shl(__vec4_i16 a, __vec4_i16 b) { + return __vec4_i16(_mm_extract_epi16(a.v, 0) << _mm_extract_epi16(b.v, 0), + _mm_extract_epi16(a.v, 1) << _mm_extract_epi16(b.v, 1), + _mm_extract_epi16(a.v, 2) << _mm_extract_epi16(b.v, 2), + _mm_extract_epi16(a.v, 3) << _mm_extract_epi16(b.v, 3)); +} + +static FORCEINLINE __vec4_i16 __udiv(__vec4_i16 a, __vec4_i16 b) { + return __vec4_i16((uint16_t)_mm_extract_epi16(a.v, 0) / + (uint16_t)_mm_extract_epi16(b.v, 0), + (uint16_t)_mm_extract_epi16(a.v, 1) / + (uint16_t)_mm_extract_epi16(b.v, 1), + (uint16_t)_mm_extract_epi16(a.v, 2) / + (uint16_t)_mm_extract_epi16(b.v, 2), + (uint16_t)_mm_extract_epi16(a.v, 3) / + (uint16_t)_mm_extract_epi16(b.v, 3)); +} + +static FORCEINLINE __vec4_i16 __sdiv(__vec4_i16 a, __vec4_i16 b) { + return __vec4_i16((int16_t)_mm_extract_epi16(a.v, 0) / + (int16_t)_mm_extract_epi16(b.v, 0), + (int16_t)_mm_extract_epi16(a.v, 1) / + (int16_t)_mm_extract_epi16(b.v, 1), + 
(int16_t)_mm_extract_epi16(a.v, 2) /
+ (int16_t)_mm_extract_epi16(b.v, 2),
+ (int16_t)_mm_extract_epi16(a.v, 3) /
+ (int16_t)_mm_extract_epi16(b.v, 3));
+}
+
+static FORCEINLINE __vec4_i16 __urem(__vec4_i16 a, __vec4_i16 b) {
+ return __vec4_i16((uint16_t)_mm_extract_epi16(a.v, 0) %
+ (uint16_t)_mm_extract_epi16(b.v, 0),
+ (uint16_t)_mm_extract_epi16(a.v, 1) %
+ (uint16_t)_mm_extract_epi16(b.v, 1),
+ (uint16_t)_mm_extract_epi16(a.v, 2) %
+ (uint16_t)_mm_extract_epi16(b.v, 2),
+ (uint16_t)_mm_extract_epi16(a.v, 3) %
+ (uint16_t)_mm_extract_epi16(b.v, 3));
+}
+
+static FORCEINLINE __vec4_i16 __srem(__vec4_i16 a, __vec4_i16 b) {
+ return __vec4_i16((int16_t)_mm_extract_epi16(a.v, 0) %
+ (int16_t)_mm_extract_epi16(b.v, 0),
+ (int16_t)_mm_extract_epi16(a.v, 1) %
+ (int16_t)_mm_extract_epi16(b.v, 1),
+ (int16_t)_mm_extract_epi16(a.v, 2) %
+ (int16_t)_mm_extract_epi16(b.v, 2),
+ (int16_t)_mm_extract_epi16(a.v, 3) %
+ (int16_t)_mm_extract_epi16(b.v, 3));
+}
+
+static FORCEINLINE __vec4_i16 __lshr(__vec4_i16 a, __vec4_i16 b) {
+ return __vec4_i16((uint16_t)_mm_extract_epi16(a.v, 0) >>
+ (uint16_t)_mm_extract_epi16(b.v, 0),
+ (uint16_t)_mm_extract_epi16(a.v, 1) >>
+ (uint16_t)_mm_extract_epi16(b.v, 1),
+ (uint16_t)_mm_extract_epi16(a.v, 2) >>
+ (uint16_t)_mm_extract_epi16(b.v, 2),
+ (uint16_t)_mm_extract_epi16(a.v, 3) >>
+ (uint16_t)_mm_extract_epi16(b.v, 3));
+}
+
+static FORCEINLINE __vec4_i16 __ashr(__vec4_i16 a, __vec4_i16 b) {
+ return __vec4_i16((int16_t)_mm_extract_epi16(a.v, 0) >>
+ (int16_t)_mm_extract_epi16(b.v, 0),
+ (int16_t)_mm_extract_epi16(a.v, 1) >>
+ (int16_t)_mm_extract_epi16(b.v, 1),
+ (int16_t)_mm_extract_epi16(a.v, 2) >>
+ (int16_t)_mm_extract_epi16(b.v, 2),
+ (int16_t)_mm_extract_epi16(a.v, 3) >>
+ (int16_t)_mm_extract_epi16(b.v, 3));
+}
+
+static FORCEINLINE __vec4_i1 __equal(__vec4_i16 a, __vec4_i16 b) {
+ __m128i cmp = _mm_cmpeq_epi16(a.v, b.v);
+ return __vec4_i1(_mm_extract_epi16(cmp, 0),
+ _mm_extract_epi16(cmp, 1),
+ _mm_extract_epi16(cmp, 2),
+ _mm_extract_epi16(cmp, 3));
+}
+
+static FORCEINLINE __vec4_i1 __not_equal(__vec4_i16 a, __vec4_i16 b) {
+ return __xor(__equal(a, b), __vec4_i1(1, 1, 1, 1));
+}
+
+static FORCEINLINE __vec4_i1 __unsigned_less_equal(__vec4_i16 a, __vec4_i16 b) {
+ // FIXME: could use the trick that int32 does for the unsigned
+ // comparisons so that we don't need to scalarize them. (This also
+ // applies to i8s...)
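+ // Sketch of that sign-flip trick for the 16-bit case (not used here):
+ // xor both operands with 0x8000 so that unsigned ordering matches signed
+ // ordering, compare with the signed intrinsics, and then sign-extend the
+ // low four 16-bit lanes of the mask to the 32-bit lanes __vec4_i1 expects:
+ //     __m128i av = _mm_xor_si128(a.v, _mm_set1_epi16((int16_t)0x8000));
+ //     __m128i bv = _mm_xor_si128(b.v, _mm_set1_epi16((int16_t)0x8000));
+ //     __m128i le = _mm_or_si128(_mm_cmplt_epi16(av, bv),
+ //                               _mm_cmpeq_epi16(a.v, b.v));
+ //     return __vec4_i1(_mm_cvtepi16_epi32(le));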
+ return __vec4_i1((uint16_t)_mm_extract_epi16(a.v, 0) <= + (uint16_t)_mm_extract_epi16(b.v, 0), + (uint16_t)_mm_extract_epi16(a.v, 1) <= + (uint16_t)_mm_extract_epi16(b.v, 1), + (uint16_t)_mm_extract_epi16(a.v, 2) <= + (uint16_t)_mm_extract_epi16(b.v, 2), + (uint16_t)_mm_extract_epi16(a.v, 3) <= + (uint16_t)_mm_extract_epi16(b.v, 3)); +} + +static FORCEINLINE __vec4_i1 __unsigned_greater_equal(__vec4_i16 a, __vec4_i16 b) { + return __vec4_i1((uint16_t)_mm_extract_epi16(a.v, 0) >= + (uint16_t)_mm_extract_epi16(b.v, 0), + (uint16_t)_mm_extract_epi16(a.v, 1) >= + (uint16_t)_mm_extract_epi16(b.v, 1), + (uint16_t)_mm_extract_epi16(a.v, 2) >= + (uint16_t)_mm_extract_epi16(b.v, 2), + (uint16_t)_mm_extract_epi16(a.v, 3) >= + (uint16_t)_mm_extract_epi16(b.v, 3)); +} + +static FORCEINLINE __vec4_i1 __unsigned_less_than(__vec4_i16 a, __vec4_i16 b) { + return __vec4_i1((uint16_t)_mm_extract_epi16(a.v, 0) < + (uint16_t)_mm_extract_epi16(b.v, 0), + (uint16_t)_mm_extract_epi16(a.v, 1) < + (uint16_t)_mm_extract_epi16(b.v, 1), + (uint16_t)_mm_extract_epi16(a.v, 2) < + (uint16_t)_mm_extract_epi16(b.v, 2), + (uint16_t)_mm_extract_epi16(a.v, 3) < + (uint16_t)_mm_extract_epi16(b.v, 3)); +} + +static FORCEINLINE __vec4_i1 __unsigned_greater_than(__vec4_i16 a, __vec4_i16 b) { + return __vec4_i1((uint16_t)_mm_extract_epi16(a.v, 0) > + (uint16_t)_mm_extract_epi16(b.v, 0), + (uint16_t)_mm_extract_epi16(a.v, 1) > + (uint16_t)_mm_extract_epi16(b.v, 1), + (uint16_t)_mm_extract_epi16(a.v, 2) > + (uint16_t)_mm_extract_epi16(b.v, 2), + (uint16_t)_mm_extract_epi16(a.v, 3) > + (uint16_t)_mm_extract_epi16(b.v, 3)); +} + +static FORCEINLINE __vec4_i1 __signed_less_than(__vec4_i16 a, __vec4_i16 b) { + __m128i cmp = _mm_cmplt_epi16(a.v, b.v); + return __vec4_i1(_mm_extract_epi16(cmp, 0), + _mm_extract_epi16(cmp, 1), + _mm_extract_epi16(cmp, 2), + _mm_extract_epi16(cmp, 3)); +} + +static FORCEINLINE __vec4_i1 __signed_less_equal(__vec4_i16 a, __vec4_i16 b) { + return __or(__signed_less_than(a, b), __equal(a, b)); +} + +static FORCEINLINE __vec4_i1 __signed_greater_than(__vec4_i16 a, __vec4_i16 b) { + __m128i cmp = _mm_cmpgt_epi16(a.v, b.v); + return __vec4_i1(_mm_extract_epi16(cmp, 0), + _mm_extract_epi16(cmp, 1), + _mm_extract_epi16(cmp, 2), + _mm_extract_epi16(cmp, 3)); +} + +static FORCEINLINE __vec4_i1 __signed_greater_equal(__vec4_i16 a, __vec4_i16 b) { + return __or(__signed_greater_than(a, b), __equal(a, b)); +} + +static FORCEINLINE __vec4_i16 __select(__vec4_i1 mask, __vec4_i16 a, __vec4_i16 b) { + return __vec4_i16((_mm_extract_ps(mask.v, 0) != 0) ? _mm_extract_epi16(a.v, 0) : + _mm_extract_epi16(b.v, 0), + (_mm_extract_ps(mask.v, 1) != 0) ? _mm_extract_epi16(a.v, 1) : + _mm_extract_epi16(b.v, 1), + (_mm_extract_ps(mask.v, 2) != 0) ? _mm_extract_epi16(a.v, 2) : + _mm_extract_epi16(b.v, 2), + (_mm_extract_ps(mask.v, 3) != 0) ? 
_mm_extract_epi16(a.v, 3) :
+ _mm_extract_epi16(b.v, 3));
+}
+
+static FORCEINLINE int16_t __extract_element(__vec4_i16 v, int index) {
+ return ((int16_t *)&v)[index];
+}
+
+static FORCEINLINE void __insert_element(__vec4_i16 *v, int index, int16_t val) {
+ ((int16_t *)v)[index] = val;
+}
+
+static FORCEINLINE __vec4_i16 __smear_i16(int16_t v) {
+ return _mm_set1_epi16(v);
+}
+
+static FORCEINLINE __vec4_i16 __broadcast_i16(__vec4_i16 v, int index) {
+ return _mm_set1_epi16(__extract_element(v, index));
+}
+
+static FORCEINLINE __vec4_i16 __rotate_i16(__vec4_i16 v, int delta) {
+ return __vec4_i16(__extract_element(v, delta & 0x3),
+ __extract_element(v, (delta+1) & 0x3),
+ __extract_element(v, (delta+2) & 0x3),
+ __extract_element(v, (delta+3) & 0x3));
+}
+
+static FORCEINLINE __vec4_i16 __shuffle_i16(__vec4_i16 v, __vec4_i32 index) {
+ return __vec4_i16(__extract_element(v, __extract_element(index, 0) & 0x3),
+ __extract_element(v, __extract_element(index, 1) & 0x3),
+ __extract_element(v, __extract_element(index, 2) & 0x3),
+ __extract_element(v, __extract_element(index, 3) & 0x3));
+}
+
+static FORCEINLINE __vec4_i16 __shuffle2_i16(__vec4_i16 v0, __vec4_i16 v1,
+ __vec4_i32 index) {
+ uint16_t r[4];
+ for (int i = 0; i < 4; ++i) {
+ uint32_t elt = __extract_element(index, i) & 0x7;
+ r[i] = (elt < 4) ? __extract_element(v0, elt) : __extract_element(v1, elt & 0x3);
+ }
+ return __vec4_i16(r[0], r[1], r[2], r[3]);
+}
+
+static FORCEINLINE __vec4_i16 __load(__vec4_i16 *v, int align) {
+ uint16_t *ptr = (uint16_t *)(&v->v);
+ return __vec4_i16(ptr[0], ptr[1], ptr[2], ptr[3]);
+}
+
+static FORCEINLINE void __store(__vec4_i16 *p, __vec4_i16 value, int align) {
+ uint16_t *ptr = (uint16_t *)(&p->v);
+ ptr[0] = _mm_extract_epi16(value.v, 0);
+ ptr[1] = _mm_extract_epi16(value.v, 1);
+ ptr[2] = _mm_extract_epi16(value.v, 2);
+ ptr[3] = _mm_extract_epi16(value.v, 3);
+}
+
+
+///////////////////////////////////////////////////////////////////////////
+// int32
+
+static FORCEINLINE __vec4_i32 __add(__vec4_i32 a, __vec4_i32 b) {
+ return _mm_add_epi32(a.v, b.v);
+}
+
+static FORCEINLINE __vec4_i32 __sub(__vec4_i32 a, __vec4_i32 b) {
+ return _mm_sub_epi32(a.v, b.v);
+}
+
+static FORCEINLINE __vec4_i32 __mul(__vec4_i32 a, __vec4_i32 b) {
+ return _mm_mullo_epi32(a.v, b.v);
+}
+
+static FORCEINLINE __vec4_i32 __or(__vec4_i32 a, __vec4_i32 b) {
+ return _mm_or_si128(a.v, b.v);
+}
+
+static FORCEINLINE __vec4_i32 __and(__vec4_i32 a, __vec4_i32 b) {
+ return _mm_and_si128(a.v, b.v);
+}
+
+static FORCEINLINE __vec4_i32 __xor(__vec4_i32 a, __vec4_i32 b) {
+ return _mm_xor_si128(a.v, b.v);
+}
+
+static FORCEINLINE __vec4_i32 __shl(__vec4_i32 a, __vec4_i32 b) {
+ // FIXME: if we can determine at compile time that b has the same value
+ // across all elements, then we can use _mm_sll_epi32.
+
+ /* fixme: llvm generates this code for shift left, which is presumably
+ more efficient than doing each component individually as below.
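+
+ (What that sequence appears to do: pslld $23 moves each per-lane shift
+ count into the float exponent field, paddd of 0x3f800000 (1.0f) turns it
+ into the bit pattern of the float 2^b, cvttps2dq converts that to the
+ integer 2^b, and pmulld then computes a * 2^b, i.e. a << b, without
+ leaving the SIMD registers.)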
+ +LCPI0_0: + .long 1065353216 ## 0x3f800000 + .long 1065353216 ## 0x3f800000 + .long 1065353216 ## 0x3f800000 + .long 1065353216 ## 0x3f800000 + .section __TEXT,__text,regular,pure_instructions + .globl _f___ii + .align 4, 0x90 +_f___ii: ## @f___ii +## BB#0: ## %allocas + pslld $23, %xmm1 + paddd LCPI0_0(%rip), %xmm1 + cvttps2dq %xmm1, %xmm1 + pmulld %xmm0, %xmm1 + movdqa %xmm1, %xmm0 + ret + + */ + return __vec4_i32((uint32_t)_mm_extract_epi32(a.v, 0) << _mm_extract_epi32(b.v, 0), + (uint32_t)_mm_extract_epi32(a.v, 1) << _mm_extract_epi32(b.v, 1), + (uint32_t)_mm_extract_epi32(a.v, 2) << _mm_extract_epi32(b.v, 2), + (uint32_t)_mm_extract_epi32(a.v, 3) << _mm_extract_epi32(b.v, 3)); +} + +static FORCEINLINE __vec4_i32 __udiv(__vec4_i32 a, __vec4_i32 b) { + return __vec4_i32((uint32_t)_mm_extract_epi32(a.v, 0) / (uint32_t)_mm_extract_epi32(b.v, 0), + (uint32_t)_mm_extract_epi32(a.v, 1) / (uint32_t)_mm_extract_epi32(b.v, 1), + (uint32_t)_mm_extract_epi32(a.v, 2) / (uint32_t)_mm_extract_epi32(b.v, 2), + (uint32_t)_mm_extract_epi32(a.v, 3) / (uint32_t)_mm_extract_epi32(b.v, 3)); +} + +static FORCEINLINE __vec4_i32 __sdiv(__vec4_i32 a, __vec4_i32 b) { + return __vec4_i32((int32_t)_mm_extract_epi32(a.v, 0) / (int32_t)_mm_extract_epi32(b.v, 0), + (int32_t)_mm_extract_epi32(a.v, 1) / (int32_t)_mm_extract_epi32(b.v, 1), + (int32_t)_mm_extract_epi32(a.v, 2) / (int32_t)_mm_extract_epi32(b.v, 2), + (int32_t)_mm_extract_epi32(a.v, 3) / (int32_t)_mm_extract_epi32(b.v, 3)); +} + +static FORCEINLINE __vec4_i32 __urem(__vec4_i32 a, __vec4_i32 b) { + return __vec4_i32((uint32_t)_mm_extract_epi32(a.v, 0) % (uint32_t)_mm_extract_epi32(b.v, 0), + (uint32_t)_mm_extract_epi32(a.v, 1) % (uint32_t)_mm_extract_epi32(b.v, 1), + (uint32_t)_mm_extract_epi32(a.v, 2) % (uint32_t)_mm_extract_epi32(b.v, 2), + (uint32_t)_mm_extract_epi32(a.v, 3) % (uint32_t)_mm_extract_epi32(b.v, 3)); +} + +static FORCEINLINE __vec4_i32 __srem(__vec4_i32 a, __vec4_i32 b) { + return __vec4_i32((int32_t)_mm_extract_epi32(a.v, 0) % (int32_t)_mm_extract_epi32(b.v, 0), + (int32_t)_mm_extract_epi32(a.v, 1) % (int32_t)_mm_extract_epi32(b.v, 1), + (int32_t)_mm_extract_epi32(a.v, 2) % (int32_t)_mm_extract_epi32(b.v, 2), + (int32_t)_mm_extract_epi32(a.v, 3) % (int32_t)_mm_extract_epi32(b.v, 3)); +} + +static FORCEINLINE __vec4_i32 __lshr(__vec4_i32 a, __vec4_i32 b) { + // FIXME: if we can determine at compile time that b has the same value + // across all elements, e.g. using gcc's __builtin_constant_p, then we + // can use _mm_srl_epi32. + return __vec4_i32((uint32_t)_mm_extract_epi32(a.v, 0) >> _mm_extract_epi32(b.v, 0), + (uint32_t)_mm_extract_epi32(a.v, 1) >> _mm_extract_epi32(b.v, 1), + (uint32_t)_mm_extract_epi32(a.v, 2) >> _mm_extract_epi32(b.v, 2), + (uint32_t)_mm_extract_epi32(a.v, 3) >> _mm_extract_epi32(b.v, 3)); +} + +static FORCEINLINE __vec4_i32 __ashr(__vec4_i32 a, __vec4_i32 b) { + // FIXME: if we can determine at compile time that b has the same value + // across all elements, then we can use _mm_sra_epi32. 
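+ // A sketch of that uniform-count fast path (hypothetical; it assumes the
+ // caller has already checked that all four lanes of b are equal):
+ //     __m128i count = _mm_cvtsi32_si128(_mm_extract_epi32(b.v, 0));
+ //     return _mm_sra_epi32(a.v, count);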
+ return __vec4_i32((int32_t)_mm_extract_epi32(a.v, 0) >> _mm_extract_epi32(b.v, 0), + (int32_t)_mm_extract_epi32(a.v, 1) >> _mm_extract_epi32(b.v, 1), + (int32_t)_mm_extract_epi32(a.v, 2) >> _mm_extract_epi32(b.v, 2), + (int32_t)_mm_extract_epi32(a.v, 3) >> _mm_extract_epi32(b.v, 3)); +} + +static FORCEINLINE __vec4_i1 __equal(__vec4_i32 a, __vec4_i32 b) { + return _mm_cmpeq_epi32(a.v, b.v); +} + +static FORCEINLINE __vec4_i1 __not_equal(__vec4_i32 a, __vec4_i32 b) { + return _mm_xor_si128(_mm_cmpeq_epi32(a.v, b.v), + _mm_cmpeq_epi32(a.v, a.v)); +} + +static FORCEINLINE __vec4_i1 __unsigned_less_equal(__vec4_i32 a, __vec4_i32 b) { + a.v = _mm_xor_si128(a.v, _mm_set1_epi32(0x80000000)); + b.v = _mm_xor_si128(b.v, _mm_set1_epi32(0x80000000)); + return _mm_or_si128(_mm_cmplt_epi32(a.v, b.v), + _mm_cmpeq_epi32(a.v, b.v)); +} + +static FORCEINLINE __vec4_i1 __signed_less_equal(__vec4_i32 a, __vec4_i32 b) { + return _mm_or_si128(_mm_cmplt_epi32(a.v, b.v), + _mm_cmpeq_epi32(a.v, b.v)); +} + +static FORCEINLINE __vec4_i1 __unsigned_greater_equal(__vec4_i32 a, __vec4_i32 b) { + a.v = _mm_xor_si128(a.v, _mm_set1_epi32(0x80000000)); + b.v = _mm_xor_si128(b.v, _mm_set1_epi32(0x80000000)); + return _mm_or_si128(_mm_cmpgt_epi32(a.v, b.v), + _mm_cmpeq_epi32(a.v, b.v)); +} + +static FORCEINLINE __vec4_i1 __signed_greater_equal(__vec4_i32 a, __vec4_i32 b) { + return _mm_or_si128(_mm_cmpgt_epi32(a.v, b.v), + _mm_cmpeq_epi32(a.v, b.v)); +} + +static FORCEINLINE __vec4_i1 __unsigned_less_than(__vec4_i32 a, __vec4_i32 b) { + a.v = _mm_xor_si128(a.v, _mm_set1_epi32(0x80000000)); + b.v = _mm_xor_si128(b.v, _mm_set1_epi32(0x80000000)); + return _mm_cmplt_epi32(a.v, b.v); +} + +static FORCEINLINE __vec4_i1 __signed_less_than(__vec4_i32 a, __vec4_i32 b) { + return _mm_cmplt_epi32(a.v, b.v); +} + +static FORCEINLINE __vec4_i1 __unsigned_greater_than(__vec4_i32 a, __vec4_i32 b) { + a.v = _mm_xor_si128(a.v, _mm_set1_epi32(0x80000000)); + b.v = _mm_xor_si128(b.v, _mm_set1_epi32(0x80000000)); + return _mm_cmpgt_epi32(a.v, b.v); +} + +static FORCEINLINE __vec4_i1 __signed_greater_than(__vec4_i32 a, __vec4_i32 b) { + return _mm_cmpgt_epi32(a.v, b.v); +} + +static FORCEINLINE __vec4_i32 __select(__vec4_i1 mask, __vec4_i32 a, __vec4_i32 b) { + return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(b.v), + _mm_castsi128_ps(a.v), mask.v)); +} + +static FORCEINLINE __vec4_i32 __smear_i32(int32_t v) { + return _mm_set1_epi32(v); +} + +static FORCEINLINE int32_t __extract_element(__vec4_i32 v, int index) { + return ((int32_t *)&v)[index]; +} + +static FORCEINLINE void __insert_element(__vec4_i32 *v, int index, int32_t val) { + ((int32_t *)v)[index] = val; +} + +static FORCEINLINE __vec4_i32 __broadcast_i32(__vec4_i32 v, int index) { + return _mm_set1_epi32(__extract_element(v, index)); +} + +static FORCEINLINE __vec4_i32 __rotate_i32(__vec4_i32 v, int delta) { + return __vec4_i32(__extract_element(v, delta & 0x3), + __extract_element(v, (delta+1) & 0x3), + __extract_element(v, (delta+2) & 0x3), + __extract_element(v, (delta+3) & 0x3)); +} + +static FORCEINLINE __vec4_i32 __shuffle_i32(__vec4_i32 v, __vec4_i32 index) { + return __vec4_i32(__extract_element(v, __extract_element(index, 0) & 0x3), + __extract_element(v, __extract_element(index, 1) & 0x3), + __extract_element(v, __extract_element(index, 2) & 0x3), + __extract_element(v, __extract_element(index, 3) & 0x3)); +} + +static FORCEINLINE __vec4_i32 __shuffle2_i32(__vec4_i32 v0, __vec4_i32 v1, + __vec4_i32 index) { + uint32_t r[4]; + for (int i = 0; i < 4; ++i) { + uint32_t 
elt = __extract_element(index, i) & 0x7; + r[i] = (elt < 4) ? __extract_element(v0, elt) : __extract_element(v1, elt & 0x3); + } + return __vec4_i32(r[0], r[1], r[2], r[3]); +} + +static FORCEINLINE __vec4_i32 __load(__vec4_i32 *v, int align) { + // FIXME: handle align of 16... + return _mm_loadu_si128((__m128i *)(&v->v)); +} + +static void __store(__vec4_i32 *p, __vec4_i32 value, int align) { + // FIXME: handle align + _mm_storeu_si128((__m128i *)(&p->v), value.v); +} + +/////////////////////////////////////////////////////////////////////////// +// int64 + +static FORCEINLINE __vec4_i64 __add(__vec4_i64 a, __vec4_i64 b) { + return __vec4_i64(_mm_add_epi64(a.v[0], b.v[0]), + _mm_add_epi64(a.v[1], b.v[1])); +} + +static FORCEINLINE __vec4_i64 __sub(__vec4_i64 a, __vec4_i64 b) { + return __vec4_i64(_mm_sub_epi64(a.v[0], b.v[0]), + _mm_sub_epi64(a.v[1], b.v[1])); +} + +static FORCEINLINE __vec4_i64 __mul(__vec4_i64 a, __vec4_i64 b) { + return __vec4_i64(_mm_extract_epi64(a.v[0], 0) * _mm_extract_epi64(b.v[0], 0), + _mm_extract_epi64(a.v[0], 1) * _mm_extract_epi64(b.v[0], 1), + _mm_extract_epi64(a.v[1], 0) * _mm_extract_epi64(b.v[1], 0), + _mm_extract_epi64(a.v[1], 1) * _mm_extract_epi64(b.v[1], 1)); +} + +static FORCEINLINE __vec4_i64 __or(__vec4_i64 a, __vec4_i64 b) { + return __vec4_i64(_mm_or_si128(a.v[0], b.v[0]), + _mm_or_si128(a.v[1], b.v[1])); +} + +static FORCEINLINE __vec4_i64 __and(__vec4_i64 a, __vec4_i64 b) { + return __vec4_i64(_mm_and_si128(a.v[0], b.v[0]), + _mm_and_si128(a.v[1], b.v[1])); +} + +static FORCEINLINE __vec4_i64 __xor(__vec4_i64 a, __vec4_i64 b) { + return __vec4_i64(_mm_xor_si128(a.v[0], b.v[0]), + _mm_xor_si128(a.v[1], b.v[1])); +} + +static FORCEINLINE __vec4_i64 __shl(__vec4_i64 a, __vec4_i64 b) { + return __vec4_i64(_mm_extract_epi64(a.v[0], 0) << _mm_extract_epi64(b.v[0], 0), + _mm_extract_epi64(a.v[0], 1) << _mm_extract_epi64(b.v[0], 1), + _mm_extract_epi64(a.v[1], 0) << _mm_extract_epi64(b.v[1], 0), + _mm_extract_epi64(a.v[1], 1) << _mm_extract_epi64(b.v[1], 1)); +} + +static FORCEINLINE __vec4_i64 __udiv(__vec4_i64 a, __vec4_i64 b) { + return __vec4_i64((uint64_t)_mm_extract_epi64(a.v[0], 0) / + (uint64_t)_mm_extract_epi64(b.v[0], 0), + (uint64_t)_mm_extract_epi64(a.v[0], 1) / + (uint64_t)_mm_extract_epi64(b.v[0], 1), + (uint64_t)_mm_extract_epi64(a.v[1], 0) / + (uint64_t)_mm_extract_epi64(b.v[1], 0), + (uint64_t)_mm_extract_epi64(a.v[1], 1) / + (uint64_t)_mm_extract_epi64(b.v[1], 1)); +} + +static FORCEINLINE __vec4_i64 __sdiv(__vec4_i64 a, __vec4_i64 b) { + return __vec4_i64((int64_t)_mm_extract_epi64(a.v[0], 0) / + (int64_t)_mm_extract_epi64(b.v[0], 0), + (int64_t)_mm_extract_epi64(a.v[0], 1) / + (int64_t)_mm_extract_epi64(b.v[0], 1), + (int64_t)_mm_extract_epi64(a.v[1], 0) / + (int64_t)_mm_extract_epi64(b.v[1], 0), + (int64_t)_mm_extract_epi64(a.v[1], 1) / + (int64_t)_mm_extract_epi64(b.v[1], 1)); +} + +static FORCEINLINE __vec4_i64 __urem(__vec4_i64 a, __vec4_i64 b) { + return __vec4_i64((uint64_t)_mm_extract_epi64(a.v[0], 0) % + (uint64_t)_mm_extract_epi64(b.v[0], 0), + (uint64_t)_mm_extract_epi64(a.v[0], 1) % + (uint64_t)_mm_extract_epi64(b.v[0], 1), + (uint64_t)_mm_extract_epi64(a.v[1], 0) % + (uint64_t)_mm_extract_epi64(b.v[1], 0), + (uint64_t)_mm_extract_epi64(a.v[1], 1) % + (uint64_t)_mm_extract_epi64(b.v[1], 1)); +} + +static FORCEINLINE __vec4_i64 __srem(__vec4_i64 a, __vec4_i64 b) { + return __vec4_i64((int64_t)_mm_extract_epi64(a.v[0], 0) % + (int64_t)_mm_extract_epi64(b.v[0], 0), + (int64_t)_mm_extract_epi64(a.v[0], 1) % + 
(int64_t)_mm_extract_epi64(b.v[0], 1), + (int64_t)_mm_extract_epi64(a.v[1], 0) % + (int64_t)_mm_extract_epi64(b.v[1], 0), + (int64_t)_mm_extract_epi64(a.v[1], 1) % + (int64_t)_mm_extract_epi64(b.v[1], 1)); +} + +static FORCEINLINE __vec4_i64 __lshr(__vec4_i64 a, __vec4_i64 b) { + return __vec4_i64((uint64_t)_mm_extract_epi64(a.v[0], 0) >> + (uint64_t)_mm_extract_epi64(b.v[0], 0), + (uint64_t)_mm_extract_epi64(a.v[0], 1) >> + (uint64_t)_mm_extract_epi64(b.v[0], 1), + (uint64_t)_mm_extract_epi64(a.v[1], 0) >> + (uint64_t)_mm_extract_epi64(b.v[1], 0), + (uint64_t)_mm_extract_epi64(a.v[1], 1) >> + (uint64_t)_mm_extract_epi64(b.v[1], 1)); +} + +static FORCEINLINE __vec4_i64 __ashr(__vec4_i64 a, __vec4_i64 b) { + return __vec4_i64((int64_t)_mm_extract_epi64(a.v[0], 0) >> + (int64_t)_mm_extract_epi64(b.v[0], 0), + (int64_t)_mm_extract_epi64(a.v[0], 1) >> + (int64_t)_mm_extract_epi64(b.v[0], 1), + (int64_t)_mm_extract_epi64(a.v[1], 0) >> + (int64_t)_mm_extract_epi64(b.v[1], 0), + (int64_t)_mm_extract_epi64(a.v[1], 1) >> + (int64_t)_mm_extract_epi64(b.v[1], 1)); +} + +static FORCEINLINE __vec4_i1 __equal(__vec4_i64 a, __vec4_i64 b) { + __m128i cmp0 = _mm_cmpeq_epi64(a.v[0], b.v[0]); + __m128i cmp1 = _mm_cmpeq_epi64(a.v[1], b.v[1]); + return _mm_shuffle_ps(_mm_castsi128_ps(cmp0), _mm_castsi128_ps(cmp1), + _MM_SHUFFLE(2, 0, 2, 0)); +} + +static FORCEINLINE __vec4_i1 __not_equal(__vec4_i64 a, __vec4_i64 b) { + return __xor(__equal(a, b), __vec4_i1(1, 1, 1, 1)); +} + +static FORCEINLINE __vec4_i1 __unsigned_less_equal(__vec4_i64 a, __vec4_i64 b) { + return __vec4_i1((uint64_t)_mm_extract_epi64(a.v[0], 0) <= + (uint64_t)_mm_extract_epi64(b.v[0], 0), + (uint64_t)_mm_extract_epi64(a.v[0], 1) <= + (uint64_t)_mm_extract_epi64(b.v[0], 1), + (uint64_t)_mm_extract_epi64(a.v[1], 0) <= + (uint64_t)_mm_extract_epi64(b.v[1], 0), + (uint64_t)_mm_extract_epi64(a.v[1], 1) <= + (uint64_t)_mm_extract_epi64(b.v[1], 1)); +} + +static FORCEINLINE __vec4_i1 __unsigned_greater_equal(__vec4_i64 a, __vec4_i64 b) { + return __vec4_i1((uint64_t)_mm_extract_epi64(a.v[0], 0) >= + (uint64_t)_mm_extract_epi64(b.v[0], 0), + (uint64_t)_mm_extract_epi64(a.v[0], 1) >= + (uint64_t)_mm_extract_epi64(b.v[0], 1), + (uint64_t)_mm_extract_epi64(a.v[1], 0) >= + (uint64_t)_mm_extract_epi64(b.v[1], 0), + (uint64_t)_mm_extract_epi64(a.v[1], 1) >= + (uint64_t)_mm_extract_epi64(b.v[1], 1)); +} + +static FORCEINLINE __vec4_i1 __unsigned_less_than(__vec4_i64 a, __vec4_i64 b) { + return __vec4_i1((uint64_t)_mm_extract_epi64(a.v[0], 0) < + (uint64_t)_mm_extract_epi64(b.v[0], 0), + (uint64_t)_mm_extract_epi64(a.v[0], 1) < + (uint64_t)_mm_extract_epi64(b.v[0], 1), + (uint64_t)_mm_extract_epi64(a.v[1], 0) < + (uint64_t)_mm_extract_epi64(b.v[1], 0), + (uint64_t)_mm_extract_epi64(a.v[1], 1) < + (uint64_t)_mm_extract_epi64(b.v[1], 1)); +} + +static FORCEINLINE __vec4_i1 __unsigned_greater_than(__vec4_i64 a, __vec4_i64 b) { + return __vec4_i1((uint64_t)_mm_extract_epi64(a.v[0], 0) > + (uint64_t)_mm_extract_epi64(b.v[0], 0), + (uint64_t)_mm_extract_epi64(a.v[0], 1) > + (uint64_t)_mm_extract_epi64(b.v[0], 1), + (uint64_t)_mm_extract_epi64(a.v[1], 0) > + (uint64_t)_mm_extract_epi64(b.v[1], 0), + (uint64_t)_mm_extract_epi64(a.v[1], 1) > + (uint64_t)_mm_extract_epi64(b.v[1], 1)); +} + +static FORCEINLINE __vec4_i1 __signed_greater_than(__vec4_i64 a, __vec4_i64 b) { + __m128i cmp0 = _mm_cmpgt_epi64(a.v[0], b.v[0]); + __m128i cmp1 = _mm_cmpgt_epi64(a.v[1], b.v[1]); + return _mm_shuffle_ps(_mm_castsi128_ps(cmp0), _mm_castsi128_ps(cmp1), + _MM_SHUFFLE(2, 0, 2, 0)); 
+} + +static FORCEINLINE __vec4_i1 __signed_greater_equal(__vec4_i64 a, __vec4_i64 b) { + return __or(__signed_greater_than(a, b), __equal(a, b)); +} + +static FORCEINLINE __vec4_i1 __signed_less_than(__vec4_i64 a, __vec4_i64 b) { + return __xor(__signed_greater_equal(a, b), __vec4_i1(1, 1, 1, 1)); +} + +static FORCEINLINE __vec4_i1 __signed_less_equal(__vec4_i64 a, __vec4_i64 b) { + return __xor(__signed_greater_than(a, b), __vec4_i1(1, 1, 1, 1)); +} + +static FORCEINLINE __vec4_i64 __select(__vec4_i1 mask, __vec4_i64 a, __vec4_i64 b) { + __m128 m0 = _mm_shuffle_ps(mask.v, mask.v, _MM_SHUFFLE(1, 1, 0, 0)); + __m128 m1 = _mm_shuffle_ps(mask.v, mask.v, _MM_SHUFFLE(3, 3, 2, 2)); + __m128d m0d = _mm_castps_pd(m0); + __m128d m1d = _mm_castps_pd(m1); + __m128d r0 = _mm_blendv_pd(_mm_castsi128_pd(b.v[0]), _mm_castsi128_pd(a.v[0]), m0d); + __m128d r1 = _mm_blendv_pd(_mm_castsi128_pd(b.v[1]), _mm_castsi128_pd(a.v[1]), m1d); + return __vec4_i64(_mm_castpd_si128(r0), _mm_castpd_si128(r1)); +} + +static FORCEINLINE __vec4_i64 __smear_i64(int64_t v) { + return __vec4_i64(v, v, v, v); +} + +static FORCEINLINE int64_t __extract_element(__vec4_i64 v, int index) { + return ((int64_t *)&v)[index]; +} + +static FORCEINLINE void __insert_element(__vec4_i64 *v, int index, int64_t val) { + ((int64_t *)v)[index] = val; +} + +static FORCEINLINE __vec4_i64 __broadcast_i64(__vec4_i64 v, int index) { + uint64_t val = __extract_element(v, index); + return __vec4_i64(val, val, val, val); +} + +static FORCEINLINE __vec4_i64 __rotate_i64(__vec4_i64 v, int delta) { + return __vec4_i64(__extract_element(v, delta & 0x3), + __extract_element(v, (delta+1) & 0x3), + __extract_element(v, (delta+2) & 0x3), + __extract_element(v, (delta+3) & 0x3)); +} + +static FORCEINLINE __vec4_i64 __shuffle_i64(__vec4_i64 v, __vec4_i32 index) { + return __vec4_i64(__extract_element(v, __extract_element(index, 0) & 0x3), + __extract_element(v, __extract_element(index, 1) & 0x3), + __extract_element(v, __extract_element(index, 2) & 0x3), + __extract_element(v, __extract_element(index, 3) & 0x3)); +} + +static FORCEINLINE __vec4_i64 __shuffle2_i64(__vec4_i64 v0, __vec4_i64 v1, + __vec4_i32 index) { + uint64_t r[4]; + for (int i = 0; i < 4; ++i) { + uint32_t elt = __extract_element(index, i) & 0x7; + r[i] = (elt < 4) ? __extract_element(v0, elt) : __extract_element(v1, elt & 0x3); + } + return __vec4_i64(r[0], r[1], r[2], r[3]); +} + +static FORCEINLINE __vec4_i64 __load(__vec4_i64 *v, int align) { + // FIXME: handle align of 16... 
+ return __vec4_i64(_mm_loadu_si128((__m128i *)(&v->v[0])), + _mm_loadu_si128((__m128i *)(&v->v[1]))); +} + +static FORCEINLINE void __store(__vec4_i64 *p, __vec4_i64 value, int align) { + // FIXME: handle align + _mm_storeu_si128((__m128i *)(&p->v[0]), value.v[0]); + _mm_storeu_si128((__m128i *)(&p->v[1]), value.v[1]); +} + +/////////////////////////////////////////////////////////////////////////// +// float + +static FORCEINLINE __vec4_f __add(__vec4_f a, __vec4_f b) { + return _mm_add_ps(a.v, b.v); +} + +static FORCEINLINE __vec4_f __sub(__vec4_f a, __vec4_f b) { + return _mm_sub_ps(a.v, b.v); +} + +static FORCEINLINE __vec4_f __mul(__vec4_f a, __vec4_f b) { + return _mm_mul_ps(a.v, b.v); +} + +static FORCEINLINE __vec4_f __div(__vec4_f a, __vec4_f b) { + return _mm_div_ps(a.v, b.v); +} + +static FORCEINLINE __vec4_i1 __equal(__vec4_f a, __vec4_f b) { + return _mm_cmpeq_ps(a.v, b.v); +} + +static FORCEINLINE __vec4_i1 __not_equal(__vec4_f a, __vec4_f b) { + return _mm_cmpneq_ps(a.v, b.v); +} + +static FORCEINLINE __vec4_i1 __less_than(__vec4_f a, __vec4_f b) { + return _mm_cmplt_ps(a.v, b.v); +} + +static FORCEINLINE __vec4_i1 __less_equal(__vec4_f a, __vec4_f b) { + return _mm_cmple_ps(a.v, b.v); +} + +static FORCEINLINE __vec4_i1 __greater_than(__vec4_f a, __vec4_f b) { + return _mm_cmpgt_ps(a.v, b.v); +} + +static FORCEINLINE __vec4_i1 __greater_equal(__vec4_f a, __vec4_f b) { + return _mm_cmpge_ps(a.v, b.v); +} + +static FORCEINLINE __vec4_i1 __ordered(__vec4_f a, __vec4_f b) { + return _mm_cmpord_ps(a.v, b.v); +} + +static FORCEINLINE __vec4_f __select(__vec4_i1 mask, __vec4_f a, __vec4_f b) { + return _mm_blendv_ps(b.v, a.v, mask.v); +} + +static FORCEINLINE __vec4_f __smear_float(float v) { + return _mm_set1_ps(v); +} + +static FORCEINLINE float __extract_element(__vec4_f v, int index) { + return ((float *)&v)[index]; +} + +static FORCEINLINE void __insert_element(__vec4_f *v, int index, float val) { + ((float *)v)[index] = val; +} + +static FORCEINLINE __vec4_f __broadcast_float(__vec4_f v, int index) { + return _mm_set1_ps(__extract_element(v, index)); +} + +static FORCEINLINE __vec4_f __rotate_float(__vec4_f v, int delta) { + return __vec4_f(__extract_element(v, delta & 0x3), + __extract_element(v, (delta+1) & 0x3), + __extract_element(v, (delta+2) & 0x3), + __extract_element(v, (delta+3) & 0x3)); +} + +static FORCEINLINE __vec4_f __shuffle_float(__vec4_f v, __vec4_i32 index) { + return __vec4_f(__extract_element(v, __extract_element(index, 0) & 0x3), + __extract_element(v, __extract_element(index, 1) & 0x3), + __extract_element(v, __extract_element(index, 2) & 0x3), + __extract_element(v, __extract_element(index, 3) & 0x3)); +} + +static FORCEINLINE __vec4_f __shuffle2_float(__vec4_f v0, __vec4_f v1, + __vec4_i32 index) { + float r[4]; + for (int i = 0; i < 4; ++i) { + uint32_t elt = __extract_element(index, i) & 0x7; + r[i] = (elt < 4) ? __extract_element(v0, elt) : __extract_element(v1, elt & 0x3); + } + return __vec4_f(r[0], r[1], r[2], r[3]); +} + +static FORCEINLINE __vec4_f __load(__vec4_f *v, int align) { + // FIXME: handle align of 16... 
+ return _mm_loadu_ps((float *)(&v->v));
+}
+
+static FORCEINLINE void __store(__vec4_f *p, __vec4_f value, int align) {
+ // FIXME: handle align
+ _mm_storeu_ps((float *)(&p->v), value.v);
+}
+
+///////////////////////////////////////////////////////////////////////////
+// double
+
+static FORCEINLINE __vec4_d __add(__vec4_d a, __vec4_d b) {
+ return __vec4_d(_mm_add_pd(a.v[0], b.v[0]),
+ _mm_add_pd(a.v[1], b.v[1]));
+}
+
+static FORCEINLINE __vec4_d __sub(__vec4_d a, __vec4_d b) {
+ return __vec4_d(_mm_sub_pd(a.v[0], b.v[0]),
+ _mm_sub_pd(a.v[1], b.v[1]));
+}
+
+static FORCEINLINE __vec4_d __mul(__vec4_d a, __vec4_d b) {
+ return __vec4_d(_mm_mul_pd(a.v[0], b.v[0]),
+ _mm_mul_pd(a.v[1], b.v[1]));
+}
+
+static FORCEINLINE __vec4_d __div(__vec4_d a, __vec4_d b) {
+ return __vec4_d(_mm_div_pd(a.v[0], b.v[0]),
+ _mm_div_pd(a.v[1], b.v[1]));
+}
+
+static FORCEINLINE __vec4_i1 __equal(__vec4_d a, __vec4_d b) {
+ __m128d cmp0 = _mm_cmpeq_pd(a.v[0], b.v[0]);
+ __m128d cmp1 = _mm_cmpeq_pd(a.v[1], b.v[1]);
+ return _mm_shuffle_ps(_mm_castpd_ps(cmp0), _mm_castpd_ps(cmp1),
+ _MM_SHUFFLE(2, 0, 2, 0));
+}
+
+static FORCEINLINE __vec4_i1 __not_equal(__vec4_d a, __vec4_d b) {
+ __m128d cmp0 = _mm_cmpneq_pd(a.v[0], b.v[0]);
+ __m128d cmp1 = _mm_cmpneq_pd(a.v[1], b.v[1]);
+ return _mm_shuffle_ps(_mm_castpd_ps(cmp0), _mm_castpd_ps(cmp1),
+ _MM_SHUFFLE(2, 0, 2, 0));
+}
+
+static FORCEINLINE __vec4_i1 __less_than(__vec4_d a, __vec4_d b) {
+ __m128d cmp0 = _mm_cmplt_pd(a.v[0], b.v[0]);
+ __m128d cmp1 = _mm_cmplt_pd(a.v[1], b.v[1]);
+ return _mm_shuffle_ps(_mm_castpd_ps(cmp0), _mm_castpd_ps(cmp1),
+ _MM_SHUFFLE(2, 0, 2, 0));
+}
+
+static FORCEINLINE __vec4_i1 __less_equal(__vec4_d a, __vec4_d b) {
+ __m128d cmp0 = _mm_cmple_pd(a.v[0], b.v[0]);
+ __m128d cmp1 = _mm_cmple_pd(a.v[1], b.v[1]);
+ return _mm_shuffle_ps(_mm_castpd_ps(cmp0), _mm_castpd_ps(cmp1),
+ _MM_SHUFFLE(2, 0, 2, 0));
+}
+
+static FORCEINLINE __vec4_i1 __greater_than(__vec4_d a, __vec4_d b) {
+ __m128d cmp0 = _mm_cmpgt_pd(a.v[0], b.v[0]);
+ __m128d cmp1 = _mm_cmpgt_pd(a.v[1], b.v[1]);
+ // Use the same lane order as the other comparisons; (2, 0, 0, 2) would
+ // swap the first two results.
+ return _mm_shuffle_ps(_mm_castpd_ps(cmp0), _mm_castpd_ps(cmp1),
+ _MM_SHUFFLE(2, 0, 2, 0));
+}
+
+static FORCEINLINE __vec4_i1 __greater_equal(__vec4_d a, __vec4_d b) {
+ __m128d cmp0 = _mm_cmpge_pd(a.v[0], b.v[0]);
+ __m128d cmp1 = _mm_cmpge_pd(a.v[1], b.v[1]);
+ return _mm_shuffle_ps(_mm_castpd_ps(cmp0), _mm_castpd_ps(cmp1),
+ _MM_SHUFFLE(2, 0, 2, 0));
+}
+
+static FORCEINLINE __vec4_i1 __ordered(__vec4_d a, __vec4_d b) {
+ __m128d cmp0 = _mm_cmpord_pd(a.v[0], b.v[0]);
+ __m128d cmp1 = _mm_cmpord_pd(a.v[1], b.v[1]);
+ return _mm_shuffle_ps(_mm_castpd_ps(cmp0), _mm_castpd_ps(cmp1),
+ _MM_SHUFFLE(2, 0, 2, 0));
+}
+
+static FORCEINLINE __vec4_d __select(__vec4_i1 mask, __vec4_d a, __vec4_d b) {
+ __m128 m0 = _mm_shuffle_ps(mask.v, mask.v, _MM_SHUFFLE(1, 1, 0, 0));
+ __m128 m1 = _mm_shuffle_ps(mask.v, mask.v, _MM_SHUFFLE(3, 3, 2, 2));
+ __m128d m0d = _mm_castps_pd(m0);
+ __m128d m1d = _mm_castps_pd(m1);
+ __m128d r0 = _mm_blendv_pd(b.v[0], a.v[0], m0d);
+ __m128d r1 = _mm_blendv_pd(b.v[1], a.v[1], m1d);
+ return __vec4_d(r0, r1);
+}
+
+static FORCEINLINE __vec4_d __smear_double(double v) {
+ return __vec4_d(_mm_set1_pd(v), _mm_set1_pd(v));
+}
+
+static FORCEINLINE double __extract_element(__vec4_d v, int index) {
+ return ((double *)&v)[index];
+}
+
+static FORCEINLINE void __insert_element(__vec4_d *v, int index, double val) {
+ ((double *)v)[index] = val;
+}
+
+static FORCEINLINE __vec4_d __broadcast_double(__vec4_d v, int index) {
+ return
__vec4_d(_mm_set1_pd(__extract_element(v, index)), + _mm_set1_pd(__extract_element(v, index))); +} + +static FORCEINLINE __vec4_d __rotate_double(__vec4_d v, int delta) { + return __vec4_d(__extract_element(v, delta & 0x3), + __extract_element(v, (delta+1) & 0x3), + __extract_element(v, (delta+2) & 0x3), + __extract_element(v, (delta+3) & 0x3)); +} + +static FORCEINLINE __vec4_d __shuffle_double(__vec4_d v, __vec4_i32 index) { + return __vec4_d(__extract_element(v, __extract_element(index, 0) & 0x3), + __extract_element(v, __extract_element(index, 1) & 0x3), + __extract_element(v, __extract_element(index, 2) & 0x3), + __extract_element(v, __extract_element(index, 3) & 0x3)); +} + +static FORCEINLINE __vec4_d __shuffle2_double(__vec4_d v0, __vec4_d v1, + __vec4_i32 index) { + double r[4]; + for (int i = 0; i < 4; ++i) { + uint32_t elt = __extract_element(index, i) & 0x7; + r[i] = (elt < 4) ? __extract_element(v0, elt) : __extract_element(v1, elt & 0x3); + } + return __vec4_d(r[0], r[1], r[2], r[3]); +} + +static FORCEINLINE __vec4_d __load(__vec4_d *v, int align) { + // FIXME: handle align of 16... + return __vec4_d(_mm_loadu_pd((double *)(&v->v[0])), + _mm_loadu_pd((double *)(&v->v[1]))); +} + +static FORCEINLINE void __store(__vec4_d *p, __vec4_d value, int align) { + // FIXME: handle align + _mm_storeu_pd((double *)(&p->v[0]), value.v[0]); + _mm_storeu_pd((double *)(&p->v[1]), value.v[1]); +} + +/////////////////////////////////////////////////////////////////////////// +// casts +// sign extension conversions + +static FORCEINLINE __vec4_i64 __cast_sext(__vec4_i64, __vec4_i32 val) { + return __vec4_i64((int64_t)((int32_t)_mm_extract_epi32(val.v, 0)), + (int64_t)((int32_t)_mm_extract_epi32(val.v, 1)), + (int64_t)((int32_t)_mm_extract_epi32(val.v, 2)), + (int64_t)((int32_t)_mm_extract_epi32(val.v, 3))); +} + +static FORCEINLINE __vec4_i64 __cast_sext(__vec4_i64, __vec4_i16 val) { + return __vec4_i64((int64_t)((int16_t)_mm_extract_epi16(val.v, 0)), + (int64_t)((int16_t)_mm_extract_epi16(val.v, 1)), + (int64_t)((int16_t)_mm_extract_epi16(val.v, 2)), + (int64_t)((int16_t)_mm_extract_epi16(val.v, 3))); +} + +static FORCEINLINE __vec4_i64 __cast_sext(__vec4_i64, __vec4_i8 val) { + return __vec4_i64((int64_t)((int8_t)_mm_extract_epi8(val.v, 0)), + (int64_t)((int8_t)_mm_extract_epi8(val.v, 1)), + (int64_t)((int8_t)_mm_extract_epi8(val.v, 2)), + (int64_t)((int8_t)_mm_extract_epi8(val.v, 3))); +} + +static FORCEINLINE __vec4_i32 __cast_sext(__vec4_i32, __vec4_i16 val) { + return __vec4_i32((int32_t)((int16_t)_mm_extract_epi16(val.v, 0)), + (int32_t)((int16_t)_mm_extract_epi16(val.v, 1)), + (int32_t)((int16_t)_mm_extract_epi16(val.v, 2)), + (int32_t)((int16_t)_mm_extract_epi16(val.v, 3))); +} + +static FORCEINLINE __vec4_i32 __cast_sext(__vec4_i32, __vec4_i8 val) { + return __vec4_i32((int32_t)((int8_t)_mm_extract_epi8(val.v, 0)), + (int32_t)((int8_t)_mm_extract_epi8(val.v, 1)), + (int32_t)((int8_t)_mm_extract_epi8(val.v, 2)), + (int32_t)((int8_t)_mm_extract_epi8(val.v, 3))); +} + +static FORCEINLINE __vec4_i16 __cast_sext(__vec4_i16, __vec4_i8 val) { + return __vec4_i16((int16_t)((int8_t)_mm_extract_epi8(val.v, 0)), + (int16_t)((int8_t)_mm_extract_epi8(val.v, 1)), + (int16_t)((int8_t)_mm_extract_epi8(val.v, 2)), + (int16_t)((int8_t)_mm_extract_epi8(val.v, 3))); +} + +static FORCEINLINE __vec4_i8 __cast_sext(__vec4_i8, __vec4_i1 v) { + return __select(v, __smear_i8(0xff), __smear_i8(0)); +} + +static FORCEINLINE __vec4_i16 __cast_sext(__vec4_i16, __vec4_i1 v) { + return __select(v, 
__smear_i16(0xffff), __smear_i16(0)); +} + +static FORCEINLINE __vec4_i32 __cast_sext(__vec4_i32, __vec4_i1 v) { + return _mm_castps_si128(v.v); +} + +static FORCEINLINE __vec4_i64 __cast_sext(__vec4_i64, __vec4_i1 v) { + // For once it's nice that _mm_extract_ps() returns an int + // representation of the float bits. + return __vec4_i64((int64_t)((int32_t)_mm_extract_ps(v.v, 0)), + (int64_t)((int32_t)_mm_extract_ps(v.v, 1)), + (int64_t)((int32_t)_mm_extract_ps(v.v, 2)), + (int64_t)((int32_t)_mm_extract_ps(v.v, 3))); +} + +// zero extension +static FORCEINLINE __vec4_i64 __cast_zext(__vec4_i64, __vec4_i32 val) { + return __vec4_i64((uint64_t)((uint32_t)_mm_extract_epi32(val.v, 0)), + (uint64_t)((uint32_t)_mm_extract_epi32(val.v, 1)), + (uint64_t)((uint32_t)_mm_extract_epi32(val.v, 2)), + (uint64_t)((uint32_t)_mm_extract_epi32(val.v, 3))); +} + +static FORCEINLINE __vec4_i64 __cast_zext(__vec4_i64, __vec4_i16 val) { + return __vec4_i64((uint64_t)((uint16_t)_mm_extract_epi16(val.v, 0)), + (uint64_t)((uint16_t)_mm_extract_epi16(val.v, 1)), + (uint64_t)((uint16_t)_mm_extract_epi16(val.v, 2)), + (uint64_t)((uint16_t)_mm_extract_epi16(val.v, 3))); +} + +static FORCEINLINE __vec4_i64 __cast_zext(__vec4_i64, __vec4_i8 val) { + return __vec4_i64((uint64_t)((uint8_t)_mm_extract_epi8(val.v, 0)), + (uint64_t)((uint8_t)_mm_extract_epi8(val.v, 1)), + (uint64_t)((uint8_t)_mm_extract_epi8(val.v, 2)), + (uint64_t)((uint8_t)_mm_extract_epi8(val.v, 3))); +} + +static FORCEINLINE __vec4_i32 __cast_zext(__vec4_i32, __vec4_i16 val) { + return __vec4_i32((uint32_t)((uint16_t)_mm_extract_epi16(val.v, 0)), + (uint32_t)((uint16_t)_mm_extract_epi16(val.v, 1)), + (uint32_t)((uint16_t)_mm_extract_epi16(val.v, 2)), + (uint32_t)((uint16_t)_mm_extract_epi16(val.v, 3))); +} + +static FORCEINLINE __vec4_i32 __cast_zext(__vec4_i32, __vec4_i8 val) { + return __vec4_i32((uint32_t)((uint8_t)_mm_extract_epi8(val.v, 0)), + (uint32_t)((uint8_t)_mm_extract_epi8(val.v, 1)), + (uint32_t)((uint8_t)_mm_extract_epi8(val.v, 2)), + (uint32_t)((uint8_t)_mm_extract_epi8(val.v, 3))); +} + +static FORCEINLINE __vec4_i16 __cast_zext(__vec4_i16, __vec4_i8 val) { + return __vec4_i16((uint16_t)((uint8_t)_mm_extract_epi8(val.v, 0)), + (uint16_t)((uint8_t)_mm_extract_epi8(val.v, 1)), + (uint16_t)((uint8_t)_mm_extract_epi8(val.v, 2)), + (uint16_t)((uint8_t)_mm_extract_epi8(val.v, 3))); +} + +static FORCEINLINE __vec4_i8 __cast_zext(__vec4_i8, __vec4_i1 v) { + return __select(v, __smear_i8(1), __smear_i8(0)); +} + +static FORCEINLINE __vec4_i16 __cast_zext(__vec4_i16, __vec4_i1 v) { + return __select(v, __smear_i16(1), __smear_i16(0)); +} + +static FORCEINLINE __vec4_i32 __cast_zext(__vec4_i32, __vec4_i1 v) { + return _mm_and_si128(_mm_castps_si128(v.v), _mm_set1_epi32(1)); +} + +static FORCEINLINE __vec4_i64 __cast_zext(__vec4_i64, __vec4_i1 v) { + return __select(v, __smear_i64(1), __smear_i64(0)); +} + +// truncations +static FORCEINLINE __vec4_i32 __cast_trunc(__vec4_i32, __vec4_i64 val) { + return __vec4_i32((int32_t)((int64_t)_mm_extract_epi64(val.v[0], 0)), + (int32_t)((int64_t)_mm_extract_epi64(val.v[0], 1)), + (int32_t)((int64_t)_mm_extract_epi64(val.v[1], 0)), + (int32_t)((int64_t)_mm_extract_epi64(val.v[1], 1))); +} + +static FORCEINLINE __vec4_i16 __cast_trunc(__vec4_i16, __vec4_i64 val) { + return __vec4_i16((int16_t)((int64_t)_mm_extract_epi64(val.v[0], 0)), + (int16_t)((int64_t)_mm_extract_epi64(val.v[0], 1)), + (int16_t)((int64_t)_mm_extract_epi64(val.v[1], 0)), + (int16_t)((int64_t)_mm_extract_epi64(val.v[1], 1))); +} + +static 
FORCEINLINE __vec4_i8 __cast_trunc(__vec4_i8, __vec4_i64 val) { + return __vec4_i8((int8_t)((int64_t)_mm_extract_epi64(val.v[0], 0)), + (int8_t)((int64_t)_mm_extract_epi64(val.v[0], 1)), + (int8_t)((int64_t)_mm_extract_epi64(val.v[1], 0)), + (int8_t)((int64_t)_mm_extract_epi64(val.v[1], 1))); +} + +static FORCEINLINE __vec4_i16 __cast_trunc(__vec4_i16, __vec4_i32 val) { + return __vec4_i16((int16_t)((int32_t)_mm_extract_epi32(val.v, 0)), + (int16_t)((int32_t)_mm_extract_epi32(val.v, 1)), + (int16_t)((int32_t)_mm_extract_epi32(val.v, 2)), + (int16_t)((int32_t)_mm_extract_epi32(val.v, 3))); +} + +static FORCEINLINE __vec4_i8 __cast_trunc(__vec4_i8, __vec4_i32 val) { + return __vec4_i8((int8_t)((int32_t)_mm_extract_epi32(val.v, 0)), + (int8_t)((int32_t)_mm_extract_epi32(val.v, 1)), + (int8_t)((int32_t)_mm_extract_epi32(val.v, 2)), + (int8_t)((int32_t)_mm_extract_epi32(val.v, 3))); +} + +static FORCEINLINE __vec4_i8 __cast_trunc(__vec4_i8, __vec4_i16 val) { + return __vec4_i8((int8_t)((int16_t)_mm_extract_epi16(val.v, 0)), + (int8_t)((int16_t)_mm_extract_epi16(val.v, 1)), + (int8_t)((int16_t)_mm_extract_epi16(val.v, 2)), + (int8_t)((int16_t)_mm_extract_epi16(val.v, 3))); +} + +// signed int to float/double +static FORCEINLINE __vec4_f __cast_sitofp(__vec4_f, __vec4_i8 val) { + return __vec4_f((float)((int8_t)_mm_extract_epi8(val.v, 0)), + (float)((int8_t)_mm_extract_epi8(val.v, 1)), + (float)((int8_t)_mm_extract_epi8(val.v, 2)), + (float)((int8_t)_mm_extract_epi8(val.v, 3))); +} + +static FORCEINLINE __vec4_f __cast_sitofp(__vec4_f, __vec4_i16 val) { + return __vec4_f((float)((int16_t)_mm_extract_epi16(val.v, 0)), + (float)((int16_t)_mm_extract_epi16(val.v, 1)), + (float)((int16_t)_mm_extract_epi16(val.v, 2)), + (float)((int16_t)_mm_extract_epi16(val.v, 3))); +} + +static FORCEINLINE __vec4_f __cast_sitofp(__vec4_f, __vec4_i32 val) { + return _mm_cvtepi32_ps(val.v); +} + +static FORCEINLINE __vec4_f __cast_sitofp(__vec4_f, __vec4_i64 val) { + return __vec4_f((float)((int64_t)_mm_extract_epi64(val.v[0], 0)), + (float)((int64_t)_mm_extract_epi64(val.v[0], 1)), + (float)((int64_t)_mm_extract_epi64(val.v[1], 0)), + (float)((int64_t)_mm_extract_epi64(val.v[1], 1))); +} + +static FORCEINLINE __vec4_d __cast_sitofp(__vec4_d, __vec4_i8 val) { + return __vec4_d((double)((int8_t)_mm_extract_epi8(val.v, 0)), + (double)((int8_t)_mm_extract_epi8(val.v, 1)), + (double)((int8_t)_mm_extract_epi8(val.v, 2)), + (double)((int8_t)_mm_extract_epi8(val.v, 3))); +} + +static FORCEINLINE __vec4_d __cast_sitofp(__vec4_d, __vec4_i16 val) { + return __vec4_d((double)((int16_t)_mm_extract_epi16(val.v, 0)), + (double)((int16_t)_mm_extract_epi16(val.v, 1)), + (double)((int16_t)_mm_extract_epi16(val.v, 2)), + (double)((int16_t)_mm_extract_epi16(val.v, 3))); +} + +static FORCEINLINE __vec4_d __cast_sitofp(__vec4_d, __vec4_i32 val) { + __m128d r0 = _mm_cvtepi32_pd(val.v); + __m128 shuf = _mm_shuffle_ps(_mm_castsi128_ps(val.v), + _mm_castsi128_ps(val.v), + _MM_SHUFFLE(3, 2, 3, 2)); + __m128d r1 = _mm_cvtepi32_pd(_mm_castps_si128(shuf)); + return __vec4_d(r0, r1); +} + +static FORCEINLINE __vec4_d __cast_sitofp(__vec4_d, __vec4_i64 val) { + return __vec4_d((double)((int64_t)_mm_extract_epi64(val.v[0], 0)), + (double)((int64_t)_mm_extract_epi64(val.v[0], 1)), + (double)((int64_t)_mm_extract_epi64(val.v[1], 0)), + (double)((int64_t)_mm_extract_epi64(val.v[1], 1))); +} + +// unsigned int to float/double +static FORCEINLINE __vec4_f __cast_uitofp(__vec4_f, __vec4_i8 val) { + return __vec4_f((float)((uint8_t)_mm_extract_epi8(val.v, 
0)), + (float)((uint8_t)_mm_extract_epi8(val.v, 1)), + (float)((uint8_t)_mm_extract_epi8(val.v, 2)), + (float)((uint8_t)_mm_extract_epi8(val.v, 3))); +} + +static FORCEINLINE __vec4_f __cast_uitofp(__vec4_f, __vec4_i16 val) { + return __vec4_f((float)((uint16_t)_mm_extract_epi16(val.v, 0)), + (float)((uint16_t)_mm_extract_epi16(val.v, 1)), + (float)((uint16_t)_mm_extract_epi16(val.v, 2)), + (float)((uint16_t)_mm_extract_epi16(val.v, 3))); +} + +static FORCEINLINE __vec4_f __cast_uitofp(__vec4_f, __vec4_i32 val) { + return __vec4_f((float)((uint32_t)_mm_extract_epi32(val.v, 0)), + (float)((uint32_t)_mm_extract_epi32(val.v, 1)), + (float)((uint32_t)_mm_extract_epi32(val.v, 2)), + (float)((uint32_t)_mm_extract_epi32(val.v, 3))); +} + +static FORCEINLINE __vec4_f __cast_uitofp(__vec4_f, __vec4_i64 val) { + return __vec4_f((float)((uint64_t)_mm_extract_epi64(val.v[0], 0)), + (float)((uint64_t)_mm_extract_epi64(val.v[0], 1)), + (float)((uint64_t)_mm_extract_epi64(val.v[1], 0)), + (float)((uint64_t)_mm_extract_epi64(val.v[1], 1))); +} + +static FORCEINLINE __vec4_d __cast_uitofp(__vec4_d, __vec4_i8 val) { + return __vec4_d((double)((uint8_t)_mm_extract_epi8(val.v, 0)), + (double)((uint8_t)_mm_extract_epi8(val.v, 1)), + (double)((uint8_t)_mm_extract_epi8(val.v, 2)), + (double)((uint8_t)_mm_extract_epi8(val.v, 3))); +} + +static FORCEINLINE __vec4_d __cast_uitofp(__vec4_d, __vec4_i16 val) { + return __vec4_d((double)((uint16_t)_mm_extract_epi16(val.v, 0)), + (double)((uint16_t)_mm_extract_epi16(val.v, 1)), + (double)((uint16_t)_mm_extract_epi16(val.v, 2)), + (double)((uint16_t)_mm_extract_epi16(val.v, 3))); +} + +static FORCEINLINE __vec4_d __cast_uitofp(__vec4_d, __vec4_i32 val) { + return __vec4_d((double)((uint32_t)_mm_extract_epi32(val.v, 0)), + (double)((uint32_t)_mm_extract_epi32(val.v, 1)), + (double)((uint32_t)_mm_extract_epi32(val.v, 2)), + (double)((uint32_t)_mm_extract_epi32(val.v, 3))); +} + +static FORCEINLINE __vec4_d __cast_uitofp(__vec4_d, __vec4_i64 val) { + return __vec4_d((double)((uint64_t)_mm_extract_epi64(val.v[0], 0)), + (double)((uint64_t)_mm_extract_epi64(val.v[0], 1)), + (double)((uint64_t)_mm_extract_epi64(val.v[1], 0)), + (double)((uint64_t)_mm_extract_epi64(val.v[1], 1))); +} + +static FORCEINLINE __vec4_f __cast_uitofp(__vec4_f, __vec4_i1 v) { + return __select(v, __smear_float(1.), __smear_float(0.)); +} + +static FORCEINLINE __vec4_d __cast_uitofp(__vec4_d, __vec4_i1 v) { + return __select(v, __smear_double(1.), __smear_double(0.)); +} + +// float/double to signed int +static FORCEINLINE __vec4_i8 __cast_fptosi(__vec4_i8, __vec4_f val) { + return __vec4_i8((int8_t)bits_as_float(_mm_extract_ps(val.v, 0)), + (int8_t)bits_as_float(_mm_extract_ps(val.v, 1)), + (int8_t)bits_as_float(_mm_extract_ps(val.v, 2)), + (int8_t)bits_as_float(_mm_extract_ps(val.v, 3))); +} + +static FORCEINLINE __vec4_i16 __cast_fptosi(__vec4_i16, __vec4_f val) { + return __vec4_i16((int16_t)bits_as_float(_mm_extract_ps(val.v, 0)), + (int16_t)bits_as_float(_mm_extract_ps(val.v, 1)), + (int16_t)bits_as_float(_mm_extract_ps(val.v, 2)), + (int16_t)bits_as_float(_mm_extract_ps(val.v, 3))); +} + +static FORCEINLINE __vec4_i32 __cast_fptosi(__vec4_i32, __vec4_f val) { + return _mm_cvttps_epi32(val.v); +} + +static FORCEINLINE __vec4_i64 __cast_fptosi(__vec4_i64, __vec4_f val) { + return __vec4_i64((int64_t)bits_as_float(_mm_extract_ps(val.v, 0)), + (int64_t)bits_as_float(_mm_extract_ps(val.v, 1)), + (int64_t)bits_as_float(_mm_extract_ps(val.v, 2)), + (int64_t)bits_as_float(_mm_extract_ps(val.v, 3))); +} 
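+
+// The double-precision conversions below follow the same per-lane pattern as the
+// float conversions above: each lane is pulled out with _mm_extract_pd() (presumably
+// a helper defined earlier in this header, since SSE4.1 itself provides no such
+// intrinsic) and converted with an ordinary C cast; only __cast_fptosi(__vec4_i32,
+// __vec4_d) uses the packed _mm_cvtpd_epi32() conversion.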
+ +static FORCEINLINE __vec4_i8 __cast_fptosi(__vec4_i8, __vec4_d val) { + return __vec4_i8((int8_t)_mm_extract_pd(val.v[0], 0), + (int8_t)_mm_extract_pd(val.v[0], 1), + (int8_t)_mm_extract_pd(val.v[1], 0), + (int8_t)_mm_extract_pd(val.v[1], 1)); +} + +static FORCEINLINE __vec4_i16 __cast_fptosi(__vec4_i16, __vec4_d val) { + return __vec4_i16((int16_t)_mm_extract_pd(val.v[0], 0), + (int16_t)_mm_extract_pd(val.v[0], 1), + (int16_t)_mm_extract_pd(val.v[1], 0), + (int16_t)_mm_extract_pd(val.v[1], 1)); +} + +static FORCEINLINE __vec4_i32 __cast_fptosi(__vec4_i32, __vec4_d val) { + __m128i r0 = _mm_cvtpd_epi32(val.v[0]); + __m128i r1 = _mm_cvtpd_epi32(val.v[1]); + return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(r0), _mm_castsi128_ps(r1), + _MM_SHUFFLE(1, 0, 1, 0))); +} + +static FORCEINLINE __vec4_i64 __cast_fptosi(__vec4_i64, __vec4_d val) { + return __vec4_i64((int64_t)_mm_extract_pd(val.v[0], 0), + (int64_t)_mm_extract_pd(val.v[0], 1), + (int64_t)_mm_extract_pd(val.v[1], 0), + (int64_t)_mm_extract_pd(val.v[1], 1)); +} + +// float/double to unsigned int +static FORCEINLINE __vec4_i8 __cast_fptoui(__vec4_i8, __vec4_f val) { + return __vec4_i8((uint8_t)bits_as_float(_mm_extract_ps(val.v, 0)), + (uint8_t)bits_as_float(_mm_extract_ps(val.v, 1)), + (uint8_t)bits_as_float(_mm_extract_ps(val.v, 2)), + (uint8_t)bits_as_float(_mm_extract_ps(val.v, 3))); +} + +static FORCEINLINE __vec4_i16 __cast_fptoui(__vec4_i16, __vec4_f val) { + return __vec4_i16((uint16_t)bits_as_float(_mm_extract_ps(val.v, 0)), + (uint16_t)bits_as_float(_mm_extract_ps(val.v, 1)), + (uint16_t)bits_as_float(_mm_extract_ps(val.v, 2)), + (uint16_t)bits_as_float(_mm_extract_ps(val.v, 3))); +} + +static FORCEINLINE __vec4_i32 __cast_fptoui(__vec4_i32, __vec4_f val) { + return __vec4_i32((uint32_t)bits_as_float(_mm_extract_ps(val.v, 0)), + (uint32_t)bits_as_float(_mm_extract_ps(val.v, 1)), + (uint32_t)bits_as_float(_mm_extract_ps(val.v, 2)), + (uint32_t)bits_as_float(_mm_extract_ps(val.v, 3))); +} + +static FORCEINLINE __vec4_i64 __cast_fptoui(__vec4_i64, __vec4_f val) { + return __vec4_i64((uint64_t)bits_as_float(_mm_extract_ps(val.v, 0)), + (uint64_t)bits_as_float(_mm_extract_ps(val.v, 1)), + (uint64_t)bits_as_float(_mm_extract_ps(val.v, 2)), + (uint64_t)bits_as_float(_mm_extract_ps(val.v, 3))); +} + +static FORCEINLINE __vec4_i8 __cast_fptoui(__vec4_i8, __vec4_d val) { + return __vec4_i8((uint8_t)_mm_extract_pd(val.v[0], 0), + (uint8_t)_mm_extract_pd(val.v[0], 1), + (uint8_t)_mm_extract_pd(val.v[1], 0), + (uint8_t)_mm_extract_pd(val.v[1], 1)); +} + +static FORCEINLINE __vec4_i16 __cast_fptoui(__vec4_i16, __vec4_d val) { + return __vec4_i16((uint16_t)_mm_extract_pd(val.v[0], 0), + (uint16_t)_mm_extract_pd(val.v[0], 1), + (uint16_t)_mm_extract_pd(val.v[1], 0), + (uint16_t)_mm_extract_pd(val.v[1], 1)); +} + +static FORCEINLINE __vec4_i32 __cast_fptoui(__vec4_i32, __vec4_d val) { + return __vec4_i32((uint32_t)_mm_extract_pd(val.v[0], 0), + (uint32_t)_mm_extract_pd(val.v[0], 1), + (uint32_t)_mm_extract_pd(val.v[1], 0), + (uint32_t)_mm_extract_pd(val.v[1], 1)); +} + +static FORCEINLINE __vec4_i64 __cast_fptoui(__vec4_i64, __vec4_d val) { + return __vec4_i64((int64_t)_mm_extract_pd(val.v[0], 0), + (int64_t)_mm_extract_pd(val.v[0], 1), + (int64_t)_mm_extract_pd(val.v[1], 0), + (int64_t)_mm_extract_pd(val.v[1], 1)); +} + +// float/double conversions +static FORCEINLINE __vec4_f __cast_fptrunc(__vec4_f, __vec4_d val) { + __m128 r0 = _mm_cvtpd_ps(val.v[0]); + __m128 r1 = _mm_cvtpd_ps(val.v[1]); + return _mm_shuffle_ps(r0, r1, 
_MM_SHUFFLE(1, 0, 1, 0)); +} + +static FORCEINLINE __vec4_d __cast_fpext(__vec4_d, __vec4_f val) { + return __vec4_d(_mm_cvtps_pd(val.v), + _mm_cvtps_pd(_mm_shuffle_ps(val.v, val.v, + _MM_SHUFFLE(3, 2, 3, 2)))); +} + +static FORCEINLINE __vec4_f __cast_bits(__vec4_f, __vec4_i32 val) { + return _mm_castsi128_ps(val.v); +} + +static FORCEINLINE __vec4_i32 __cast_bits(__vec4_i32, __vec4_f val) { + return _mm_castps_si128(val.v); +} + +static FORCEINLINE __vec4_d __cast_bits(__vec4_d, __vec4_i64 val) { + return __vec4_d(_mm_castsi128_pd(val.v[0]), + _mm_castsi128_pd(val.v[1])); +} + +static FORCEINLINE __vec4_i64 __cast_bits(__vec4_i64, __vec4_d val) { + return __vec4_i64(_mm_castpd_si128(val.v[0]), + _mm_castpd_si128(val.v[1])); +} + +/////////////////////////////////////////////////////////////////////////// +// various math functions + +static FORCEINLINE void __fastmath() { +} + +static FORCEINLINE float __round_uniform_float(float v) { + __m128 r = _mm_set_ss(v); + r = _mm_round_ss(r, r, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + return bits_as_float(_mm_extract_ps(r, 0)); +} + +static FORCEINLINE float __floor_uniform_float(float v) { + __m128 r = _mm_set_ss(v); + r = _mm_round_ss(r, r, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); + return bits_as_float(_mm_extract_ps(r, 0)); +} + +static FORCEINLINE float __ceil_uniform_float(float v) { + __m128 r = _mm_set_ss(v); + r = _mm_round_ss(r, r, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); + return bits_as_float(_mm_extract_ps(r, 0)); +} + +static FORCEINLINE double __round_uniform_double(double v) { + __m128d r = _mm_set_sd(v); + r = _mm_round_sd(r, r, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + return _mm_extract_pd(r, 0); +} + +static FORCEINLINE double __floor_uniform_double(double v) { + __m128d r = _mm_set_sd(v); + r = _mm_round_sd(r, r, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); + return _mm_extract_pd(r, 0); +} + +static FORCEINLINE double __ceil_uniform_double(double v) { + __m128d r = _mm_set_sd(v); + r = _mm_round_sd(r, r, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); + return _mm_extract_pd(r, 0); +} + +static FORCEINLINE __vec4_f __round_varying_float(__vec4_f v) { + return _mm_round_ps(v.v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); +} + +static FORCEINLINE __vec4_f __floor_varying_float(__vec4_f v) { + return _mm_round_ps(v.v, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); +} + +static FORCEINLINE __vec4_f __ceil_varying_float(__vec4_f v) { + return _mm_round_ps(v.v, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); +} + +static FORCEINLINE __vec4_d __round_varying_double(__vec4_d v) { + return __vec4_d(_mm_round_pd(v.v[0], _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC), + _mm_round_pd(v.v[1], _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +} + +static FORCEINLINE __vec4_d __floor_varying_double(__vec4_d v) { + return __vec4_d(_mm_round_pd(v.v[0], _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC), + _mm_round_pd(v.v[1], _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)); +} + +static FORCEINLINE __vec4_d __ceil_varying_double(__vec4_d v) { + return __vec4_d(_mm_round_pd(v.v[0], _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC), + _mm_round_pd(v.v[1], _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)); +} + +// min/max +static FORCEINLINE float __min_uniform_float(float a, float b) { return (a < b) ? a : b; } +static FORCEINLINE double __min_uniform_double(double a, double b) { return (a < b) ? a : b; } +static FORCEINLINE int32_t __min_uniform_int32(int32_t a, int32_t b) { return (a < b) ?
a : b; } +static FORCEINLINE int32_t __min_uniform_uint32(uint32_t a, uint32_t b) { return (a < b) ? a : b; } +static FORCEINLINE int64_t __min_uniform_int64(int64_t a, int64_t b) { return (a < b) ? a : b; } +static FORCEINLINE int64_t __min_uniform_uint64(uint64_t a, uint64_t b) { return (a < b) ? a : b; } + +static FORCEINLINE __vec4_f __max_varying_float(__vec4_f a, __vec4_f b) { + return _mm_max_ps(a.v, b.v); +} + +static FORCEINLINE __vec4_f __min_varying_float(__vec4_f a, __vec4_f b) { + return _mm_min_ps(a.v, b.v); +} + +static FORCEINLINE __vec4_d __max_varying_double(__vec4_d a, __vec4_d b) { + return __vec4_d(_mm_max_pd(a.v[0], b.v[0]), + _mm_max_pd(a.v[1], b.v[1])); +} + +static FORCEINLINE __vec4_d __min_varying_double(__vec4_d a, __vec4_d b) { + return __vec4_d(_mm_min_pd(a.v[0], b.v[0]), + _mm_min_pd(a.v[1], b.v[1])); +} + +static FORCEINLINE __vec4_i32 __max_varying_int32(__vec4_i32 a, __vec4_i32 b) { + return _mm_max_epi32(a.v, b.v); +} + +static FORCEINLINE __vec4_i32 __min_varying_int32(__vec4_i32 a, __vec4_i32 b) { + return _mm_min_epi32(a.v, b.v); +} + +static FORCEINLINE __vec4_i32 __max_varying_uint32(__vec4_i32 a, __vec4_i32 b) { + return _mm_max_epu32(a.v, b.v); +} + +static FORCEINLINE __vec4_i32 __min_varying_uint32(__vec4_i32 a, __vec4_i32 b) { + return _mm_min_epu32(a.v, b.v); +} + +static FORCEINLINE __vec4_i64 __max_varying_int64(__vec4_i64 a, __vec4_i64 b) { + return __vec4_i64((int64_t)a[0] > (int64_t)b[0] ? a[0] : b[0], + (int64_t)a[1] > (int64_t)b[1] ? a[1] : b[1], + (int64_t)a[2] > (int64_t)b[2] ? a[2] : b[2], + (int64_t)a[3] > (int64_t)b[3] ? a[3] : b[3]); +} + +static FORCEINLINE __vec4_i64 __min_varying_int64(__vec4_i64 a, __vec4_i64 b) { + return __vec4_i64((int64_t)a[0] < (int64_t)b[0] ? a[0] : b[0], + (int64_t)a[1] < (int64_t)b[1] ? a[1] : b[1], + (int64_t)a[2] < (int64_t)b[2] ? a[2] : b[2], + (int64_t)a[3] < (int64_t)b[3] ? a[3] : b[3]); +} + +static FORCEINLINE __vec4_i64 __max_varying_uint64(__vec4_i64 a, __vec4_i64 b) { + return __vec4_i64((uint64_t)a[0] > (uint64_t)b[0] ? a[0] : b[0], + (uint64_t)a[1] > (uint64_t)b[1] ? a[1] : b[1], + (uint64_t)a[2] > (uint64_t)b[2] ? a[2] : b[2], + (uint64_t)a[3] > (uint64_t)b[3] ? a[3] : b[3]); +} + +static FORCEINLINE __vec4_i64 __min_varying_uint64(__vec4_i64 a, __vec4_i64 b) { + return __vec4_i64((uint64_t)a[0] < (uint64_t)b[0] ? a[0] : b[0], + (uint64_t)a[1] < (uint64_t)b[1] ? a[1] : b[1], + (uint64_t)a[2] < (uint64_t)b[2] ? a[2] : b[2], + (uint64_t)a[3] < (uint64_t)b[3] ? a[3] : b[3]); +} + +// sqrt/rsqrt/rcp + +static FORCEINLINE float __rsqrt_uniform_float(float v) { + __m128 vv = _mm_set_ss(v); + __m128 rsqrt = _mm_rsqrt_ss(vv); + // Newton-Raphson iteration to improve precision + // return 0.5 * rsqrt * (3. 
- (v * rsqrt) * rsqrt); + __m128 v_rsqrt = _mm_mul_ss(rsqrt, vv); + __m128 v_r_r = _mm_mul_ss(v_rsqrt, rsqrt); + __m128 three_sub = _mm_sub_ss(_mm_set_ss(3.f), v_r_r); + __m128 rs_mul = _mm_mul_ss(rsqrt, three_sub); + __m128 half_scale = _mm_mul_ss(_mm_set_ss(0.5), rs_mul); + return bits_as_float(_mm_extract_ps(half_scale, 0)); +} + +static FORCEINLINE float __rcp_uniform_float(float v) { + __m128 rcp = _mm_rcp_ss(_mm_set_ss(v)); + // N-R iteration: + __m128 m = _mm_mul_ss(_mm_set_ss(v), rcp); + __m128 twominus = _mm_sub_ss(_mm_set_ss(2.f), m); + __m128 r = _mm_mul_ss(rcp, twominus); + return bits_as_float(_mm_extract_ps(r, 0)); +} + +static FORCEINLINE float __sqrt_uniform_float(float v) { + __m128 r = _mm_set_ss(v); + r = _mm_sqrt_ss(r); + return bits_as_float(_mm_extract_ps(r, 0)); +} + +static FORCEINLINE double __sqrt_uniform_double(double v) { + __m128d r = _mm_set_sd(v); + r = _mm_sqrt_sd(r, r); + return _mm_extract_pd(r, 0); +} + +static FORCEINLINE __vec4_f __rcp_varying_float(__vec4_f v) { + __m128 rcp = _mm_rcp_ps(v.v); + // N-R iteration: + __m128 m = _mm_mul_ps(v.v, rcp); + __m128 twominus = _mm_sub_ps(_mm_set1_ps(2.f), m); + __m128 r = _mm_mul_ps(rcp, twominus); + return r; +} + +static FORCEINLINE __vec4_f __rsqrt_varying_float(__vec4_f v) { + __m128 rsqrt = _mm_rsqrt_ps(v.v); + // Newton-Raphson iteration to improve precision + // return 0.5 * rsqrt * (3. - (v * rsqrt) * rsqrt); + __m128 v_rsqrt = _mm_mul_ps(rsqrt, v.v); + __m128 v_r_r = _mm_mul_ps(v_rsqrt, rsqrt); + __m128 three_sub = _mm_sub_ps(_mm_set1_ps(3.f), v_r_r); + __m128 rs_mul = _mm_mul_ps(rsqrt, three_sub); + __m128 half_scale = _mm_mul_ps(_mm_set1_ps(0.5), rs_mul); + return half_scale; +} + +static FORCEINLINE __vec4_f __sqrt_varying_float(__vec4_f v) { + return _mm_sqrt_ps(v.v); +} + +static FORCEINLINE __vec4_d __sqrt_varying_double(__vec4_d v) { + return __vec4_d(_mm_sqrt_pd(v.v[0]), _mm_sqrt_pd(v.v[1])); +} + +/////////////////////////////////////////////////////////////////////////// +// bit ops + +static FORCEINLINE int32_t __popcnt_int32(uint32_t v) { + return _mm_popcnt_u32(v); +} + +static FORCEINLINE int32_t __popcnt_int64(uint64_t v) { + return _mm_popcnt_u64(v); +} + +static FORCEINLINE int32_t __count_trailing_zeros_i32(uint32_t v) { +#if 0 + // http://aggregate.org/MAGIC/#Trailing Zero Count + return __popcnt_int32((v & -v) - 1); +#else +#ifdef _MSC_VER + unsigned long i; + _BitScanForward(&i, v); + return i; +#else + return __builtin_ctz(v); +#endif +#endif +} + +static FORCEINLINE int64_t __count_trailing_zeros_i64(uint64_t v) { +#if 0 + // http://aggregate.org/MAGIC/#Trailing Zero Count + return __popcnt_int64((v & -v) - 1); +#else +#ifdef _MSC_VER + unsigned long i; + _BitScanForward64(&i, v); + return i; +#else + return __builtin_ctzl(v); +#endif +#endif +} + +static FORCEINLINE int32_t __count_leading_zeros_i32(uint32_t v) { +#ifdef _MSC_VER + unsigned long i; + _BitScanReverse(&i, v); + return i; +#else + return __builtin_clz(v); +#endif +} + +static FORCEINLINE int64_t __count_leading_zeros_i64(uint64_t v) { +#ifdef _MSC_VER + unsigned long i; + _BitScanReverse64(&i, v); + return i; +#else + return __builtin_clzl(v); +#endif +} + + +/////////////////////////////////////////////////////////////////////////// +// reductions + +static FORCEINLINE float __reduce_add_float(__vec4_f v) { + float r = bits_as_float(_mm_extract_ps(v.v, 0)); + r += bits_as_float(_mm_extract_ps(v.v, 1)); + r += bits_as_float(_mm_extract_ps(v.v, 2)); + r += bits_as_float(_mm_extract_ps(v.v, 3)); + return r; +} + 
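+// Like __reduce_add_float() above, the reductions that follow extract each lane
+// individually and combine the values in scalar code rather than with horizontal
+// SIMD operations.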
+static FORCEINLINE float __reduce_min_float(__vec4_f v) { + float r = bits_as_float(_mm_extract_ps(v.v, 0)); + float val = bits_as_float(_mm_extract_ps(v.v, 1)); + r = (r < val) ? r : val; + val = bits_as_float(_mm_extract_ps(v.v, 2)); + r = (r < val) ? r : val; + val = bits_as_float(_mm_extract_ps(v.v, 3)); + r = (r < val) ? r : val; + return r; +} + +static FORCEINLINE float __reduce_max_float(__vec4_f v) { + float r = bits_as_float(_mm_extract_ps(v.v, 0)); + float val = bits_as_float(_mm_extract_ps(v.v, 1)); + r = (r > val) ? r : val; + val = bits_as_float(_mm_extract_ps(v.v, 2)); + r = (r > val) ? r : val; + val = bits_as_float(_mm_extract_ps(v.v, 3)); + r = (r > val) ? r : val; + return r; +} + +static FORCEINLINE double __reduce_add_double(__vec4_d v) { + double r = _mm_extract_pd(v.v[0], 0); + r += _mm_extract_pd(v.v[0], 1); + r += _mm_extract_pd(v.v[1], 0); + r += _mm_extract_pd(v.v[1], 1); + return r; +} + +static FORCEINLINE double __reduce_min_double(__vec4_d v) { + double r = _mm_extract_pd(v.v[0], 0); + r = (r < _mm_extract_pd(v.v[0], 1)) ? r : _mm_extract_pd(v.v[0], 1); + r = (r < _mm_extract_pd(v.v[1], 0)) ? r : _mm_extract_pd(v.v[1], 0); + r = (r < _mm_extract_pd(v.v[1], 1)) ? r : _mm_extract_pd(v.v[1], 1); + return r; +} + +static FORCEINLINE double __reduce_max_double(__vec4_d v) { + double r = _mm_extract_pd(v.v[0], 0); + r = (r > _mm_extract_pd(v.v[0], 1)) ? r : _mm_extract_pd(v.v[0], 1); + r = (r > _mm_extract_pd(v.v[1], 0)) ? r : _mm_extract_pd(v.v[1], 0); + r = (r > _mm_extract_pd(v.v[1], 1)) ? r : _mm_extract_pd(v.v[1], 1); + return r; +} + +static FORCEINLINE uint32_t __reduce_add_int32(__vec4_i32 v) { + int32_t r = _mm_extract_epi32(v.v, 0); + r += _mm_extract_epi32(v.v, 1); + r += _mm_extract_epi32(v.v, 2); + r += _mm_extract_epi32(v.v, 3); + return r; +} + +static FORCEINLINE int32_t __reduce_min_int32(__vec4_i32 v) { + int32_t r = _mm_extract_epi32(v.v, 0); + int32_t val = _mm_extract_epi32(v.v, 1); + r = (r < val) ? r : val; + val = _mm_extract_epi32(v.v, 2); + r = (r < val) ? r : val; + val = _mm_extract_epi32(v.v, 3); + r = (r < val) ? r : val; + return r; +} + +static FORCEINLINE int32_t __reduce_max_int32(__vec4_i32 v) { + int32_t r = _mm_extract_epi32(v.v, 0); + int32_t val = _mm_extract_epi32(v.v, 1); + r = (r > val) ? r : val; + val = _mm_extract_epi32(v.v, 2); + r = (r > val) ? r : val; + val = _mm_extract_epi32(v.v, 3); + r = (r > val) ? r : val; + + return r; +} + +static FORCEINLINE uint32_t __reduce_add_uint32(__vec4_i32 v) { + uint32_t r = _mm_extract_epi32(v.v, 0); + r += _mm_extract_epi32(v.v, 1); + r += _mm_extract_epi32(v.v, 2); + r += _mm_extract_epi32(v.v, 3); + return r; +} + +static FORCEINLINE uint32_t __reduce_min_uint32(__vec4_i32 v) { + uint32_t r = _mm_extract_epi32(v.v, 0); + uint32_t val = _mm_extract_epi32(v.v, 1); + r = (r < val) ? r : val; + val = _mm_extract_epi32(v.v, 2); + r = (r < val) ? r : val; + val = _mm_extract_epi32(v.v, 3); + r = (r < val) ? r : val; + return r; +} + +static FORCEINLINE uint32_t __reduce_max_uint32(__vec4_i32 v) { + uint32_t r = _mm_extract_epi32(v.v, 0); + uint32_t val = _mm_extract_epi32(v.v, 1); + r = (r > val) ? r : val; + val = _mm_extract_epi32(v.v, 2); + r = (r > val) ? r : val; + val = _mm_extract_epi32(v.v, 3); + r = (r > val) ? 
r : val; + return r; +} + +static FORCEINLINE uint64_t __reduce_add_int64(__vec4_i64 v) { + int64_t r = _mm_extract_epi64(v.v[0], 0); + r += _mm_extract_epi64(v.v[0], 1); + r += _mm_extract_epi64(v.v[1], 0); + r += _mm_extract_epi64(v.v[1], 1); + return r; +} + +static FORCEINLINE int64_t __reduce_min_int64(__vec4_i64 v) { + int64_t r = _mm_extract_epi64(v.v[0], 0); + r = ((int64_t)_mm_extract_epi64(v.v[0], 1) < r) ? _mm_extract_epi64(v.v[0], 1) : r; + r = ((int64_t)_mm_extract_epi64(v.v[1], 0) < r) ? _mm_extract_epi64(v.v[1], 0) : r; + r = ((int64_t)_mm_extract_epi64(v.v[1], 1) < r) ? _mm_extract_epi64(v.v[1], 1) : r; + return r; +} + +static FORCEINLINE int64_t __reduce_max_int64(__vec4_i64 v) { + int64_t r = _mm_extract_epi64(v.v[0], 0); + r = ((int64_t)_mm_extract_epi64(v.v[0], 1) > r) ? _mm_extract_epi64(v.v[0], 1) : r; + r = ((int64_t)_mm_extract_epi64(v.v[1], 0) > r) ? _mm_extract_epi64(v.v[1], 0) : r; + r = ((int64_t)_mm_extract_epi64(v.v[1], 1) > r) ? _mm_extract_epi64(v.v[1], 1) : r; + return r; +} + +static FORCEINLINE uint64_t __reduce_add_uint64(__vec4_i64 v) { + uint64_t r = _mm_extract_epi64(v.v[0], 0); + r += _mm_extract_epi64(v.v[0], 1); + r += _mm_extract_epi64(v.v[1], 0); + r += _mm_extract_epi64(v.v[1], 1); + return r; +} + +static FORCEINLINE uint64_t __reduce_min_uint64(__vec4_i64 v) { + uint64_t r = _mm_extract_epi64(v.v[0], 0); + r = ((uint64_t)_mm_extract_epi64(v.v[0], 1) < r) ? _mm_extract_epi64(v.v[0], 1) : r; + r = ((uint64_t)_mm_extract_epi64(v.v[1], 0) < r) ? _mm_extract_epi64(v.v[1], 0) : r; + r = ((uint64_t)_mm_extract_epi64(v.v[1], 1) < r) ? _mm_extract_epi64(v.v[1], 1) : r; + return r; +} + +static FORCEINLINE uint64_t __reduce_max_uint64(__vec4_i64 v) { + uint64_t r = _mm_extract_epi64(v.v[0], 0); + r = ((uint64_t)_mm_extract_epi64(v.v[0], 1) > r) ? _mm_extract_epi64(v.v[0], 1) : r; + r = ((uint64_t)_mm_extract_epi64(v.v[1], 0) > r) ? _mm_extract_epi64(v.v[1], 0) : r; + r = ((uint64_t)_mm_extract_epi64(v.v[1], 1) > r) ? 
_mm_extract_epi64(v.v[1], 1) : r; + return r; +} + +/////////////////////////////////////////////////////////////////////////// +// masked load/store + +static FORCEINLINE __vec4_i8 __masked_load_8(unsigned char *p, + __vec4_i1 mask) { + int8_t r[4]; + int8_t *ptr = (int8_t *)p; + uint32_t m = _mm_extract_ps(mask.v, 0); + if (m != 0) + r[0] = ptr[0]; + m = _mm_extract_ps(mask.v, 1); + if (m != 0) + r[1] = ptr[1]; + m = _mm_extract_ps(mask.v, 2); + if (m != 0) + r[2] = ptr[2]; + m = _mm_extract_ps(mask.v, 3); + if (m != 0) + r[3] = ptr[3]; + + return __vec4_i8(r[0], r[1], r[2], r[3]); +} + +static FORCEINLINE __vec4_i16 __masked_load_16(unsigned char *p, + __vec4_i1 mask) { + int16_t r[4]; + int16_t *ptr = (int16_t *)p; + + uint32_t m = _mm_extract_ps(mask.v, 0); + if (m != 0) + r[0] = ptr[0]; + + m = _mm_extract_ps(mask.v, 1); + if (m != 0) + r[1] = ptr[1]; + + m = _mm_extract_ps(mask.v, 2); + if (m != 0) + r[2] = ptr[2]; + + m = _mm_extract_ps(mask.v, 3); + if (m != 0) + r[3] = ptr[3]; + + return __vec4_i16(r[0], r[1], r[2], r[3]); +} + +static FORCEINLINE __vec4_i32 __masked_load_32(unsigned char *p, + __vec4_i1 mask) { + __m128i r = _mm_set_epi32(0, 0, 0, 0); + int32_t *ptr = (int32_t *)p; + uint32_t m = _mm_extract_ps(mask.v, 0); + if (m != 0) + r = _mm_insert_epi32(r, ptr[0], 0); + + m = _mm_extract_ps(mask.v, 1); + if (m != 0) + r = _mm_insert_epi32(r, ptr[1], 1); + + m = _mm_extract_ps(mask.v, 2); + if (m != 0) + r = _mm_insert_epi32(r, ptr[2], 2); + + m = _mm_extract_ps(mask.v, 3); + if (m != 0) + r = _mm_insert_epi32(r, ptr[3], 3); + + return r; +} + +static FORCEINLINE __vec4_i64 __masked_load_64(unsigned char *p, + __vec4_i1 mask) { + uint64_t r[4]; + uint64_t *ptr = (uint64_t *)p; + uint32_t m = _mm_extract_ps(mask.v, 0); + if (m != 0) + r[0] = ptr[0]; + + m = _mm_extract_ps(mask.v, 1); + if (m != 0) + r[1] = ptr[1]; + + m = _mm_extract_ps(mask.v, 2); + if (m != 0) + r[2] = ptr[2]; + + m = _mm_extract_ps(mask.v, 3); + if (m != 0) + r[3] = ptr[3]; + + return __vec4_i64(r[0], r[1], r[2], r[3]); +} + +static FORCEINLINE void __masked_store_8(unsigned char *p, __vec4_i8 val, + __vec4_i1 mask) { + int8_t *ptr = (int8_t *)p; + + uint32_t m = _mm_extract_ps(mask.v, 0); + if (m != 0) + ptr[0] = _mm_extract_epi8(val.v, 0); + + m = _mm_extract_ps(mask.v, 1); + if (m != 0) + ptr[1] = _mm_extract_epi8(val.v, 1); + + m = _mm_extract_ps(mask.v, 2); + if (m != 0) + ptr[2] = _mm_extract_epi8(val.v, 2); + + m = _mm_extract_ps(mask.v, 3); + if (m != 0) + ptr[3] = _mm_extract_epi8(val.v, 3); +} + +static FORCEINLINE void __masked_store_16(unsigned char *p, __vec4_i16 val, __vec4_i1 mask) { + int16_t *ptr = (int16_t *)p; + + uint32_t m = _mm_extract_ps(mask.v, 0); + if (m != 0) + ptr[0] = _mm_extract_epi16(val.v, 0); + + m = _mm_extract_ps(mask.v, 1); + if (m != 0) + ptr[1] = _mm_extract_epi16(val.v, 1); + + m = _mm_extract_ps(mask.v, 2); + if (m != 0) + ptr[2] = _mm_extract_epi16(val.v, 2); + + m = _mm_extract_ps(mask.v, 3); + if (m != 0) + ptr[3] = _mm_extract_epi16(val.v, 3); +} + +static FORCEINLINE void __masked_store_32(unsigned char *p, __vec4_i32 val, + __vec4_i1 mask) { + int32_t *ptr = (int32_t *)p; + uint32_t m = _mm_extract_ps(mask.v, 0); + if (m != 0) + ptr[0] = _mm_extract_epi32(val.v, 0); + + m = _mm_extract_ps(mask.v, 1); + if (m != 0) + ptr[1] = _mm_extract_epi32(val.v, 1); + + m = _mm_extract_ps(mask.v, 2); + if (m != 0) + ptr[2] = _mm_extract_epi32(val.v, 2); + + m = _mm_extract_ps(mask.v, 3); + if (m != 0) + ptr[3] = _mm_extract_epi32(val.v, 3); +} + +static FORCEINLINE void 
__masked_store_64(unsigned char *p, __vec4_i64 val, + __vec4_i1 mask) { + int64_t *ptr = (int64_t *)p; + uint32_t m = _mm_extract_ps(mask.v, 0); + if (m != 0) + ptr[0] = _mm_extract_epi64(val.v[0], 0); + + m = _mm_extract_ps(mask.v, 1); + if (m != 0) + ptr[1] = _mm_extract_epi64(val.v[0], 1); + + m = _mm_extract_ps(mask.v, 2); + if (m != 0) + ptr[2] = _mm_extract_epi64(val.v[1], 0); + + m = _mm_extract_ps(mask.v, 3); + if (m != 0) + ptr[3] = _mm_extract_epi64(val.v[1], 1); +} + +/////////////////////////////////////////////////////////////////////////// +// gather/scatter +// offsets * offsetScale is in bytes (for all of these) + +template <typename RetVec, typename RetScalar> +static FORCEINLINE RetVec +lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p, + __vec4_i32 offsets, uint32_t scale, __vec4_i1 mask) { + RetScalar r[4]; +#if 1 + // "Fast gather" trick... + offsets = __select(mask, offsets, __smear_i32(0)); + int offset = scale * _mm_extract_epi32(offsets.v, 0); + RetScalar *ptr = (RetScalar *)(p + offset); + r[0] = *ptr; + + offset = scale * _mm_extract_epi32(offsets.v, 1); + ptr = (RetScalar *)(p + offset); + r[1] = *ptr; + + offset = scale * _mm_extract_epi32(offsets.v, 2); + ptr = (RetScalar *)(p + offset); + r[2] = *ptr; + + offset = scale * _mm_extract_epi32(offsets.v, 3); + ptr = (RetScalar *)(p + offset); + r[3] = *ptr; +#else + uint32_t m = _mm_extract_ps(mask.v, 0); + if (m != 0) { + int offset = scale * _mm_extract_epi32(offsets.v, 0); + RetScalar *ptr = (RetScalar *)(p + offset); + r[0] = *ptr; + } + + m = _mm_extract_ps(mask.v, 1); + if (m != 0) { + int offset = scale * _mm_extract_epi32(offsets.v, 1); + RetScalar *ptr = (RetScalar *)(p + offset); + r[1] = *ptr; + } + + m = _mm_extract_ps(mask.v, 2); + if (m != 0) { + int offset = scale * _mm_extract_epi32(offsets.v, 2); + RetScalar *ptr = (RetScalar *)(p + offset); + r[2] = *ptr; + } + + m = _mm_extract_ps(mask.v, 3); + if (m != 0) { + int offset = scale * _mm_extract_epi32(offsets.v, 3); + RetScalar *ptr = (RetScalar *)(p + offset); + r[3] = *ptr; + } +#endif + return RetVec(r[0], r[1], r[2], r[3]); +} + +template <typename RetVec, typename RetScalar> +static FORCEINLINE RetVec +lGatherBaseOffsets64(RetVec, RetScalar, unsigned char *p, __vec4_i64 offsets, + uint32_t scale, __vec4_i1 mask) { + RetScalar r[4]; +#if 1 + // "Fast gather" trick... 
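+    // As in lGatherBaseOffsets32() above: offsets for inactive lanes are forced to
+    // zero and all four lanes are then loaded unconditionally, which avoids the
+    // per-lane branching of the #else path. This relies on the base pointer itself
+    // pointing to readable memory; the values loaded for inactive lanes are not
+    // meaningful.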
+ offsets = __select(mask, offsets, __smear_i64(0)); + int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0); + RetScalar *ptr = (RetScalar *)(p + offset); + r[0] = *ptr; + + offset = scale * _mm_extract_epi64(offsets.v[0], 1); + ptr = (RetScalar *)(p + offset); + r[1] = *ptr; + + offset = scale * _mm_extract_epi64(offsets.v[1], 0); + ptr = (RetScalar *)(p + offset); + r[2] = *ptr; + + offset = scale * _mm_extract_epi64(offsets.v[1], 1); + ptr = (RetScalar *)(p + offset); + r[3] = *ptr; +#else + uint32_t m = _mm_extract_ps(mask.v, 0); + if (m != 0) { + int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0); + RetScalar *ptr = (RetScalar *)(p + offset); + r[0] = *ptr; + } + + m = _mm_extract_ps(mask.v, 1); + if (m != 0) { + int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1); + RetScalar *ptr = (RetScalar *)(p + offset); + r[1] = *ptr; + } + + m = _mm_extract_ps(mask.v, 2); + if (m != 0) { + int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0); + RetScalar *ptr = (RetScalar *)(p + offset); + r[2] = *ptr; + } + + m = _mm_extract_ps(mask.v, 3); + if (m != 0) { + int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1); + RetScalar *ptr = (RetScalar *)(p + offset); + r[3] = *ptr; + } +#endif + return RetVec(r[0], r[1], r[2], r[3]); + +} + +static FORCEINLINE __vec4_i8 +__gather_base_offsets32_i8(unsigned char *b, __vec4_i32 offsets, + uint32_t scale, __vec4_i1 mask) { + return lGatherBaseOffsets32(__vec4_i8(), uint8_t(), b, offsets, scale, + mask); +} + +static FORCEINLINE __vec4_i8 +__gather_base_offsets64_i8(unsigned char *b, __vec4_i64 offsets, + uint32_t scale, __vec4_i1 mask) { + return lGatherBaseOffsets64(__vec4_i8(), uint8_t(), b, offsets, scale, + mask); +} + +static FORCEINLINE __vec4_i16 +__gather_base_offsets32_i16(unsigned char *b, __vec4_i32 offsets, + uint32_t scale, __vec4_i1 mask) { + return lGatherBaseOffsets32(__vec4_i16(), uint16_t(), b, offsets, scale, + mask); +} + +static FORCEINLINE __vec4_i16 + __gather_base_offsets64_i16(unsigned char *b, __vec4_i64 offsets, + uint32_t scale, __vec4_i1 mask) { + return lGatherBaseOffsets64(__vec4_i16(), uint16_t(), b, offsets, scale, + mask); +} + +static FORCEINLINE __vec4_i32 +__gather_base_offsets32_i32(uint8_t *p, __vec4_i32 offsets, + uint32_t scale, __vec4_i1 mask) { + __m128i r = _mm_set_epi32(0, 0, 0, 0); +#if 1 + // "Fast gather"... 
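+    // Same zero-offset approach as the lGatherBaseOffsets32/64 templates above,
+    // specialized for 32-bit elements so the result can be assembled directly in an
+    // __m128i with _mm_insert_epi32() instead of going through a scalar array.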
+ offsets = __select(mask, offsets, __smear_i32(0)); + + int offset = scale * _mm_extract_epi32(offsets.v, 0); + uint32_t *ptr = (uint32_t *)(p + offset); + r = _mm_insert_epi32(r, *ptr, 0); + + offset = scale * _mm_extract_epi32(offsets.v, 1); + ptr = (uint32_t *)(p + offset); + r = _mm_insert_epi32(r, *ptr, 1); + + offset = scale * _mm_extract_epi32(offsets.v, 2); + ptr = (uint32_t *)(p + offset); + r = _mm_insert_epi32(r, *ptr, 2); + + offset = scale * _mm_extract_epi32(offsets.v, 3); + ptr = (uint32_t *)(p + offset); + r = _mm_insert_epi32(r, *ptr, 3); +#else + uint32_t m = _mm_extract_ps(mask.v, 0); + if (m != 0) { + int offset = scale * _mm_extract_epi32(offsets.v, 0); + uint32_t *ptr = (uint32_t *)(p + offset); + r = _mm_insert_epi32(r, *ptr, 0); + } + + m = _mm_extract_ps(mask.v, 1); + if (m != 0) { + int offset = scale * _mm_extract_epi32(offsets.v, 1); + uint32_t *ptr = (uint32_t *)(p + offset); + r = _mm_insert_epi32(r, *ptr, 1); + } + + m = _mm_extract_ps(mask.v, 2); + if (m != 0) { + int offset = scale * _mm_extract_epi32(offsets.v, 2); + uint32_t *ptr = (uint32_t *)(p + offset); + r = _mm_insert_epi32(r, *ptr, 2); + } + + m = _mm_extract_ps(mask.v, 3); + if (m != 0) { + int offset = scale * _mm_extract_epi32(offsets.v, 3); + uint32_t *ptr = (uint32_t *)(p + offset); + r = _mm_insert_epi32(r, *ptr, 3); + } +#endif + return r; +} + +static FORCEINLINE __vec4_i32 +__gather_base_offsets64_i32(unsigned char *p, __vec4_i64 offsets, + uint32_t scale, __vec4_i1 mask) { + return lGatherBaseOffsets64(__vec4_i32(), uint32_t(), p, offsets, scale, + mask); +} + +static FORCEINLINE __vec4_i64 +__gather_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets, + uint32_t scale, __vec4_i1 mask) { + return lGatherBaseOffsets32(__vec4_i64(), uint64_t(), p, offsets, scale, + mask); +} + +static FORCEINLINE __vec4_i64 +__gather_base_offsets64_i64(unsigned char *p, __vec4_i64 offsets, + uint32_t scale, __vec4_i1 mask) { + return lGatherBaseOffsets64(__vec4_i64(), uint64_t(), p, offsets, scale, + mask); +} + +template <typename RetVec, typename RetScalar> +static FORCEINLINE RetVec lGather32(RetVec, RetScalar, __vec4_i32 ptrs, + __vec4_i1 mask) { + RetScalar r[4]; + uint32_t m = _mm_extract_ps(mask.v, 0); + if (m != 0) { + RetScalar *ptr = (RetScalar *)_mm_extract_epi32(ptrs.v, 0); + r[0] = *ptr; + } + + m = _mm_extract_ps(mask.v, 1); + if (m != 0) { + RetScalar *ptr = (RetScalar *)_mm_extract_epi32(ptrs.v, 1); + r[1] = *ptr; + } + + m = _mm_extract_ps(mask.v, 2); + if (m != 0) { + RetScalar *ptr = (RetScalar *)_mm_extract_epi32(ptrs.v, 2); + r[2] = *ptr; + } + + m = _mm_extract_ps(mask.v, 3); + if (m != 0) { + RetScalar *ptr = (RetScalar *)_mm_extract_epi32(ptrs.v, 3); + r[3] = *ptr; + } + + return RetVec(r[0], r[1], r[2], r[3]); +} + +template <typename RetVec, typename RetScalar> +static FORCEINLINE RetVec lGather64(RetVec, RetScalar, __vec4_i64 ptrs, + __vec4_i1 mask) { + RetScalar r[4]; + uint32_t m = _mm_extract_ps(mask.v, 0); + if (m != 0) { + RetScalar *ptr = (RetScalar *)_mm_extract_epi64(ptrs.v[0], 0); + r[0] = *ptr; + } + m = _mm_extract_ps(mask.v, 1); + if (m != 0) { + RetScalar *ptr = (RetScalar *)_mm_extract_epi64(ptrs.v[0], 1); + r[1] = *ptr; + } + m = _mm_extract_ps(mask.v, 2); + if (m != 0) { + RetScalar *ptr = (RetScalar *)_mm_extract_epi64(ptrs.v[1], 0); + r[2] = *ptr; + } + m = _mm_extract_ps(mask.v, 3); + if (m != 0) { + RetScalar *ptr = (RetScalar *)_mm_extract_epi64(ptrs.v[1], 1); + r[3] = *ptr; + } + return RetVec(r[0], r[1], r[2], r[3]); +} + + +static FORCEINLINE __vec4_i8 __gather32_i8(__vec4_i32 ptrs, __vec4_i1 mask) { + return 
lGather32(__vec4_i8(), uint8_t(), ptrs, mask); +} + +static FORCEINLINE __vec4_i8 __gather64_i8(__vec4_i64 ptrs, __vec4_i1 mask) { + return lGather64(__vec4_i8(), uint8_t(), ptrs, mask); +} + +static FORCEINLINE __vec4_i16 __gather32_i16(__vec4_i32 ptrs, __vec4_i1 mask) { + return lGather32(__vec4_i16(), uint16_t(), ptrs, mask); +} + +static FORCEINLINE __vec4_i16 __gather64_i16(__vec4_i64 ptrs, __vec4_i1 mask) { + return lGather64(__vec4_i16(), uint16_t(), ptrs, mask); +} + +static FORCEINLINE __vec4_i32 __gather32_i32(__vec4_i32 ptrs, __vec4_i1 mask) { + __m128i r = _mm_set_epi32(0, 0, 0, 0); + uint32_t m = _mm_extract_ps(mask.v, 0); + if (m != 0) { + int32_t *ptr = (int32_t *)_mm_extract_epi32(ptrs.v, 0); + r = _mm_insert_epi32(r, *ptr, 0); + } + + m = _mm_extract_ps(mask.v, 1); + if (m != 0) { + int32_t *ptr = (int32_t *)_mm_extract_epi32(ptrs.v, 1); + r = _mm_insert_epi32(r, *ptr, 1); + } + + m = _mm_extract_ps(mask.v, 2); + if (m != 0) { + int32_t *ptr = (int32_t *)_mm_extract_epi32(ptrs.v, 2); + r = _mm_insert_epi32(r, *ptr, 2); + } + + m = _mm_extract_ps(mask.v, 3); + if (m != 0) { + int32_t *ptr = (int32_t *)_mm_extract_epi32(ptrs.v, 3); + r = _mm_insert_epi32(r, *ptr, 3); + } + + return r; +} + +static FORCEINLINE __vec4_i32 __gather64_i32(__vec4_i64 ptrs, __vec4_i1 mask) { + __m128i r = _mm_set_epi32(0, 0, 0, 0); + + uint32_t m = _mm_extract_ps(mask.v, 0); + if (m != 0) { + int32_t *ptr = (int32_t *)_mm_extract_epi64(ptrs.v[0], 0); + r = _mm_insert_epi32(r, *ptr, 0); + } + + m = _mm_extract_ps(mask.v, 1); + if (m != 0) { + int32_t *ptr = (int32_t *)_mm_extract_epi64(ptrs.v[0], 1); + r = _mm_insert_epi32(r, *ptr, 1); + } + + m = _mm_extract_ps(mask.v, 2); + if (m != 0) { + int32_t *ptr = (int32_t *)_mm_extract_epi64(ptrs.v[1], 0); + r = _mm_insert_epi32(r, *ptr, 2); + } + + m = _mm_extract_ps(mask.v, 3); + if (m != 0) { + int32_t *ptr = (int32_t *)_mm_extract_epi64(ptrs.v[1], 1); + r = _mm_insert_epi32(r, *ptr, 3); + } + + return r; +} + +static FORCEINLINE __vec4_i64 __gather32_i64(__vec4_i32 ptrs, __vec4_i1 mask) { + return lGather32(__vec4_i64(), uint64_t(), ptrs, mask); +} + +static FORCEINLINE __vec4_i64 __gather64_i64(__vec4_i64 ptrs, __vec4_i1 mask) { + return lGather64(__vec4_i64(), uint64_t(), ptrs, mask); +} + +// scatter + +static FORCEINLINE void +__scatter_base_offsets32_i8(unsigned char *b, __vec4_i32 offsets, + uint32_t scale, __vec4_i8 val, __vec4_i1 mask) { + uint32_t m = _mm_extract_ps(mask.v, 0); + if (m != 0) { + int8_t *ptr = (int8_t *)(b + scale * _mm_extract_epi32(offsets.v, 0)); + *ptr = _mm_extract_epi8(val.v, 0); + } + + m = _mm_extract_ps(mask.v, 1); + if (m != 0) { + int8_t *ptr = (int8_t *)(b + scale * _mm_extract_epi32(offsets.v, 1)); + *ptr = _mm_extract_epi8(val.v, 1); + } + + m = _mm_extract_ps(mask.v, 2); + if (m != 0) { + int8_t *ptr = (int8_t *)(b + scale * _mm_extract_epi32(offsets.v, 2)); + *ptr = _mm_extract_epi8(val.v, 2); + } + + m = _mm_extract_ps(mask.v, 3); + if (m != 0) { + int8_t *ptr = (int8_t *)(b + scale * _mm_extract_epi32(offsets.v, 3)); + *ptr = _mm_extract_epi8(val.v, 3); + } +} + +static FORCEINLINE void +__scatter_base_offsets64_i8(unsigned char *p, __vec4_i64 offsets, + uint32_t scale, __vec4_i8 val, __vec4_i1 mask) { + uint32_t m = _mm_extract_ps(mask.v, 0); + if (m != 0) { + int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0); + uint8_t *ptr = (uint8_t *)(p + offset); + *ptr = _mm_extract_epi8(val.v, 0); + } + + m = _mm_extract_ps(mask.v, 1); + if (m != 0) { + int64_t offset = scale * 
_mm_extract_epi64(offsets.v[0], 1); + uint8_t *ptr = (uint8_t *)(p + offset); + *ptr = _mm_extract_epi8(val.v, 1); + } + + m = _mm_extract_ps(mask.v, 2); + if (m != 0) { + int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0); + uint8_t *ptr = (uint8_t *)(p + offset); + *ptr = _mm_extract_epi8(val.v, 2); + } + + m = _mm_extract_ps(mask.v, 3); + if (m != 0) { + int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1); + uint8_t *ptr = (uint8_t *)(p + offset); + *ptr = _mm_extract_epi8(val.v, 3); + } +} + +static FORCEINLINE void +__scatter_base_offsets32_i16(unsigned char *b, __vec4_i32 offsets, + uint32_t scale, __vec4_i16 val, __vec4_i1 mask) { + uint32_t m = _mm_extract_ps(mask.v, 0); + if (m != 0) { + int16_t *ptr = (int16_t *)(b + scale * _mm_extract_epi32(offsets.v, 0)); + *ptr = _mm_extract_epi16(val.v, 0); + } + + m = _mm_extract_ps(mask.v, 1); + if (m != 0) { + int16_t *ptr = (int16_t *)(b + scale * _mm_extract_epi32(offsets.v, 1)); + *ptr = _mm_extract_epi16(val.v, 1); + } + + m = _mm_extract_ps(mask.v, 2); + if (m != 0) { + int16_t *ptr = (int16_t *)(b + scale * _mm_extract_epi32(offsets.v, 2)); + *ptr = _mm_extract_epi16(val.v, 2); + } + + m = _mm_extract_ps(mask.v, 3); + if (m != 0) { + int16_t *ptr = (int16_t *)(b + scale * _mm_extract_epi32(offsets.v, 3)); + *ptr = _mm_extract_epi16(val.v, 3); + } +} + +static FORCEINLINE void +__scatter_base_offsets64_i16(unsigned char *p, __vec4_i64 offsets, + uint32_t scale, __vec4_i16 val, __vec4_i1 mask) { + uint32_t m = _mm_extract_ps(mask.v, 0); + if (m != 0) { + int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0); + uint16_t *ptr = (uint16_t *)(p + offset); + *ptr = _mm_extract_epi16(val.v, 0); + } + + m = _mm_extract_ps(mask.v, 1); + if (m != 0) { + int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1); + uint16_t *ptr = (uint16_t *)(p + offset); + *ptr = _mm_extract_epi16(val.v, 1); + } + + m = _mm_extract_ps(mask.v, 2); + if (m != 0) { + int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0); + uint16_t *ptr = (uint16_t *)(p + offset); + *ptr = _mm_extract_epi16(val.v, 2); + } + + m = _mm_extract_ps(mask.v, 3); + if (m != 0) { + int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1); + uint16_t *ptr = (uint16_t *)(p + offset); + *ptr = _mm_extract_epi16(val.v, 3); + } +} + +static FORCEINLINE void +__scatter_base_offsets32_i32(unsigned char *b, __vec4_i32 offsets, + uint32_t scale, __vec4_i32 val, __vec4_i1 mask) { + uint32_t m = _mm_extract_ps(mask.v, 0); + if (m != 0) { + int32_t *ptr = (int32_t *)(b + scale * + _mm_extract_epi32(offsets.v, 0)); + *ptr = _mm_extract_epi32(val.v, 0); + } + + m = _mm_extract_ps(mask.v, 1); + if (m != 0) { + int32_t *ptr = (int32_t *)(b + scale * + _mm_extract_epi32(offsets.v, 1)); + *ptr = _mm_extract_epi32(val.v, 1); + } + + m = _mm_extract_ps(mask.v, 2); + if (m != 0) { + int32_t *ptr = (int32_t *)(b + scale * + _mm_extract_epi32(offsets.v, 2)); + *ptr = _mm_extract_epi32(val.v, 2); + } + + m = _mm_extract_ps(mask.v, 3); + if (m != 0) { + int32_t *ptr = (int32_t *)(b + scale * + _mm_extract_epi32(offsets.v, 3)); + *ptr = _mm_extract_epi32(val.v, 3); + } +} + +static FORCEINLINE void +__scatter_base_offsets64_i32(unsigned char *p, __vec4_i64 offsets, + uint32_t scale, __vec4_i32 val, __vec4_i1 mask) { + uint32_t m = _mm_extract_ps(mask.v, 0); + if (m != 0) { + int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0); + uint32_t *ptr = (uint32_t *)(p + offset); + *ptr = _mm_extract_epi32(val.v, 0); + } + + m = _mm_extract_ps(mask.v, 1); + if (m != 0) { + int64_t 
offset = scale * _mm_extract_epi64(offsets.v[0], 1); + uint32_t *ptr = (uint32_t *)(p + offset); + *ptr = _mm_extract_epi32(val.v, 1); + } + + m = _mm_extract_ps(mask.v, 2); + if (m != 0) { + int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0); + uint32_t *ptr = (uint32_t *)(p + offset); + *ptr = _mm_extract_epi32(val.v, 2); + } + + m = _mm_extract_ps(mask.v, 3); + if (m != 0) { + int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1); + uint32_t *ptr = (uint32_t *)(p + offset); + *ptr = _mm_extract_epi32(val.v, 3); + } +} + +static FORCEINLINE void +__scatter_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets, + uint32_t scale, __vec4_i64 val, __vec4_i1 mask) { + uint32_t m = _mm_extract_ps(mask.v, 0); + if (m != 0) { + int32_t offset = scale * _mm_extract_epi32(offsets.v, 0); + uint64_t *ptr = (uint64_t *)(p + offset); + *ptr = _mm_extract_epi64(val.v[0], 0); + } + + m = _mm_extract_ps(mask.v, 1); + if (m != 0) { + int32_t offset = scale * _mm_extract_epi32(offsets.v, 1); + uint64_t *ptr = (uint64_t *)(p + offset); + *ptr = _mm_extract_epi64(val.v[0], 1); + } + + m = _mm_extract_ps(mask.v, 2); + if (m != 0) { + int32_t offset = scale * _mm_extract_epi32(offsets.v, 2); + uint64_t *ptr = (uint64_t *)(p + offset); + *ptr = _mm_extract_epi64(val.v[1], 0); + } + + m = _mm_extract_ps(mask.v, 3); + if (m != 0) { + int32_t offset = scale * _mm_extract_epi32(offsets.v, 3); + uint64_t *ptr = (uint64_t *)(p + offset); + *ptr = _mm_extract_epi64(val.v[1], 1); + } +} + +static FORCEINLINE void +__scatter_base_offsets64_i64(unsigned char *p, __vec4_i64 offsets, + uint32_t scale, __vec4_i64 val, __vec4_i1 mask) { + uint32_t m = _mm_extract_ps(mask.v, 0); + if (m != 0) { + int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0); + uint64_t *ptr = (uint64_t *)(p + offset); + *ptr = _mm_extract_epi64(val.v[0], 0); + } + + m = _mm_extract_ps(mask.v, 1); + if (m != 0) { + int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1); + uint64_t *ptr = (uint64_t *)(p + offset); + *ptr = _mm_extract_epi64(val.v[0], 1); + } + + m = _mm_extract_ps(mask.v, 2); + if (m != 0) { + int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0); + uint64_t *ptr = (uint64_t *)(p + offset); + *ptr = _mm_extract_epi64(val.v[1], 0); + } + + m = _mm_extract_ps(mask.v, 3); + if (m != 0) { + int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1); + uint64_t *ptr = (uint64_t *)(p + offset); + *ptr = _mm_extract_epi64(val.v[1], 1); + } +} + +static FORCEINLINE void __scatter32_i8(__vec4_i32 ptrs, __vec4_i8 val, + __vec4_i1 mask) { + uint32_t m = _mm_extract_ps(mask.v, 0); + if (m != 0) { + uint8_t *ptr = (uint8_t *)_mm_extract_epi32(ptrs.v, 0); + *ptr = _mm_extract_epi8(val.v, 0); + } + + m = _mm_extract_ps(mask.v, 1); + if (m != 0) { + uint8_t *ptr = (uint8_t *)_mm_extract_epi32(ptrs.v, 1); + *ptr = _mm_extract_epi8(val.v, 1); + } + + m = _mm_extract_ps(mask.v, 2); + if (m != 0) { + uint8_t *ptr = (uint8_t *)_mm_extract_epi32(ptrs.v, 2); + *ptr = _mm_extract_epi8(val.v, 2); + } + + m = _mm_extract_ps(mask.v, 3); + if (m != 0) { + uint8_t *ptr = (uint8_t *)_mm_extract_epi32(ptrs.v, 3); + *ptr = _mm_extract_epi8(val.v, 3); + } +} + +static FORCEINLINE void __scatter64_i8(__vec4_i64 ptrs, __vec4_i8 val, + __vec4_i1 mask) { + uint32_t m = _mm_extract_ps(mask.v, 0); + if (m != 0) { + uint8_t *ptr = (uint8_t *)_mm_extract_epi64(ptrs.v[0], 0); + *ptr = _mm_extract_epi8(val.v, 0); + } + + m = _mm_extract_ps(mask.v, 1); + if (m != 0) { + uint8_t *ptr = (uint8_t *)_mm_extract_epi64(ptrs.v[0], 1); + *ptr = 
_mm_extract_epi8(val.v, 1); + } + + m = _mm_extract_ps(mask.v, 2); + if (m != 0) { + uint8_t *ptr = (uint8_t *)_mm_extract_epi64(ptrs.v[1], 0); + *ptr = _mm_extract_epi8(val.v, 2); + } + + m = _mm_extract_ps(mask.v, 3); + if (m != 0) { + uint8_t *ptr = (uint8_t *)_mm_extract_epi64(ptrs.v[1], 1); + *ptr = _mm_extract_epi8(val.v, 3); + } +} + +static FORCEINLINE void __scatter32_i16(__vec4_i32 ptrs, __vec4_i16 val, + __vec4_i1 mask) { + uint32_t m = _mm_extract_ps(mask.v, 0); + if (m != 0) { + uint16_t *ptr = (uint16_t *)_mm_extract_epi32(ptrs.v, 0); + *ptr = _mm_extract_epi16(val.v, 0); + } + + m = _mm_extract_ps(mask.v, 1); + if (m != 0) { + uint16_t *ptr = (uint16_t *)_mm_extract_epi32(ptrs.v, 1); + *ptr = _mm_extract_epi16(val.v, 1); + } + + m = _mm_extract_ps(mask.v, 2); + if (m != 0) { + uint16_t *ptr = (uint16_t *)_mm_extract_epi32(ptrs.v, 2); + *ptr = _mm_extract_epi16(val.v, 2); + } + + m = _mm_extract_ps(mask.v, 3); + if (m != 0) { + uint16_t *ptr = (uint16_t *)_mm_extract_epi32(ptrs.v, 3); + *ptr = _mm_extract_epi16(val.v, 3); + } +} + +static FORCEINLINE void __scatter64_i16(__vec4_i64 ptrs, __vec4_i16 val, + __vec4_i1 mask) { + uint32_t m = _mm_extract_ps(mask.v, 0); + if (m != 0) { + uint16_t *ptr = (uint16_t *)_mm_extract_epi64(ptrs.v[0], 0); + *ptr = _mm_extract_epi16(val.v, 0); + } + + m = _mm_extract_ps(mask.v, 1); + if (m != 0) { + uint16_t *ptr = (uint16_t *)_mm_extract_epi64(ptrs.v[0], 1); + *ptr = _mm_extract_epi16(val.v, 1); + } + + m = _mm_extract_ps(mask.v, 2); + if (m != 0) { + uint16_t *ptr = (uint16_t *)_mm_extract_epi64(ptrs.v[1], 0); + *ptr = _mm_extract_epi16(val.v, 2); + } + + m = _mm_extract_ps(mask.v, 3); + if (m != 0) { + uint16_t *ptr = (uint16_t *)_mm_extract_epi64(ptrs.v[1], 1); + *ptr = _mm_extract_epi16(val.v, 3); + } +} + +static FORCEINLINE void __scatter32_i32(__vec4_i32 ptrs, __vec4_i32 val, + __vec4_i1 mask) { + uint32_t m = _mm_extract_ps(mask.v, 0); + if (m != 0) { + uint32_t *ptr = (uint32_t *)_mm_extract_epi32(ptrs.v, 0); + *ptr = _mm_extract_epi32(val.v, 0); + } + + m = _mm_extract_ps(mask.v, 1); + if (m != 0) { + uint32_t *ptr = (uint32_t *)_mm_extract_epi32(ptrs.v, 1); + *ptr = _mm_extract_epi32(val.v, 1); + } + + m = _mm_extract_ps(mask.v, 2); + if (m != 0) { + uint32_t *ptr = (uint32_t *)_mm_extract_epi32(ptrs.v, 2); + *ptr = _mm_extract_epi32(val.v, 2); + } + + m = _mm_extract_ps(mask.v, 3); + if (m != 0) { + uint32_t *ptr = (uint32_t *)_mm_extract_epi32(ptrs.v, 3); + *ptr = _mm_extract_epi32(val.v, 3); + } +} + +static FORCEINLINE void __scatter64_i32(__vec4_i64 ptrs, __vec4_i32 val, + __vec4_i1 mask) { + uint32_t m = _mm_extract_ps(mask.v, 0); + if (m != 0) { + uint32_t *ptr = (uint32_t *)_mm_extract_epi64(ptrs.v[0], 0); + *ptr = _mm_extract_epi32(val.v, 0); + } + + m = _mm_extract_ps(mask.v, 1); + if (m != 0) { + uint32_t *ptr = (uint32_t *)_mm_extract_epi64(ptrs.v[0], 1); + *ptr = _mm_extract_epi32(val.v, 1); + } + + m = _mm_extract_ps(mask.v, 2); + if (m != 0) { + uint32_t *ptr = (uint32_t *)_mm_extract_epi64(ptrs.v[1], 0); + *ptr = _mm_extract_epi32(val.v, 2); + } + + m = _mm_extract_ps(mask.v, 3); + if (m != 0) { + uint32_t *ptr = (uint32_t *)_mm_extract_epi64(ptrs.v[1], 1); + *ptr = _mm_extract_epi32(val.v, 3); + } +} + +static FORCEINLINE void __scatter32_i64(__vec4_i32 ptrs, __vec4_i64 val, + __vec4_i1 mask) { + uint32_t m = _mm_extract_ps(mask.v, 0); + if (m != 0) { + uint64_t *ptr = (uint64_t *)_mm_extract_epi32(ptrs.v, 0); + *ptr = _mm_extract_epi64(val.v[0], 0); + } + + m = _mm_extract_ps(mask.v, 1); + if (m != 0) { + 
uint64_t *ptr = (uint64_t *)_mm_extract_epi32(ptrs.v, 1); + *ptr = _mm_extract_epi64(val.v[0], 1); + } + + m = _mm_extract_ps(mask.v, 2); + if (m != 0) { + uint64_t *ptr = (uint64_t *)_mm_extract_epi32(ptrs.v, 2); + *ptr = _mm_extract_epi64(val.v[1], 0); + } + + m = _mm_extract_ps(mask.v, 3); + if (m != 0) { + uint64_t *ptr = (uint64_t *)_mm_extract_epi32(ptrs.v, 3); + *ptr = _mm_extract_epi64(val.v[1], 1); + } +} + +static FORCEINLINE void __scatter64_i64(__vec4_i64 ptrs, __vec4_i64 val, + __vec4_i1 mask) { + uint32_t m = _mm_extract_ps(mask.v, 0); + if (m != 0) { + uint64_t *ptr = (uint64_t *)_mm_extract_epi64(ptrs.v[0], 0); + *ptr = _mm_extract_epi64(val.v[0], 0); + } + + m = _mm_extract_ps(mask.v, 1); + if (m != 0) { + uint64_t *ptr = (uint64_t *)_mm_extract_epi64(ptrs.v[0], 1); + *ptr = _mm_extract_epi64(val.v[0], 1); + } + + m = _mm_extract_ps(mask.v, 2); + if (m != 0) { + uint64_t *ptr = (uint64_t *)_mm_extract_epi64(ptrs.v[1], 0); + *ptr = _mm_extract_epi64(val.v[1], 0); + } + + m = _mm_extract_ps(mask.v, 3); + if (m != 0) { + uint64_t *ptr = (uint64_t *)_mm_extract_epi64(ptrs.v[1], 1); + *ptr = _mm_extract_epi64(val.v[1], 1); + } +} + +/////////////////////////////////////////////////////////////////////////// +// packed load/store + +static FORCEINLINE int32_t __packed_load_active(int32_t *ptr, __vec4_i32 *val, + __vec4_i1 mask) { + int count = 0; + uint32_t m = _mm_extract_ps(mask.v, 0); + if (m != 0) + val->v = _mm_insert_epi32(val->v, ptr[count++], 0); + + m = _mm_extract_ps(mask.v, 1); + if (m != 0) + val->v = _mm_insert_epi32(val->v, ptr[count++], 1); + + m = _mm_extract_ps(mask.v, 2); + if (m != 0) + val->v = _mm_insert_epi32(val->v, ptr[count++], 2); + + m = _mm_extract_ps(mask.v, 3); + if (m != 0) + val->v = _mm_insert_epi32(val->v, ptr[count++], 3); + + return count; +} + +static FORCEINLINE int32_t __packed_store_active(int32_t *ptr, __vec4_i32 val, + __vec4_i1 mask) { + int count = 0; + uint32_t m = _mm_extract_ps(mask.v, 0); + if (m != 0) + ptr[count++] = _mm_extract_epi32(val.v, 0); + + m = _mm_extract_ps(mask.v, 1); + if (m != 0) + ptr[count++] = _mm_extract_epi32(val.v, 1); + + m = _mm_extract_ps(mask.v, 2); + if (m != 0) + ptr[count++] = _mm_extract_epi32(val.v, 2); + + m = _mm_extract_ps(mask.v, 3); + if (m != 0) + ptr[count++] = _mm_extract_epi32(val.v, 3); + + return count; +} + +static FORCEINLINE int32_t __packed_load_active(uint32_t *ptr, __vec4_i32 *val, + __vec4_i1 mask) { + return __packed_load_active((int32_t *)ptr, val, mask); +} + +static FORCEINLINE int32_t __packed_store_active(uint32_t *ptr, __vec4_i32 val, + __vec4_i1 mask) { + return __packed_store_active((int32_t *)ptr, val, mask); +} + + +/////////////////////////////////////////////////////////////////////////// +// aos/soa + +// FIXME: these all are correct but could be much more efficient with +// actual use of SSE shuffles and the like + +static FORCEINLINE void __soa_to_aos3_float(__vec4_f v0, __vec4_f v1, __vec4_f v2, + float *ptr) { + for (int i = 0; i < 4; ++i) { + *ptr++ = __extract_element(v0, i); + *ptr++ = __extract_element(v1, i); + *ptr++ = __extract_element(v2, i); + } +} + +static FORCEINLINE void __aos_to_soa3_float(float *ptr, __vec4_f *out0, + __vec4_f *out1, __vec4_f *out2) { + for (int i = 0; i < 4; ++i) { + __insert_element(out0, i, *ptr++); + __insert_element(out1, i, *ptr++); + __insert_element(out2, i, *ptr++); + } +} + +static FORCEINLINE void __soa_to_aos4_float(__vec4_f v0, __vec4_f v1, __vec4_f v2, + __vec4_f v3, float *ptr) { + for (int i = 0; i < 4; ++i) { + *ptr++ 
= __extract_element(v0, i); + *ptr++ = __extract_element(v1, i); + *ptr++ = __extract_element(v2, i); + *ptr++ = __extract_element(v3, i); + } +} + +static FORCEINLINE void __aos_to_soa4_float(float *ptr, __vec4_f *out0, __vec4_f *out1, + __vec4_f *out2, __vec4_f *out3) { + for (int i = 0; i < 4; ++i) { + __insert_element(out0, i, *ptr++); + __insert_element(out1, i, *ptr++); + __insert_element(out2, i, *ptr++); + __insert_element(out3, i, *ptr++); + } +} + +/////////////////////////////////////////////////////////////////////////// +// prefetch + +static FORCEINLINE void __prefetch_read_uniform_1(unsigned char *ptr) { + _mm_prefetch((char *)ptr, _MM_HINT_T0); +} + +static FORCEINLINE void __prefetch_read_uniform_2(unsigned char *ptr) { + _mm_prefetch((char *)ptr, _MM_HINT_T1); +} + +static FORCEINLINE void __prefetch_read_uniform_3(unsigned char *ptr) { + _mm_prefetch((char *)ptr, _MM_HINT_T2); +} + +static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *ptr) { + _mm_prefetch((char *)ptr, _MM_HINT_NTA); +} + +/////////////////////////////////////////////////////////////////////////// +// atomics + +static FORCEINLINE uint32_t __atomic_add(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedAdd((LONG volatile *)p, v) - v; +#else + return __sync_fetch_and_add(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_sub(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedAdd((LONG volatile *)p, -v) + v; +#else + return __sync_fetch_and_sub(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_and(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedAnd((LONG volatile *)p, v); +#else + return __sync_fetch_and_and(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_or(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedOr((LONG volatile *)p, v); +#else + return __sync_fetch_and_or(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_xor(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedXor((LONG volatile *)p, v); +#else + return __sync_fetch_and_xor(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_min(uint32_t *p, uint32_t v) { + int32_t old, min; + do { + old = *((volatile int32_t *)p); + min = (old < (int32_t)v) ? old : (int32_t)v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange((LONG volatile *)p, min, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, min) == false); +#endif + return old; +} + +static FORCEINLINE uint32_t __atomic_max(uint32_t *p, uint32_t v) { + int32_t old, max; + do { + old = *((volatile int32_t *)p); + max = (old > (int32_t)v) ? old : (int32_t)v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange((LONG volatile *)p, max, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, max) == false); +#endif + return old; +} + +static FORCEINLINE uint32_t __atomic_umin(uint32_t *p, uint32_t v) { + uint32_t old, min; + do { + old = *((volatile uint32_t *)p); + min = (old < v) ? old : v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange((LONG volatile *)p, min, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, min) == false); +#endif + return old; +} + +static FORCEINLINE uint32_t __atomic_umax(uint32_t *p, uint32_t v) { + uint32_t old, max; + do { + old = *((volatile uint32_t *)p); + max = (old > v) ? 
old : v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange((LONG volatile *)p, max, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, max) == false); +#endif + return old; +} + +static FORCEINLINE uint32_t __atomic_xchg(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedExchange((LONG volatile *)p, v); +#else + return __sync_lock_test_and_set(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_cmpxchg(uint32_t *p, uint32_t cmpval, + uint32_t newval) { +#ifdef _MSC_VER + return InterlockedCompareExchange((LONG volatile *)p, newval, cmpval); +#else + return __sync_val_compare_and_swap(p, cmpval, newval); +#endif +} + +static FORCEINLINE uint64_t __atomic_add(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedAdd64((LONGLONG volatile *)p, v) - v; +#else + return __sync_fetch_and_add(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_sub(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedAdd64((LONGLONG volatile *)p, -v) + v; +#else + return __sync_fetch_and_sub(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_and(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedAnd64((LONGLONG volatile *)p, v); +#else + return __sync_fetch_and_and(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_or(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedOr64((LONGLONG volatile *)p, v); +#else + return __sync_fetch_and_or(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_xor(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedXor64((LONGLONG volatile *)p, v); +#else + return __sync_fetch_and_xor(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_min(uint64_t *p, uint64_t v) { + int64_t old, min; + do { + old = *((volatile int64_t *)p); + min = (old < (int64_t)v) ? old : (int64_t)v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange64((LONGLONG volatile *)p, min, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, min) == false); +#endif + return old; +} + +static FORCEINLINE uint64_t __atomic_max(uint64_t *p, uint64_t v) { + int64_t old, max; + do { + old = *((volatile int64_t *)p); + max = (old > (int64_t)v) ? old : (int64_t)v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange64((LONGLONG volatile *)p, max, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, max) == false); +#endif + return old; +} + +static FORCEINLINE uint64_t __atomic_umin(uint64_t *p, uint64_t v) { + uint64_t old, min; + do { + old = *((volatile uint64_t *)p); + min = (old < v) ? old : v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange64((LONGLONG volatile *)p, min, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, min) == false); +#endif + return old; +} + +static FORCEINLINE uint64_t __atomic_umax(uint64_t *p, uint64_t v) { + uint64_t old, max; + do { + old = *((volatile uint64_t *)p); + max = (old > v) ? 
old : v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange64((LONGLONG volatile *)p, max, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, max) == false); +#endif + return old; +} + +static FORCEINLINE uint64_t __atomic_xchg(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedExchange64((LONGLONG volatile *)p, v); +#else + return __sync_lock_test_and_set(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, + uint64_t newval) { +#ifdef _MSC_VER + return InterlockedCompareExchange64((LONGLONG volatile *)p, newval, cmpval); +#else + return __sync_val_compare_and_swap(p, cmpval, newval); +#endif +} diff --git a/ispc.vcxproj b/ispc.vcxproj index 96a6855dca2..fc65004bd5d 100755 --- a/ispc.vcxproj +++ b/ispc.vcxproj @@ -13,6 +13,7 @@ + diff --git a/main.cpp b/main.cpp index 0c483c35570..357e7532e4b 100644 --- a/main.cpp +++ b/main.cpp @@ -66,11 +66,15 @@ static void usage(int ret) { printf(" \t\ton 64-bit target architectures.)\n"); printf(" [--arch={%s}]\t\tSelect target architecture\n", Target::SupportedTargetArchs()); + printf(" [--c++-include-file=]\t\tSpecify name of file to emit in #include statement in generated C++ code.\n"); printf(" [--cpu=]\t\t\tSelect target CPU type\n"); printf(" ={%s}\n", Target::SupportedTargetCPUs()); printf(" [-D]\t\t\t\t#define given value when running preprocessor\n"); printf(" [--debug]\t\t\t\tPrint information useful for debugging ispc\n"); printf(" [--emit-asm]\t\t\tGenerate assembly language file as output\n"); +#ifndef LLVM_2_9 + printf(" [--emit-c++]\t\t\tEmit a C++ source file as output\n"); +#endif // !LLVM_2_9 printf(" [--emit-llvm]\t\t\tEmit LLVM bitode file as output\n"); printf(" [--emit-obj]\t\t\tGenerate object file file as output (default)\n"); printf(" [-g]\t\t\t\tGenerate debugging information\n"); @@ -187,6 +191,7 @@ int main(int Argc, char *Argv[]) { char *file = NULL; const char *headerFileName = NULL; const char *outFileName = NULL; + const char *includeFileName = NULL; // Initiailize globals early so that we can set various option values // as we're parsing below @@ -236,13 +241,20 @@ int main(int Argc, char *Argv[]) { } else if (!strcmp(argv[i], "--emit-asm")) ot = Module::Asm; +#ifndef LLVM_2_9 + else if (!strcmp(argv[i], "--emit-c++")) + ot = Module::CXX; +#endif // !LLVM_2_9 else if (!strcmp(argv[i], "--emit-llvm")) ot = Module::Bitcode; else if (!strcmp(argv[i], "--emit-obj")) ot = Module::Object; else if (!strcmp(argv[i], "--target")) { // FIXME: should remove this way of specifying the target... 
- if (++i == argc) usage(1); + if (++i == argc) { + fprintf(stderr, "No target specified after --target option.\n"); + usage(1); + } target = argv[i]; } else if (!strncmp(argv[i], "--target=", 9)) @@ -257,8 +269,10 @@ int main(int Argc, char *Argv[]) { g->mathLib = Globals::Math_SVML; else if (!strcmp(lib, "system")) g->mathLib = Globals::Math_System; - else + else { + fprintf(stderr, "Unknown --math-lib= option \"%s\".\n", lib); usage(1); + } } else if (!strncmp(argv[i], "--opt=", 6)) { const char *opt = argv[i] + 6; @@ -291,8 +305,10 @@ int main(int Argc, char *Argv[]) { g->opt.disableGatherScatterFlattening = true; else if (!strcmp(opt, "disable-uniform-memory-optimizations")) g->opt.disableUniformMemoryOptimizations = true; - else + else { + fprintf(stderr, "Unknown --opt= option \"%s\".\n", opt); usage(1); + } } else if (!strcmp(argv[i], "--woff") || !strcmp(argv[i], "-woff")) { g->disableWarnings = true; @@ -305,18 +321,27 @@ int main(int Argc, char *Argv[]) { else if (!strcmp(argv[i], "--wno-perf") || !strcmp(argv[i], "-wno-perf")) g->emitPerfWarnings = false; else if (!strcmp(argv[i], "-o")) { - if (++i == argc) usage(1); + if (++i == argc) { + fprintf(stderr, "No output file specified after -o option.\n"); + usage(1); + } outFileName = argv[i]; } else if (!strcmp(argv[i], "--outfile=")) outFileName = argv[i] + strlen("--outfile="); else if (!strcmp(argv[i], "-h")) { - if (++i == argc) usage(1); + if (++i == argc) { + fprintf(stderr, "No header file name specified after -h option.\n"); + usage(1); + } headerFileName = argv[i]; } - else if (!strcmp(argv[i], "--header-outfile=")) { + else if (!strncmp(argv[i], "--header-outfile=", 17)) { headerFileName = argv[i] + strlen("--header-outfile="); } + else if (!strncmp(argv[i], "--c++-include-file=", 19)) { + includeFileName = argv[i] + strlen("--c++-include-file="); + } else if (!strcmp(argv[i], "-O0")) { g->opt.level = 0; optSet = true; @@ -341,11 +366,16 @@ int main(int Argc, char *Argv[]) { BUILD_DATE, BUILD_VERSION); return 0; } - else if (argv[i][0] == '-') + else if (argv[i][0] == '-') { + fprintf(stderr, "Unknown option \"%s\".\n", argv[i]); usage(1); + } else { - if (file != NULL) + if (file != NULL) { + fprintf(stderr, "Multiple input files specified on command " + "line: \"%s\" and \"%s\".\n", file, argv[i]); usage(1); + } else file = argv[i]; } @@ -363,5 +393,6 @@ int main(int Argc, char *Argv[]) { "be issued, but no output will be generated."); return Module::CompileAndOutput(file, arch, cpu, target, generatePIC, - ot, outFileName, headerFileName); + ot, outFileName, headerFileName, + includeFileName); } diff --git a/module.cpp b/module.cpp index 5dc9b16045c..df09955aee2 100644 --- a/module.cpp +++ b/module.cpp @@ -76,7 +76,6 @@ #include #include #include -#include #include #include #include @@ -584,7 +583,8 @@ Module::AddFunctionDefinition(Symbol *sym, const std::vector &args, bool -Module::writeOutput(OutputType outputType, const char *outFileName) { +Module::writeOutput(OutputType outputType, const char *outFileName, + const char *includeFileName) { #if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn) if (diBuilder != NULL && outputType != Header) diBuilder->finalize(); @@ -610,6 +610,14 @@ Module::writeOutput(OutputType outputType, const char *outFileName) { if (strcasecmp(suffix, "o") && strcasecmp(suffix, "obj")) fileType = "object"; break; +#ifndef LLVM_2_9 + case CXX: + if (strcasecmp(suffix, "c") && strcasecmp(suffix, "cc") && + strcasecmp(suffix, "c++") && strcasecmp(suffix, "cxx") && + 
strcasecmp(suffix, "cpp")) + fileType = "c++"; + break; +#endif // !LLVM_2_9 case Header: if (strcasecmp(suffix, "h") && strcasecmp(suffix, "hh") && strcasecmp(suffix, "hpp")) @@ -623,12 +631,18 @@ Module::writeOutput(OutputType outputType, const char *outFileName) { if (outputType == Header) return writeHeader(outFileName); - else { - if (outputType == Bitcode) - return writeBitcode(module, outFileName); - else - return writeObjectFileOrAssembly(outputType, outFileName); + else if (outputType == Bitcode) + return writeBitcode(module, outFileName); +#ifndef LLVM_2_9 + else if (outputType == CXX) { + extern bool WriteCXXFile(llvm::Module *module, const char *fn, + int vectorWidth, const char *includeName); + return WriteCXXFile(module, outFileName, g->target.vectorWidth, + includeFileName); } +#endif // !LLVM_2_9 + else + return writeObjectFileOrAssembly(outputType, outFileName); } @@ -1568,7 +1582,8 @@ lCreateDispatchModule(std::map &functions) int Module::CompileAndOutput(const char *srcFile, const char *arch, const char *cpu, const char *target, bool generatePIC, OutputType outputType, - const char *outFileName, const char *headerFileName) { + const char *outFileName, const char *headerFileName, + const char *includeFileName) { if (target == NULL || strchr(target, ',') == NULL) { // We're only compiling to a single target if (!Target::GetTarget(arch, cpu, target, generatePIC, &g->target)) return 1; @@ -1577,7 +1592,7 @@ Module::CompileAndOutput(const char *srcFile, const char *arch, const char *cpu, m = new Module(srcFile); if (m->CompileFile() == 0) { if (outFileName != NULL) - if (!m->writeOutput(outputType, outFileName)) + if (!m->writeOutput(outputType, outFileName, includeFileName)) return 1; if (headerFileName != NULL) if (!m->writeOutput(Module::Header, headerFileName)) @@ -1590,6 +1605,14 @@ Module::CompileAndOutput(const char *srcFile, const char *arch, const char *cpu, return errorCount > 0; } else { +#ifndef LLVM_2_9 + if (outputType == CXX) { + Error(SourcePos(), "Illegal to specify more than one target when " + "compiling C++ output."); + return 1; + } +#endif // !LLVM_2_9 + // The user supplied multiple targets std::vector targets = lExtractTargets(target); Assert(targets.size() > 1); diff --git a/module.h b/module.h index f5fe75a9a23..9032548f68a 100644 --- a/module.h +++ b/module.h @@ -80,6 +80,9 @@ class Module { enum OutputType { Asm, /** Generate text assembly language output */ Bitcode, /** Generate LLVM IR bitcode output */ Object, /** Generate a native object file */ +#ifndef LLVM_2_9 + CXX, /** Generate a C++ file */ +#endif // !LLVM_2_9 Header /** Generate a C/C++ header file with declarations of 'export'ed functions, global variables, and the types used by them. */ @@ -108,6 +111,10 @@ class Module { inclusion from C/C++ code with declarations of types and functions exported from the given ispc source file. + @param includeFileName If non-NULL, gives the filename for the C++ + backend to emit in an #include statement to + get definitions of the builtins for the generic + target. @return Number of errors encountered when compiling srcFile. */ @@ -115,7 +122,8 @@ class Module { const char *cpu, const char *targets, bool generatePIC, OutputType outputType, const char *outFileName, - const char *headerFileName); + const char *headerFileName, + const char *includeFileName); /** Total number of errors encountered during compilation. */ int errorCount; @@ -138,7 +146,8 @@ class Module { true on success, false if there has been an error. 
The given filename may be NULL, indicating that output should go to standard output. */ - bool writeOutput(OutputType ot, const char *filename); + bool writeOutput(OutputType ot, const char *filename, + const char *includeFileName = NULL); bool writeHeader(const char *filename); bool writeObjectFileOrAssembly(OutputType outputType, const char *filename); static bool writeObjectFileOrAssembly(llvm::TargetMachine *targetMachine, diff --git a/opt.cpp b/opt.cpp index a78c3f210c6..0685509fdb9 100644 --- a/opt.cpp +++ b/opt.cpp @@ -184,10 +184,12 @@ Optimize(llvm::Module *module, int optLevel) { llvm::PassManager optPM; llvm::FunctionPassManager funcPM(module); - llvm::TargetLibraryInfo *targetLibraryInfo = - new llvm::TargetLibraryInfo(llvm::Triple(module->getTargetTriple())); - optPM.add(targetLibraryInfo); - optPM.add(new llvm::TargetData(module)); + if (g->target.isa != Target::GENERIC) { + llvm::TargetLibraryInfo *targetLibraryInfo = + new llvm::TargetLibraryInfo(llvm::Triple(module->getTargetTriple())); + optPM.add(targetLibraryInfo); + optPM.add(new llvm::TargetData(module)); + } #if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn) optPM.add(llvm::createIndVarSimplifyPass());