Sign Up
Log In
Log In
or
Sign Up
Places
All Projects
Status Monitor
Collapse sidebar
home:Andreas_Schwab:riscv:ghc:9.8
ghc
riscv-ncg.patch
Overview
Repositories
Revisions
Requests
Users
Attributes
Meta
File riscv-ncg.patch of Package ghc
Index: ghc-9.8.2/compiler/CodeGen.Platform.h =================================================================== --- ghc-9.8.2.orig/compiler/CodeGen.Platform.h +++ ghc-9.8.2/compiler/CodeGen.Platform.h @@ -1,7 +1,8 @@ import GHC.Cmm.Expr #if !(defined(MACHREGS_i386) || defined(MACHREGS_x86_64) \ - || defined(MACHREGS_powerpc) || defined(MACHREGS_aarch64)) + || defined(MACHREGS_powerpc) || defined(MACHREGS_aarch64) \ + || defined(MACHREGS_riscv64)) import GHC.Utils.Panic.Plain #endif import GHC.Platform.Reg @@ -1040,6 +1041,105 @@ freeReg 18 = False # if defined(REG_Base) freeReg REG_Base = False +# endif +# if defined(REG_Sp) +freeReg REG_Sp = False +# endif +# if defined(REG_SpLim) +freeReg REG_SpLim = False +# endif +# if defined(REG_Hp) +freeReg REG_Hp = False +# endif +# if defined(REG_HpLim) +freeReg REG_HpLim = False +# endif + +# if defined(REG_R1) +freeReg REG_R1 = False +# endif +# if defined(REG_R2) +freeReg REG_R2 = False +# endif +# if defined(REG_R3) +freeReg REG_R3 = False +# endif +# if defined(REG_R4) +freeReg REG_R4 = False +# endif +# if defined(REG_R5) +freeReg REG_R5 = False +# endif +# if defined(REG_R6) +freeReg REG_R6 = False +# endif +# if defined(REG_R7) +freeReg REG_R7 = False +# endif +# if defined(REG_R8) +freeReg REG_R8 = False +# endif + +# if defined(REG_F1) +freeReg REG_F1 = False +# endif +# if defined(REG_F2) +freeReg REG_F2 = False +# endif +# if defined(REG_F3) +freeReg REG_F3 = False +# endif +# if defined(REG_F4) +freeReg REG_F4 = False +# endif +# if defined(REG_F5) +freeReg REG_F5 = False +# endif +# if defined(REG_F6) +freeReg REG_F6 = False +# endif + +# if defined(REG_D1) +freeReg REG_D1 = False +# endif +# if defined(REG_D2) +freeReg REG_D2 = False +# endif +# if defined(REG_D3) +freeReg REG_D3 = False +# endif +# if defined(REG_D4) +freeReg REG_D4 = False +# endif +# if defined(REG_D5) +freeReg REG_D5 = False +# endif +# if defined(REG_D6) +freeReg REG_D6 = False +# endif + +freeReg _ = True + +#elif 
defined(MACHREGS_riscv64) + +-- zero reg +freeReg 0 = False +-- link register +freeReg 1 = False +-- stack pointer +freeReg 2 = False +-- global pointer +freeReg 3 = False +-- thread pointer +freeReg 4 = False +-- frame pointer +freeReg 8 = False +-- made-up inter-procedural (ip) register +-- See Note [The made-up RISCV64 IP register] +freeReg 31 = False + +# if defined(REG_Base) +freeReg REG_Base = False # endif # if defined(REG_Sp) freeReg REG_Sp = False Index: ghc-9.8.2/compiler/GHC/Cmm/CLabel.hs =================================================================== --- ghc-9.8.2.orig/compiler/GHC/Cmm/CLabel.hs +++ ghc-9.8.2/compiler/GHC/Cmm/CLabel.hs @@ -1723,6 +1723,8 @@ pprDynamicLinkerAsmLabel !platform dllIn | platformArch platform == ArchAArch64 = ppLbl + | platformArch platform == ArchRISCV64 + = ppLbl | platformArch platform == ArchX86_64 = case dllInfo of Index: ghc-9.8.2/compiler/GHC/CmmToAsm.hs =================================================================== --- ghc-9.8.2.orig/compiler/GHC/CmmToAsm.hs +++ ghc-9.8.2/compiler/GHC/CmmToAsm.hs @@ -82,6 +82,7 @@ import qualified GHC.CmmToAsm.X86 as X import qualified GHC.CmmToAsm.PPC as PPC import qualified GHC.CmmToAsm.AArch64 as AArch64 import qualified GHC.CmmToAsm.Wasm as Wasm32 +import qualified GHC.CmmToAsm.RV64 as RV64 import GHC.CmmToAsm.Reg.Liveness import qualified GHC.CmmToAsm.Reg.Linear as Linear @@ -167,7 +168,7 @@ nativeCodeGen logger ts config modLoc h ArchAlpha -> panic "nativeCodeGen: No NCG for Alpha" ArchMipseb -> panic "nativeCodeGen: No NCG for mipseb" ArchMipsel -> panic "nativeCodeGen: No NCG for mipsel" - ArchRISCV64 -> panic "nativeCodeGen: No NCG for RISCV64" + ArchRISCV64 -> nCG' (RV64.ncgRV64 config) ArchLoongArch64->panic "nativeCodeGen: No NCG for LoongArch64" ArchUnknown -> panic "nativeCodeGen: No NCG for unknown arch" ArchJavaScript-> panic "nativeCodeGen: No NCG for JavaScript" Index: ghc-9.8.2/compiler/GHC/CmmToAsm/Dwarf/Constants.hs 
=================================================================== --- ghc-9.8.2.orig/compiler/GHC/CmmToAsm/Dwarf/Constants.hs +++ ghc-9.8.2/compiler/GHC/CmmToAsm/Dwarf/Constants.hs @@ -240,6 +240,7 @@ dwarfRegNo p r = case platformArch p of | r == xmm15 -> 32 ArchPPC_64 _ -> fromIntegral $ toRegNo r ArchAArch64 -> fromIntegral $ toRegNo r + ArchRISCV64 -> fromIntegral $ toRegNo r _other -> error "dwarfRegNo: Unsupported platform or unknown register!" -- | Virtual register number to use for return address. @@ -252,5 +253,6 @@ dwarfReturnRegNo p ArchX86 -> 8 -- eip ArchX86_64 -> 16 -- rip ArchPPC_64 ELF_V2 -> 65 -- lr (link register) - ArchAArch64-> 30 + ArchAArch64 -> 30 + ArchRISCV64 -> 1 -- ra (return address) _other -> error "dwarfReturnRegNo: Unsupported platform!" Index: ghc-9.8.2/compiler/GHC/CmmToAsm/PIC.hs =================================================================== --- ghc-9.8.2.orig/compiler/GHC/CmmToAsm/PIC.hs +++ ghc-9.8.2/compiler/GHC/CmmToAsm/PIC.hs @@ -132,6 +132,11 @@ cmmMakeDynamicReference config reference addImport symbolPtr return $ cmmMakePicReference config symbolPtr + AccessViaSymbolPtr | ArchRISCV64 <- platformArch platform -> do + let symbolPtr = mkDynamicLinkerLabel SymbolPtr lbl + addImport symbolPtr + return $ cmmMakePicReference config symbolPtr + AccessViaSymbolPtr -> do let symbolPtr = mkDynamicLinkerLabel SymbolPtr lbl addImport symbolPtr @@ -164,6 +169,10 @@ cmmMakePicReference config lbl | ArchAArch64 <- platformArch platform = CmmLit $ CmmLabel lbl + | ArchRISCV64 <- platformArch platform + = CmmLit $ CmmLabel lbl + + | OSAIX <- platformOS platform = CmmMachOp (MO_Add W32) [ CmmReg (CmmGlobal $ GlobalRegUse PicBaseReg (bWord platform)) Index: ghc-9.8.2/compiler/GHC/CmmToAsm/RV64-notes.md =================================================================== --- /dev/null +++ ghc-9.8.2/compiler/GHC/CmmToAsm/RV64-notes.md @@ -0,0 +1,35 @@ +# Riscv 64 NCG + +We model the RV64 NCG along the aarch64 NCG, simply because they share 
+a lot in common. Ultimately we might want to extract some RISC like super +structure from this to reduce duplicate code. + +This is the aarch64 register layout (for linux calling) +``` +.---------------------------------------------------------------------------------------------------------------------------------------------------------------. +| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | +| 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 42 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | +|== General Purpose registers ==================================================================================================================================| +| <---- argument passing -------------> | IR | <------- tmp registers --------> | IP0| IP1| PL | <------------------- callee saved ------------> | FP | LR | SP | +| <------ free registers --------------------------------------------------------------------> | BR | Sp | Hp | R1 | R2 | R3 | R4 | R5 | R6 | SL | -- | -- | -- | +|== SIMD/FP Registers ==========================================================================================================================================| +| <---- argument passing -------------> | <-- callee saved (lower 64 bits) ---> | <--------------------------------------- caller saved ----------------------> | +| <------ free registers -------------> | F1 | F2 | F3 | F4 | D1 | D2 | D3 | D4 | <------ free registers -----------------------------------------------------> | +'---------------------------------------------------------------------------------------------------------------------------------------------------------------' +IR: Indirect result location register, IP: Intra-procedure register, PL: Platform register, FP: Frame pointer, LR: Link register, SP: Stack pointer +BR: Base, SL: SpLim +``` +comparing this to 
RV64 gives us: +``` +.---------------------------------------------------------------------------------------------------------------------------------------------------------------. +| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | +| 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 42 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | +|== General Purpose registers ==================================================================================================================================| +| ZR | RA | SP | GP | TP | <- tmp r. -> | FP | <- | <---- argument passing -------------> | -- callee saved ------------------------------> | <--- tmp regs --> | +| -- | -- | -- | -- | -- | <- free r. > | -- | BR | <---- free registers ---------------> | SP | HP | R1 | R2 | R3 | R4 | R5 | R6 | R7 | SL | <-- free regs --> | +|== SIMD/FP Registers ==========================================================================================================================================| +| <--- temporary registers -----------> | <------ | <---- argument passing -------------> | -- callee saved ------------------------------> | <--- tmp regs --> | +| <---------- free registers ---------> | F1 | F2 | <---- free registers ---------------> | F3 | F4 | F5 | F6 | D1 | D2 | D3 | D4 | D5 | D6 | -- | -- | -- | -- | +'---------------------------------------------------------------------------------------------------------------------------------------------------------------' + +ZR: Zero, RA: Return Address, SP: Stack Pointer, GP: Global Pointer, TP: Thread Pointer, FP: Frame Pointer \ No newline at end of file Index: ghc-9.8.2/compiler/GHC/CmmToAsm/RV64.hs =================================================================== --- /dev/null +++ ghc-9.8.2/compiler/GHC/CmmToAsm/RV64.hs @@ -0,0 +1,61 @@ +{-# OPTIONS_GHC 
-fno-warn-orphans #-} + +-- | Native code generator for RiscV64 architectures +module GHC.CmmToAsm.RV64 + ( ncgRV64 ) +where + +import GHC.Prelude + +import GHC.CmmToAsm.Instr +import GHC.CmmToAsm.Monad +import GHC.CmmToAsm.Config +import GHC.CmmToAsm.Types +import GHC.Utils.Outputable (ftext) + +import qualified GHC.CmmToAsm.RV64.Instr as RV64 +import qualified GHC.CmmToAsm.RV64.Ppr as RV64 +import qualified GHC.CmmToAsm.RV64.CodeGen as RV64 +import qualified GHC.CmmToAsm.RV64.Regs as RV64 +import qualified GHC.CmmToAsm.RV64.RegInfo as RV64 + +ncgRV64 :: NCGConfig -> NcgImpl RawCmmStatics RV64.Instr RV64.JumpDest +ncgRV64 config + = NcgImpl { + ncgConfig = config + ,cmmTopCodeGen = RV64.cmmTopCodeGen + ,generateJumpTableForInstr = RV64.generateJumpTableForInstr config + ,getJumpDestBlockId = RV64.getJumpDestBlockId + ,canShortcut = RV64.canShortcut + ,shortcutStatics = RV64.shortcutStatics + ,shortcutJump = RV64.shortcutJump + ,pprNatCmmDeclS = RV64.pprNatCmmDecl config + ,pprNatCmmDeclH = RV64.pprNatCmmDecl config + ,maxSpillSlots = RV64.maxSpillSlots config + ,allocatableRegs = RV64.allocatableRegs platform + ,ncgAllocMoreStack = RV64.allocMoreStack platform + ,ncgMakeFarBranches = \_p _i bs -> pure bs + ,extractUnwindPoints = const [] + ,invertCondBranches = \_ _ -> id + } + where + platform = ncgPlatform config + +-- | Instruction instance for RV64 +instance Instruction RV64.Instr where + regUsageOfInstr = RV64.regUsageOfInstr + patchRegsOfInstr = RV64.patchRegsOfInstr + isJumpishInstr = RV64.isJumpishInstr + jumpDestsOfInstr = RV64.jumpDestsOfInstr + patchJumpInstr = RV64.patchJumpInstr + mkSpillInstr = RV64.mkSpillInstr + mkLoadInstr = RV64.mkLoadInstr + takeDeltaInstr = RV64.takeDeltaInstr + isMetaInstr = RV64.isMetaInstr + mkRegRegMoveInstr _ = RV64.mkRegRegMoveInstr + takeRegRegMoveInstr = RV64.takeRegRegMoveInstr + mkJumpInstr = RV64.mkJumpInstr + mkStackAllocInstr = RV64.mkStackAllocInstr + mkStackDeallocInstr = RV64.mkStackDeallocInstr + mkComment = 
pure . RV64.COMMENT . ftext + pprInstr = RV64.pprInstr Index: ghc-9.8.2/compiler/GHC/CmmToAsm/RV64/CodeGen.hs =================================================================== --- /dev/null +++ ghc-9.8.2/compiler/GHC/CmmToAsm/RV64/CodeGen.hs @@ -0,0 +1,1780 @@ +{-# language GADTs #-} +{-# LANGUAGE TupleSections #-} +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE BinaryLiterals #-} +{-# LANGUAGE OverloadedStrings #-} +module GHC.CmmToAsm.RV64.CodeGen ( + cmmTopCodeGen + , generateJumpTableForInstr +) + +where + +import Control.Monad (mapAndUnzipM) +import Data.Maybe +import Data.Word +import GHC.Cmm +import GHC.Cmm.BlockId +import GHC.Cmm.CLabel +import GHC.Cmm.Dataflow.Block +import GHC.Cmm.Dataflow.Graph +import GHC.Cmm.DebugBlock +import GHC.Cmm.Switch +import GHC.Cmm.Utils +import GHC.CmmToAsm.CPrim +import GHC.CmmToAsm.Config +import GHC.CmmToAsm.Format +import GHC.CmmToAsm.Monad + ( NatM, + getBlockIdNat, + getConfig, + getDebugBlock, + getFileId, + getNewLabelNat, + getNewRegNat, + getPicBaseMaybeNat, + getPlatform, + ) +import GHC.CmmToAsm.PIC +import GHC.CmmToAsm.RV64.Cond +import GHC.CmmToAsm.RV64.Instr +import GHC.CmmToAsm.RV64.Regs +import GHC.CmmToAsm.Types +import GHC.Data.FastString +import GHC.Data.OrdList +import GHC.Float +import GHC.Platform +import GHC.Platform.Reg +import GHC.Platform.Regs +import GHC.Prelude hiding (EQ) +import GHC.Types.Basic +import GHC.Types.ForeignCall +import GHC.Types.SrcLoc (srcSpanFile, srcSpanStartCol, srcSpanStartLine) +import GHC.Types.Tickish (GenTickish (..)) +import GHC.Utils.Constants (debugIsOn) +import GHC.Utils.Misc +import GHC.Utils.Outputable +import GHC.Utils.Panic +import GHC.Utils.Panic.Plain (assert) + +-- For an overview of an NCG's structure, see Note [General layout of an NCG] + +cmmTopCodeGen + :: RawCmmDecl + -> NatM [NatCmmDecl RawCmmStatics Instr] + +-- Thus we'll have to deal with either CmmProc ... 
+cmmTopCodeGen _cmm@(CmmProc info lab live graph) = do + -- do + -- traceM $ "-- -------------------------- cmmTopGen (CmmProc) -------------------------- --\n" + -- ++ showSDocUnsafe (ppr cmm) + + let blocks = toBlockListEntryFirst graph + (nat_blocks,statics) <- mapAndUnzipM basicBlockCodeGen blocks + picBaseMb <- getPicBaseMaybeNat + + let proc = CmmProc info lab live (ListGraph $ concat nat_blocks) + tops = proc : concat statics + + case picBaseMb of + Just _picBase -> panic "RV64.cmmTopCodeGen: picBase not implemented" + Nothing -> return tops + +-- ... or CmmData. +cmmTopCodeGen _cmm@(CmmData sec dat) = do + -- do + -- traceM $ "-- -------------------------- cmmTopGen (CmmData) -------------------------- --\n" + -- ++ showSDocUnsafe (ppr cmm) + return [CmmData sec dat] -- no translation, we just use CmmStatic + +basicBlockCodeGen + :: Block CmmNode C C + -> NatM ( [NatBasicBlock Instr] + , [NatCmmDecl RawCmmStatics Instr]) + +basicBlockCodeGen block = do + config <- getConfig + -- do + -- traceM $ "-- --------------------------- basicBlockCodeGen --------------------------- --\n" + -- ++ showSDocUnsafe (ppr block) + let (_, nodes, tail) = blockSplit block + id = entryLabel block + stmts = blockToList nodes + + header_comment_instr | debugIsOn = unitOL $ MULTILINE_COMMENT ( + text "-- --------------------------- basicBlockCodeGen --------------------------- --\n" + $+$ withPprStyle defaultDumpStyle (pdoc (ncgPlatform config) block) + ) + | otherwise = nilOL + -- Generate location directive + dbg <- getDebugBlock (entryLabel block) + loc_instrs <- case dblSourceTick =<< dbg of + Just (SourceNote span (LexicalFastString name)) + -> do fileId <- getFileId (srcSpanFile span) + let line = srcSpanStartLine span; col = srcSpanStartCol span + return $ unitOL $ LOCATION fileId line col (unpackFS name) + _ -> return nilOL + (mid_instrs,mid_bid) <- stmtsToInstrs id stmts + (!tail_instrs,_) <- stmtToInstrs mid_bid tail + let instrs = header_comment_instr `appOL` 
loc_instrs `appOL` mid_instrs `appOL` tail_instrs + -- TODO: Then x86 backend run @verifyBasicBlock@ here and inserts + -- unwinding info. See Ticket 19913 + -- code generation may introduce new basic block boundaries, which + -- are indicated by the NEWBLOCK instruction. We must split up the + -- instruction stream into basic blocks again. Also, we extract + -- LDATAs here too. + let + (top,other_blocks,statics) = foldrOL mkBlocks ([],[],[]) instrs + + mkBlocks (NEWBLOCK id) (instrs,blocks,statics) + = ([], BasicBlock id instrs : blocks, statics) + mkBlocks (LDATA sec dat) (instrs,blocks,statics) + = (instrs, blocks, CmmData sec dat:statics) + mkBlocks instr (instrs,blocks,statics) + = (instr:instrs, blocks, statics) + return (BasicBlock id top : other_blocks, statics) + + +-- ----------------------------------------------------------------------------- +-- | Utilities +ann :: SDoc -> Instr -> Instr +ann doc instr {- debugIsOn -} = ANN doc instr +-- ann _ instr = instr +{-# INLINE ann #-} + +-- Using pprExpr will hide the AST, @ANN@ will end up in the assembly with +-- -dppr-debug. The idea is that we can trivially see how a cmm expression +-- ended up producing the assembly we see. By having the verbatim AST printed +-- we can simply check the patterns that were matched to arrive at the assembly +-- we generated. +-- +-- pprExpr will hide a lot of noise of the underlying data structure and print +-- the expression into something that can be easily read by a human. However +-- going back to the exact CmmExpr representation can be laborious and adds +-- indirections to find the matches that lead to the assembly. +-- +-- An improvement oculd be to have +-- +-- (pprExpr genericPlatform e) <> parens (text. show e) +-- +-- to have the best of both worlds. +-- +-- Note: debugIsOn is too restrictive, it only works for debug compilers. +-- However, we do not only want to inspect this for debug compilers. 
Ideally +-- we'd have a check for -dppr-debug here already, such that we don't even +-- generate the ANN expressions. However, as they are lazy, they shouldn't be +-- forced until we actually force them, and without -dppr-debug they should +-- never end up being forced. +annExpr :: CmmExpr -> Instr -> Instr +annExpr e {- debugIsOn -} = ANN (text . show $ e) +-- annExpr e instr {- debugIsOn -} = ANN (pprExpr genericPlatform e) instr +-- annExpr _ instr = instr +{-# INLINE annExpr #-} + +-- ----------------------------------------------------------------------------- +-- Generating a table-branch + +-- | Generate jump to jump table target +-- +-- The index into the jump table is calulated by evaluating @expr@. The +-- corresponding table entry contains the address to jump to. +genSwitch :: NCGConfig -> CmmExpr -> SwitchTargets -> NatM InstrBlock +genSwitch config expr targets = do + (reg, fmt1, e_code) <- getSomeReg indexExpr + let fmt = II64 + tmp <- getNewRegNat fmt + lbl <- getNewLabelNat + dynRef <- cmmMakeDynamicReference config DataReference lbl + (tableReg, fmt2, t_code) <- getSomeReg $ dynRef + let code = + toOL [ COMMENT (text "indexExpr" <+> (text . show) indexExpr) + , COMMENT (text "dynRef" <+> (text . show) dynRef) + ] + `appOL` e_code + `appOL` t_code + `appOL` toOL + [ + COMMENT (ftext "Jump table for switch") + , annExpr expr (LSL (OpReg (formatToWidth fmt1) reg) (OpReg (formatToWidth fmt1) reg) (OpImm (ImmInt 3))) + , ADD (OpReg W64 tmp) (OpReg (formatToWidth fmt1) reg) (OpReg (formatToWidth fmt2) tableReg) + , LDRU II64 (OpReg W64 tmp) (OpAddr (AddrRegImm tmp (ImmInt 0))) + , J_TBL ids (Just lbl) tmp + ] + return code + where + -- See Note [Sub-word subtlety during jump-table indexing] in + -- GHC.CmmToAsm.X86.CodeGen for why we must first offset, then widen. 
+ indexExpr0 = cmmOffset platform expr offset + -- We widen to a native-width register to sanitize the high bits + indexExpr = + CmmMachOp + (MO_UU_Conv expr_w (platformWordWidth platform)) + [indexExpr0] + expr_w = cmmExprWidth platform expr + (offset, ids) = switchTargetsToTable targets + platform = ncgPlatform config + +-- | Generate jump table data (if required) +-- +-- Relies on PIC relocations. The idea is to emit one table entry per case. The +-- entry is the label of the block to jump to. This will be relocated to be the +-- address of the jump target. +generateJumpTableForInstr :: + NCGConfig -> + Instr -> + Maybe (NatCmmDecl RawCmmStatics Instr) +generateJumpTableForInstr config (J_TBL ids (Just lbl) _) = + let jumpTable = + map jumpTableEntryRel ids + where + jumpTableEntryRel Nothing = + CmmStaticLit (CmmInt 0 (ncgWordWidth config)) + jumpTableEntryRel (Just blockid) = + CmmStaticLit (CmmLabel blockLabel) + where + blockLabel = blockLbl blockid + in Just (CmmData (Section ReadOnlyData lbl) (CmmStaticsRaw lbl jumpTable)) +generateJumpTableForInstr _ _ = Nothing + +-- ----------------------------------------------------------------------------- +-- Top-level of the instruction selector + +-- See Note [Keeping track of the current block] for why +-- we pass the BlockId. +stmtsToInstrs :: BlockId -- ^ Basic block these statement will start to be placed in. + -> [CmmNode O O] -- ^ Cmm Statement + -> NatM (InstrBlock, BlockId) -- ^ Resulting instruction +stmtsToInstrs bid stmts = + go bid stmts nilOL + where + go bid [] instrs = return (instrs,bid) + go bid (s:stmts) instrs = do + (instrs',bid') <- stmtToInstrs bid s + -- If the statement introduced a new block, we use that one + let !newBid = fromMaybe bid bid' + go newBid stmts (instrs `appOL` instrs') + +-- | `bid` refers to the current block and is used to update the CFG +-- if new blocks are inserted in the control flow. +-- See Note [Keeping track of the current block] for more details. 
+stmtToInstrs :: BlockId -- ^ Basic block this statement will start to be placed in. + -> CmmNode e x + -> NatM (InstrBlock, Maybe BlockId) + -- ^ Instructions, and bid of new block if successive + -- statements are placed in a different basic block. +stmtToInstrs bid stmt = do + -- traceM $ "-- -------------------------- stmtToInstrs -------------------------- --\n" + -- ++ showSDocUnsafe (ppr stmt) + config <- getConfig + platform <- getPlatform + case stmt of + CmmUnsafeForeignCall target result_regs args + -> genCCall target result_regs args bid + + _ -> (,Nothing) <$> case stmt of + CmmComment s -> return (unitOL (COMMENT (ftext s))) + CmmTick {} -> return nilOL + + CmmAssign reg src + | isFloatType ty -> assignReg_FltCode format reg src + | otherwise -> assignReg_IntCode format reg src + where ty = cmmRegType reg + format = cmmTypeFormat ty + + CmmStore addr src _alignment + | isFloatType ty -> assignMem_FltCode format addr src + | otherwise -> assignMem_IntCode format addr src + where ty = cmmExprType platform src + format = cmmTypeFormat ty + + CmmBranch id -> genBranch id + + --We try to arrange blocks such that the likely branch is the fallthrough + --in GHC.Cmm.ContFlowOpt. So we can assume the condition is likely false here. + CmmCondBranch arg true false _prediction -> + genCondBranch bid true false arg + + CmmSwitch arg ids -> genSwitch config arg ids + + CmmCall { cml_target = arg } -> genJump arg + + CmmUnwind _regs -> return nilOL + + _ -> pprPanic "stmtToInstrs: statement should have been cps'd away" (pdoc platform stmt) + +-------------------------------------------------------------------------------- +-- | 'InstrBlock's are the insn sequences generated by the insn selectors. +-- They are really trees of insns to facilitate fast appending, where a +-- left-to-right traversal yields the insns in the correct order. +-- +type InstrBlock + = OrdList Instr + +-- | Register's passed up the tree. 
If the stix code forces the register +-- to live in a pre-decided machine register, it comes out as @Fixed@; +-- otherwise, it comes out as @Any@, and the parent can decide which +-- register to put it in. +-- +data Register + = Fixed Format Reg InstrBlock + | Any Format (Reg -> InstrBlock) + +-- | Sometimes we need to change the Format of a register. Primarily during +-- conversion. +swizzleRegisterRep :: Format -> Register -> Register +swizzleRegisterRep format (Fixed _ reg code) = Fixed format reg code +swizzleRegisterRep format (Any _ codefn) = Any format codefn + +-- | Grab the Reg for a CmmReg +getRegisterReg :: Platform -> CmmReg -> Reg + +getRegisterReg _ (CmmLocal (LocalReg u pk)) + = RegVirtual $ mkVirtualReg u (cmmTypeFormat pk) + +getRegisterReg platform (CmmGlobal reg@(GlobalRegUse mid _)) + = case globalRegMaybe platform mid of + Just reg -> RegReal reg + Nothing -> pprPanic "getRegisterReg-memory" (ppr $ CmmGlobal reg) + -- By this stage, the only MagicIds remaining should be the + -- ones which map to a real machine register on this + -- platform. Hence if it's not mapped to a registers something + -- went wrong earlier in the pipeline. + +-- ----------------------------------------------------------------------------- +-- General things for putting together code sequences + +-- | The dual to getAnyReg: compute an expression into a register, but +-- we don't mind which one it is. +getSomeReg :: CmmExpr -> NatM (Reg, Format, InstrBlock) +getSomeReg expr = do + r <- getRegister expr + case r of + Any rep code -> do + tmp <- getNewRegNat rep + return (tmp, rep, code tmp) + Fixed rep reg code -> + return (reg, rep, code) + +-- TODO OPT: we might be able give getRegister +-- a hint, what kind of register we want. 
+getFloatReg :: HasCallStack => CmmExpr -> NatM (Reg, Format, InstrBlock) +getFloatReg expr = do + r <- getRegister expr + case r of + Any rep code | isFloatFormat rep -> do + tmp <- getNewRegNat rep + return (tmp, rep, code tmp) + Any II32 code -> do + tmp <- getNewRegNat FF32 + return (tmp, FF32, code tmp) + Any II64 code -> do + tmp <- getNewRegNat FF64 + return (tmp, FF64, code tmp) + Any _w _code -> do + config <- getConfig + pprPanic "can't do getFloatReg on" (pdoc (ncgPlatform config) expr) + -- can't do much for fixed. + Fixed rep reg code -> + return (reg, rep, code) + +-- TODO: TODO, bounds. We can't put any immediate +-- value in. They are constrained. +-- See Ticket 19911 +litToImm' :: CmmLit -> NatM (Operand, InstrBlock) +litToImm' lit = return (OpImm (litToImm lit), nilOL) + +getRegister :: CmmExpr -> NatM Register +getRegister e = do + config <- getConfig + getRegister' config (ncgPlatform config) e + +-- | The register width to be used for an operation on the given width +-- operand. +opRegWidth :: Width -> Width +opRegWidth W64 = W64 -- x +opRegWidth W32 = W32 -- w +opRegWidth W16 = W32 -- w +opRegWidth W8 = W32 -- w +opRegWidth w = pprPanic "opRegWidth" (text "Unsupported width" <+> ppr w) + +-- Note [Signed arithmetic on RISCV64] +-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +-- Handling signed arithmetic on sub-word-size values on RISCV64 is a bit +-- tricky as Cmm's type system does not capture signedness. While 32-bit values +-- are fairly easy to handle due to RISCV64's 32-bit instruction variants +-- (denoted by use of %wN registers), 16- and 8-bit values require quite some +-- care. +-- +-- We handle 16-and 8-bit values by using the 32-bit operations and +-- sign-/zero-extending operands and truncate results as necessary. For +-- simplicity we maintain the invariant that a register containing a +-- sub-word-size value always contains the zero-extended form of that value +-- in between operations. 
+-- +-- For instance, consider the program, +-- +-- test(bits64 buffer) +-- bits8 a = bits8[buffer]; +-- bits8 b = %mul(a, 42); +-- bits8 c = %not(b); +-- bits8 d = %shrl(c, 4::bits8); +-- return (d); +-- } +-- +-- This program begins by loading `a` from memory, for which we use a +-- zero-extended byte-size load. We next sign-extend `a` to 32-bits, and use a +-- 32-bit multiplication to compute `b`, and truncate the result back down to +-- 8-bits. +-- +-- Next we compute `c`: The `%not` requires no extension of its operands, but +-- we must still truncate the result back down to 8-bits. Finally the `%shrl` +-- requires no extension and no truncate since we can assume that +-- `c` is zero-extended. +-- +-- The "RISC-V Sign Extension Optimizations" LLVM tech talk presentation by +-- Craig Topper covers possible future improvements +-- (https://llvm.org/devmtg/2022-11/slides/TechTalk21-RISC-VSignExtensionOptimizations.pdf) +-- +-- TODO: +-- Don't use Width in Operands +-- Instructions should rather carry a RegWidth +-- +-- Note [Handling PIC on RV64] +-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +-- RV64 does not have a special PIC register, the general approach is to +-- simply go through the GOT, and there is assembly support for this: +-- +-- rv64 assembly has a `la` (load address) pseudo-instruction, that allows +-- loading a label, ... into a register. The instruction is desugared into +-- +-- 1: lui rd1, %pcrel_hi(label) +-- addi rd1, %pcrel_lo(1b) +-- +-- See https://sourceware.org/binutils/docs/as/RISC_002dV_002dModifiers.html, +-- PIC can be enabled/disabled through +-- +-- .option pic +-- +-- See https://sourceware.org/binutils/docs/as/RISC_002dV_002dDirectives.html#RISC_002dV_002dDirectives +-- +-- CmmGlobal @PicBaseReg@'s are generated in @GHC.CmmToAsm.PIC@ in the +-- @cmmMakePicReference@. This is in turn called from @cmmMakeDynamicReference@ +-- also in @Cmm.CmmToAsm.PIC@ from where it is also exported. There are two +-- callsites for this. 
One is in this module to produce the @target@ in @genCCall@ +-- the other is in @GHC.CmmToAsm@ in @cmmExprNative@. +-- +-- Conceptually we do not want any special PicBaseReg to be used on RV64. If +-- we want to distinguish between symbol loading, we need to address this through +-- the way we load it, not through a register. +-- + +getRegister' :: NCGConfig -> Platform -> CmmExpr -> NatM Register +-- OPTIMIZATION WARNING: CmmExpr rewrites +-- 1. Rewrite: Reg + (-n) => Reg - n +-- TODO: this expression shouldn't even be generated to begin with. +getRegister' config plat (CmmMachOp (MO_Add w0) [x, CmmLit (CmmInt i w1)]) | i < 0 + = getRegister' config plat (CmmMachOp (MO_Sub w0) [x, CmmLit (CmmInt (-i) w1)]) + +getRegister' config plat (CmmMachOp (MO_Sub w0) [x, CmmLit (CmmInt i w1)]) | i < 0 + = getRegister' config plat (CmmMachOp (MO_Add w0) [x, CmmLit (CmmInt (-i) w1)]) + +-- Generic case. +getRegister' config plat expr = + case expr of + CmmReg (CmmGlobal (GlobalRegUse PicBaseReg _)) -> + pprPanic "getRegisterReg-memory" (ppr PicBaseReg) + + CmmLit lit -> + case lit of + CmmInt 0 w -> pure $ Fixed (intFormat w) zeroReg nilOL + CmmInt i w -> + -- narrowU is important: Negative immediates may be + -- sign-extended on load! + let imm = OpImm . 
ImmInteger $ narrowU w i + in + pure (Any (intFormat w) (\dst -> unitOL $ annExpr expr (MOV (OpReg w dst) imm))) + + -- floatToBytes (fromRational f) + CmmFloat 0 w -> do + (op, imm_code) <- litToImm' lit + return (Any (floatFormat w) (\dst -> imm_code `snocOL` annExpr expr (MOV (OpReg w dst) op))) + + CmmFloat _f W8 -> pprPanic "getRegister' (CmmLit:CmmFloat), no support for bytes" (pdoc plat expr) + CmmFloat _f W16 -> pprPanic "getRegister' (CmmLit:CmmFloat), no support for halfs" (pdoc plat expr) + CmmFloat f W32 -> do + let word = castFloatToWord32 (fromRational f) :: Word32 + tmp <- getNewRegNat (intFormat W32) + return (Any (floatFormat W32) (\dst -> toOL [ annExpr expr + $ MOV (OpReg W32 tmp) (OpImm (ImmInteger (fromIntegral word))) + , MOV (OpReg W32 dst) (OpReg W32 tmp) + ])) + CmmFloat f W64 -> do + let word = castDoubleToWord64 (fromRational f) :: Word64 + tmp <- getNewRegNat (intFormat W64) + return (Any (floatFormat W64) (\dst -> toOL [ annExpr expr + $ MOV (OpReg W64 tmp) (OpImm (ImmInteger (fromIntegral word))) + , MOV (OpReg W64 dst) (OpReg W64 tmp) + ])) + CmmFloat _f _w -> pprPanic "getRegister' (CmmLit:CmmFloat), unsupported float lit" (pdoc plat expr) + CmmVec _ -> pprPanic "getRegister' (CmmLit:CmmVec): " (pdoc plat expr) + CmmLabel _lbl -> do + (op, imm_code) <- litToImm' lit + let rep = cmmLitType plat lit + format = cmmTypeFormat rep + return (Any format (\dst -> imm_code `snocOL` annExpr expr (LDR format (OpReg (formatToWidth format) dst) op))) + + CmmLabelOff _lbl off | isNbitEncodeable 12 (fromIntegral off) -> do + (op, imm_code) <- litToImm' lit + let rep = cmmLitType plat lit + format = cmmTypeFormat rep + return (Any format (\dst -> imm_code `snocOL` LDR format (OpReg (formatToWidth format) dst) op)) + + CmmLabelOff lbl off -> do + (op, imm_code) <- litToImm' (CmmLabel lbl) + let rep = cmmLitType plat lit + format = cmmTypeFormat rep + width = typeWidth rep + (off_r, _off_format, off_code) <- getSomeReg $ CmmLit (CmmInt (fromIntegral 
off) width) + return (Any format (\dst -> imm_code `appOL` off_code `snocOL` LDR format (OpReg (formatToWidth format) dst) op `snocOL` ADD (OpReg width dst) (OpReg width dst) (OpReg width off_r))) + + CmmLabelDiffOff {} -> pprPanic "getRegister' (CmmLit:CmmLabelOff): " (pdoc plat expr) + CmmBlock _ -> pprPanic "getRegister' (CmmLit:CmmLabelOff): " (pdoc plat expr) + CmmHighStackMark -> pprPanic "getRegister' (CmmLit:CmmLabelOff): " (pdoc plat expr) + CmmLoad mem rep _ -> do + let format = cmmTypeFormat rep + width = typeWidth rep + Amode addr addr_code <- getAmode plat width mem + case width of + w | w <= W64 -> + -- Load without sign-extension. See Note [Signed arithmetic on RISCV64] + pure (Any format (\dst -> addr_code `snocOL` LDRU format (OpReg width dst) (OpAddr addr))) + _ -> + pprPanic ("Width too big! Cannot load: " ++ show width) (pdoc plat expr) + + CmmStackSlot _ _ + -> pprPanic "getRegister' (CmmStackSlot): " (pdoc plat expr) + CmmReg reg + -> return (Fixed (cmmTypeFormat (cmmRegType reg)) + (getRegisterReg plat reg) + nilOL) + CmmRegOff reg off | isNbitEncodeable 12 (fromIntegral off) -> do + getRegister' config plat $ + CmmMachOp (MO_Add width) [CmmReg reg, CmmLit (CmmInt (fromIntegral off) width)] + where width = typeWidth (cmmRegType reg) + + CmmRegOff reg off -> do + (off_r, _off_format, off_code) <- getSomeReg $ CmmLit (CmmInt (fromIntegral off) width) + (reg, _format, code) <- getSomeReg $ CmmReg reg + return $ Any (intFormat width) (\dst -> off_code `appOL` code `snocOL` ADD (OpReg width dst) (OpReg width reg) (OpReg width off_r)) + where width = typeWidth (cmmRegType reg) + + + + -- for MachOps, see GHC.Cmm.MachOp + -- For CmmMachOp, see GHC.Cmm.Expr + CmmMachOp op [e] -> do + (reg, _format, code) <- getSomeReg e + case op of + MO_Not w -> return $ Any (intFormat w) $ \dst -> + let w' = opRegWidth w + in code `snocOL` + -- pseudo instruction `not` is `xori rd, rs, -1` + ann (text "not") (XORI (OpReg w' dst) (OpReg w' reg) (OpImm (ImmInt 
(-1)))) `appOL` + truncateReg w' w dst -- See Note [Signed arithmetic on RISCV64] + + MO_S_Neg w -> negate code w reg + MO_F_Neg w -> return $ Any (floatFormat w) (\dst -> code `snocOL` NEG (OpReg w dst) (OpReg w reg)) + + -- TODO: Can this case happen? + MO_SF_Conv from to | from < W32 -> do + -- extend to the smallest available representation + (reg_x, code_x) <- signExtendReg from W32 reg + pure $ Any (floatFormat to) + (\dst -> code `appOL` code_x `snocOL` annExpr expr (SCVTF (OpReg to dst) (OpReg from reg_x))) -- (Signed ConVerT Float) + MO_SF_Conv from to -> pure $ Any (floatFormat to) (\dst -> code `snocOL` annExpr expr (SCVTF (OpReg to dst) (OpReg from reg))) -- (Signed ConVerT Float) + -- TODO: Can this case happen? + MO_FS_Conv from to | to < W32 -> pure $ Any (intFormat to) (\dst -> + code `snocOL` + -- W32 is the smallest width to convert to. Decrease width afterwards. + annExpr expr (FCVTZS (OpReg W32 dst) (OpReg from reg)) `appOL` + signExtendAdjustPrecission W32 to dst dst) -- (float convert (-> zero) signed) + MO_FS_Conv from to -> pure $ Any (intFormat to) (\dst -> + code `snocOL` + annExpr expr (FCVTZS (OpReg to dst) (OpReg from reg)) `appOL` -- (float convert (-> zero) signed) + truncateReg from to dst) + MO_UU_Conv from to | from <= to -> pure $ Any (intFormat to) (\dst -> + code `snocOL` + annExpr e (MOV (OpReg to dst) (OpReg from reg)) + ) + + MO_UU_Conv from to -> pure $ Any (intFormat to) (\dst -> + code `snocOL` + annExpr e (MOV (OpReg from dst) (OpReg from reg)) `appOL` + truncateReg from to dst + ) + MO_SS_Conv from to -> ss_conv from to reg code + MO_FF_Conv from to -> return $ Any (floatFormat to) (\dst -> code `snocOL` annExpr e (FCVT (OpReg to dst) (OpReg from reg))) + + -- Conversions + MO_XX_Conv _from to -> swizzleRegisterRep (intFormat to) <$> getRegister e + MO_AlignmentCheck align wordWidth -> do + reg <- getRegister' config plat e + addAlignmentCheck align wordWidth reg + + _ -> pprPanic "getRegister' (monadic CmmMachOp):" 
(pdoc plat expr) + where + -- In the case of 16- or 8-bit values we need to sign-extend to 32-bits + -- See Note [Signed arithmetic on RISCV64]. + negate code w reg = do + let w' = opRegWidth w + (reg', code_sx) <- signExtendReg w w' reg + return $ Any (intFormat w) $ \dst -> + code `appOL` + code_sx `snocOL` + NEG (OpReg w' dst) (OpReg w' reg') `appOL` + truncateReg w' w dst + + ss_conv from to reg code + | from < to = do + pure $ Any (intFormat to) $ \dst -> + code + `appOL` signExtend from to reg dst + `appOL` truncateReg from to dst + | from > to = + pure $ Any (intFormat to) $ \dst -> + code + `appOL` toOL + [ ann + (text "MO_SS_Conv: narrow register signed" <+> ppr reg <+> ppr from <> text "->" <> ppr to) + (LSL (OpReg to dst) (OpReg from reg) (OpImm (ImmInt shift))), + -- signed right shift + ASR (OpReg to dst) (OpReg to dst) (OpImm (ImmInt shift)) + ] + `appOL` truncateReg from to dst + | otherwise = + -- No conversion necessary: Just copy. + pure $ Any (intFormat from) $ \dst -> + code `snocOL` MOV (OpReg from dst) (OpReg from reg) + where + shift = 64 - (widthInBits from - widthInBits to) + +-- Dyadic machops: + -- + -- The general idea is: + -- compute x<i> <- x + -- compute x<j> <- y + -- OP x<r>, x<i>, x<j> + -- + -- TODO: for now we'll only implement the 64bit versions. And rely on the + -- fallthrough to alert us if things go wrong! + -- OPTIMIZATION WARNING: Dyadic CmmMachOp destructuring + -- 0. TODO This should not exist! Rewrite: Reg +- 0 -> Reg + CmmMachOp (MO_Add _) [expr'@(CmmReg (CmmGlobal _r)), CmmLit (CmmInt 0 _)] -> getRegister' config plat expr' + CmmMachOp (MO_Sub _) [expr'@(CmmReg (CmmGlobal _r)), CmmLit (CmmInt 0 _)] -> getRegister' config plat expr' + -- 1. Compute Reg +/- n directly. + -- For Add/Sub we can directly encode 12bits, or 12bits lsl #12. 
+ CmmMachOp (MO_Add w) [CmmReg reg, CmmLit (CmmInt n _)] + | fitsIn12bitImm n -> return $ Any (intFormat w) (\d -> unitOL $ annExpr expr (ADD (OpReg w d) (OpReg w' r') (OpImm (ImmInteger n)))) + -- TODO: 12bits lsl #12; e.g. lower 12 bits of n are 0; shift n >> 12, and set lsl to #12. + where w' = formatToWidth (cmmTypeFormat (cmmRegType reg)) + r' = getRegisterReg plat reg + CmmMachOp (MO_Sub w) [CmmReg reg, CmmLit (CmmInt n _)] + | fitsIn12bitImm n -> return $ Any (intFormat w) (\d -> unitOL $ annExpr expr (SUB (OpReg w d) (OpReg w' r') (OpImm (ImmInteger n)))) + -- TODO: 12bits lsl #12; e.g. lower 12 bits of n are 0; shift n >> 12, and set lsl to #12. + where w' = formatToWidth (cmmTypeFormat (cmmRegType reg)) + r' = getRegisterReg plat reg + + CmmMachOp (MO_U_Quot w) [x, y] | w == W8 || w == W16 -> do + (reg_x, format_x, code_x) <- getSomeReg x + (reg_y, format_y, code_y) <- getSomeReg y + return $ Any (intFormat w) (\dst -> code_x `appOL` + truncateReg (formatToWidth format_x) w reg_x `appOL` + code_y `appOL` + truncateReg (formatToWidth format_y) w reg_y `snocOL` + annExpr expr (DIVU (OpReg w dst) (OpReg w reg_x) (OpReg w reg_y))) + + -- 2. Shifts. x << n, x >> n. 
+ CmmMachOp (MO_Shl w) [x, CmmLit (CmmInt n _)] | w == W32, 0 <= n, n < 32 -> do + (reg_x, _format_x, code_x) <- getSomeReg x + return $ Any (intFormat w) (\dst -> + code_x `snocOL` + annExpr expr (LSL (OpReg w dst) (OpReg w reg_x) (OpImm (ImmInteger n))) `appOL` + truncateReg w w dst + ) + CmmMachOp (MO_Shl w) [x, CmmLit (CmmInt n _)] | w == W64, 0 <= n, n < 64 -> do + (reg_x, _format_x, code_x) <- getSomeReg x + return $ Any (intFormat w) (\dst -> + code_x `snocOL` + annExpr expr (LSL (OpReg w dst) (OpReg w reg_x) (OpImm (ImmInteger n))) `appOL` + truncateReg w w dst + ) + + CmmMachOp (MO_S_Shr w) [x, CmmLit (CmmInt n _)] | fitsIn12bitImm n -> do + (reg_x, format_x, code_x) <- getSomeReg x + (reg_x', code_x') <- signExtendReg (formatToWidth format_x) w reg_x + return $ Any (intFormat w) ( + \dst -> + code_x `appOL` code_x' `snocOL` annExpr expr (ASR (OpReg w dst) (OpReg w reg_x') (OpImm (ImmInteger n))) + ) + CmmMachOp (MO_S_Shr w) [x, y] -> do + (reg_x, format_x, code_x) <- getSomeReg x + (reg_y, _format_y, code_y) <- getSomeReg y + (reg_x', code_x') <- signExtendReg (formatToWidth format_x) w reg_x + return $ Any (intFormat w) ( + \dst -> + code_x `appOL` code_x' `appOL` code_y `snocOL` annExpr expr (ASR (OpReg w dst) (OpReg w reg_x') (OpReg w reg_y)) + ) + + CmmMachOp (MO_U_Shr w) [x, CmmLit (CmmInt n _)] | w == W8, 0 <= n, n < 8 -> do + (reg_x, format_x, code_x) <- getSomeReg x + return $ Any (intFormat w) (\dst -> code_x `appOL` truncateReg (formatToWidth format_x) w reg_x `snocOL` annExpr expr (LSR (OpReg w dst) (OpReg w reg_x) (OpImm (ImmInteger n)))) + CmmMachOp (MO_U_Shr w) [x, CmmLit (CmmInt n _)] | w == W16, 0 <= n, n < 16 -> do + (reg_x, format_x, code_x) <- getSomeReg x + return $ Any (intFormat w) (\dst -> code_x `appOL` truncateReg (formatToWidth format_x) w reg_x `snocOL` annExpr expr (LSR (OpReg w dst) (OpReg w reg_x) (OpImm (ImmInteger n)))) + CmmMachOp (MO_U_Shr w) [x, y] | w == W8 || w == W16 -> do + (reg_x, format_x, code_x) <- getSomeReg x + 
(reg_y, _format_y, code_y) <- getSomeReg y + return $ Any (intFormat w) (\dst -> code_x `appOL` code_y `appOL` truncateReg (formatToWidth format_x) w reg_x `snocOL` annExpr expr (LSR (OpReg w dst) (OpReg w reg_x) (OpReg w reg_y))) + CmmMachOp (MO_U_Shr w) [x, CmmLit (CmmInt n _)] | w == W32, 0 <= n, n < 32 -> do + (reg_x, _format_x, code_x) <- getSomeReg x + return $ Any (intFormat w) (\dst -> code_x `snocOL` annExpr expr (LSR (OpReg w dst) (OpReg w reg_x) (OpImm (ImmInteger n)))) + CmmMachOp (MO_U_Shr w) [x, CmmLit (CmmInt n _)] | w == W64, 0 <= n, n < 64 -> do + (reg_x, _format_x, code_x) <- getSomeReg x + return $ Any (intFormat w) (\dst -> code_x `snocOL` annExpr expr (LSR (OpReg w dst) (OpReg w reg_x) (OpImm (ImmInteger n)))) + + -- 3. Logic &&, || + CmmMachOp (MO_And w) [CmmReg reg, CmmLit (CmmInt n _)] | fitsIn12bitImm n -> + return $ Any (intFormat w) (\d -> unitOL $ annExpr expr (AND (OpReg w d) (OpReg w' r') (OpImm (ImmInteger n)))) + where w' = formatToWidth (cmmTypeFormat (cmmRegType reg)) + r' = getRegisterReg plat reg + + CmmMachOp (MO_Or w) [CmmReg reg, CmmLit (CmmInt n _)] | fitsIn12bitImm n -> + return $ Any (intFormat w) (\d -> unitOL $ annExpr expr (ORI (OpReg w d) (OpReg w' r') (OpImm (ImmInteger n)))) + where w' = formatToWidth (cmmTypeFormat (cmmRegType reg)) + r' = getRegisterReg plat reg + + -- Generic case. + CmmMachOp op [x, y] -> do + let + -- A "plain" operation. + bitOp w op = do + -- compute x<m> <- x + -- compute x<o> <- y + -- <OP> x<n>, x<m>, x<o> + (reg_x, format_x, code_x) <- getSomeReg x + (reg_y, format_y, code_y) <- getSomeReg y + massertPpr (isIntFormat format_x == isIntFormat format_y) $ text "bitOp: incompatible" + return $ Any (intFormat w) (\dst -> + code_x `appOL` + code_y `appOL` + op (OpReg w dst) (OpReg w reg_x) (OpReg w reg_y)) + + -- A (potentially signed) integer operation. + -- In the case of 8- and 16-bit signed arithmetic we must first + -- sign-extend both arguments to 32-bits. 
+ -- See Note [Signed arithmetic on RISCV64]. + intOp is_signed w op = do + -- compute x<m> <- x + -- compute x<o> <- y + -- <OP> x<n>, x<m>, x<o> + (reg_x, format_x, code_x) <- getSomeReg x + (reg_y, format_y, code_y) <- getSomeReg y + massertPpr (isIntFormat format_x && isIntFormat format_y) $ text "intOp: non-int" + -- This is the width of the registers on which the operation + -- should be performed. + let w' = opRegWidth w + signExt r + | not is_signed = return (r, nilOL) + | otherwise = signExtendReg w w' r + (reg_x_sx, code_x_sx) <- signExt reg_x + (reg_y_sx, code_y_sx) <- signExt reg_y + return $ Any (intFormat w) $ \dst -> + code_x `appOL` + code_y `appOL` + -- sign-extend both operands + code_x_sx `appOL` + code_y_sx `appOL` + op (OpReg w' dst) (OpReg w' reg_x_sx) (OpReg w' reg_y_sx) `appOL` + truncateReg w' w dst -- truncate back to the operand's original width + + floatOp w op = do + (reg_fx, format_x, code_fx) <- getFloatReg x + (reg_fy, format_y, code_fy) <- getFloatReg y + massertPpr (isFloatFormat format_x && isFloatFormat format_y) $ text "floatOp: non-float" + return $ Any (floatFormat w) (\dst -> code_fx `appOL` code_fy `appOL` op (OpReg w dst) (OpReg w reg_fx) (OpReg w reg_fy)) + + -- need a special one for conditionals, as they return ints + floatCond w op = do + (reg_fx, format_x, code_fx) <- getFloatReg x + (reg_fy, format_y, code_fy) <- getFloatReg y + massertPpr (isFloatFormat format_x && isFloatFormat format_y) $ text "floatCond: non-float" + return $ Any (intFormat w) (\dst -> code_fx `appOL` code_fy `appOL` op (OpReg w dst) (OpReg w reg_fx) (OpReg w reg_fy)) + + case op of + -- Integer operations + -- Add/Sub should only be Integer Options. 
+ MO_Add w -> intOp False w (\d x y -> unitOL $ annExpr expr (ADD d x y)) + -- TODO: Handle sub-word case + MO_Sub w -> intOp False w (\d x y -> unitOL $ annExpr expr (SUB d x y)) + + -- TODO: Check if this comment is correct + -- Note [CSET] + -- ~~~~~~~~~~~ + -- Setting conditional flags: the architecture internally knows the + -- following flag bits. And based on thsoe comparisons as in the + -- table below. + -- + -- 31 30 29 28 + -- .---+---+---+---+-- - - + -- | N | Z | C | V | + -- '---+---+---+---+-- - - + -- Negative + -- Zero + -- Carry + -- oVerflow + -- + -- .------+-------------------------------------+-----------------+----------. + -- | Code | Meaning | Flags | Encoding | + -- |------+-------------------------------------+-----------------+----------| + -- | EQ | Equal | Z = 1 | 0000 | + -- | NE | Not Equal | Z = 0 | 0001 | + -- | HI | Unsigned Higher | C = 1 && Z = 0 | 1000 | + -- | HS | Unsigned Higher or Same | C = 1 | 0010 | + -- | LS | Unsigned Lower or Same | C = 0 || Z = 1 | 1001 | + -- | LO | Unsigned Lower | C = 0 | 0011 | + -- | GT | Signed Greater Than | Z = 0 && N = V | 1100 | + -- | GE | Signed Greater Than or Equal | N = V | 1010 | + -- | LE | Signed Less Than or Equal | Z = 1 || N /= V | 1101 | + -- | LT | Signed Less Than | N /= V | 1011 | + -- | CS | Carry Set (Unsigned Overflow) | C = 1 | 0010 | + -- | CC | Carry Clear (No Unsigned Overflow) | C = 0 | 0011 | + -- | VS | Signed Overflow | V = 1 | 0110 | + -- | VC | No Signed Overflow | V = 0 | 0111 | + -- | MI | Minus, Negative | N = 1 | 0100 | + -- | PL | Plus, Positive or Zero (!) | N = 0 | 0101 | + -- | AL | Always | Any | 1110 | + -- | NV | Never | Any | 1111 | + --- '-------------------------------------------------------------------------' + + -- N.B. We needn't sign-extend sub-word size (in)equality comparisons + -- since we don't care about ordering. 
+ MO_Eq w -> bitOp w (\d x y -> unitOL $ annExpr expr (CSET d x y EQ)) + MO_Ne w -> bitOp w (\d x y -> unitOL $ annExpr expr (CSET d x y NE)) + + -- Signed multiply/divide + MO_Mul w -> intOp True w (\d x y -> unitOL $ annExpr expr (MUL d x y)) + MO_S_MulMayOflo w -> do_mul_may_oflo w x y + MO_S_Quot w -> intOp True w (\d x y -> unitOL $ annExpr expr (DIV d x y)) + + MO_S_Rem w -> intOp True w (\d x y -> unitOL $ annExpr expr (REM d x y)) + + -- Unsigned multiply/divide + MO_U_Quot w -> intOp False w (\d x y -> unitOL $ annExpr expr (DIVU d x y)) + MO_U_Rem w -> intOp False w (\d x y -> unitOL $ annExpr expr (REMU d x y)) + + -- Signed comparisons -- see Note [CSET) + MO_S_Ge w -> intOp True w (\d x y -> unitOL $ annExpr expr (CSET d x y SGE)) + MO_S_Le w -> intOp True w (\d x y -> unitOL $ annExpr expr (CSET d x y SLE)) + MO_S_Gt w -> intOp True w (\d x y -> unitOL $ annExpr expr (CSET d x y SGT)) + MO_S_Lt w -> intOp True w (\d x y -> unitOL $ annExpr expr (CSET d x y SLT)) + + -- Unsigned comparisons + MO_U_Ge w -> intOp False w (\d x y -> unitOL $ annExpr expr (CSET d x y UGE)) + MO_U_Le w -> intOp False w (\d x y -> unitOL $ annExpr expr (CSET d x y ULE)) + MO_U_Gt w -> intOp False w (\d x y -> unitOL $ annExpr expr (CSET d x y UGT)) + MO_U_Lt w -> intOp False w (\d x y -> unitOL $ annExpr expr (CSET d x y ULT)) + + -- Floating point arithmetic + MO_F_Add w -> floatOp w (\d x y -> unitOL $ annExpr expr (ADD d x y)) + MO_F_Sub w -> floatOp w (\d x y -> unitOL $ annExpr expr (SUB d x y)) + MO_F_Mul w -> floatOp w (\d x y -> unitOL $ annExpr expr (MUL d x y)) + MO_F_Quot w -> floatOp w (\d x y -> unitOL $ annExpr expr (DIV d x y)) + + -- Floating point comparison + MO_F_Eq w -> floatCond w (\d x y -> unitOL $ annExpr expr (CSET d x y EQ)) + MO_F_Ne w -> floatCond w (\d x y -> unitOL $ annExpr expr (CSET d x y NE)) + + MO_F_Ge w -> floatCond w (\d x y -> unitOL $ annExpr expr (CSET d x y FGE)) + MO_F_Le w -> floatCond w (\d x y -> unitOL $ annExpr expr (CSET d x y 
FLE)) -- x <= y <=> y > x + MO_F_Gt w -> floatCond w (\d x y -> unitOL $ annExpr expr (CSET d x y FGT)) + MO_F_Lt w -> floatCond w (\d x y -> unitOL $ annExpr expr (CSET d x y FLT)) -- x < y <=> y >= x + + -- Bitwise operations + MO_And w -> bitOp w (\d x y -> unitOL $ annExpr expr (AND d x y)) + MO_Or w -> bitOp w (\d x y -> unitOL $ annExpr expr (OR d x y)) + MO_Xor w -> bitOp w (\d x y -> unitOL $ annExpr expr (XOR d x y)) + MO_Shl w -> intOp False w (\d x y -> unitOL $ annExpr expr (LSL d x y)) + MO_U_Shr w -> intOp False w (\d x y -> unitOL $ annExpr expr (LSR d x y)) + MO_S_Shr w -> intOp True w (\d x y -> unitOL $ annExpr expr (ASR d x y)) + + op -> pprPanic "getRegister' (unhandled dyadic CmmMachOp): " $ pprMachOp op <+> text "in" <+> pdoc plat expr + CmmMachOp _op _xs + -> pprPanic "getRegister' (variadic CmmMachOp): " (pdoc plat expr) + + where + isNbitEncodeable :: Int -> Integer -> Bool + isNbitEncodeable n i = let shift = n - 1 in (-1 `shiftL` shift) <= i && i < (1 `shiftL` shift) + -- N.B. MUL does not set the overflow flag. + -- Return 0 when the operation cannot overflow, /= 0 otherwise + do_mul_may_oflo :: Width -> CmmExpr -> CmmExpr -> NatM Register + do_mul_may_oflo w _x _y | w > W64 = pprPanic "Cannot multiply larger than 64bit" (ppr w) + do_mul_may_oflo w@W64 x y = do + (reg_x, format_x, code_x) <- getSomeReg x + (reg_y, format_y, code_y) <- getSomeReg y + -- TODO: Can't we clobber reg_x and reg_y to save registers? 
+ lo <- getNewRegNat II64 + hi <- getNewRegNat II64 + -- TODO: Overhaul CSET: 3rd operand isn't needed for SNEZ + let nonSense = OpImm (ImmInt 0) + pure $ + Any + (intFormat w) + ( \dst -> + code_x + `appOL` signExtend (formatToWidth format_x) W64 reg_x reg_x + `appOL` code_y + `appOL` signExtend (formatToWidth format_y) W64 reg_y reg_y + `appOL` toOL + [ annExpr expr (SMULH (OpReg w hi) (OpReg w reg_x) (OpReg w reg_y)), + MUL (OpReg w lo) (OpReg w reg_x) (OpReg w reg_y), + ASR (OpReg w lo) (OpReg w lo) (OpImm (ImmInt (widthInBits W64 - 1))), + ann + (text "Set flag if result of MULH contains more than sign bits.") + (XOR (OpReg w hi) (OpReg w hi) (OpReg w lo)), + CSET (OpReg w dst) (OpReg w hi) nonSense NE + ] + ) + do_mul_may_oflo w x y = do + (reg_x, format_x, code_x) <- getSomeReg x + (reg_y, format_y, code_y) <- getSomeReg y + let width_x = formatToWidth format_x + width_y = formatToWidth format_y + if w > width_x && w > width_y + then + pure $ + Any + (intFormat w) + ( \dst -> + -- 8bit * 8bit cannot overflow 16bit + -- 16bit * 16bit cannot overflow 32bit + -- 32bit * 32bit cannot overflow 64bit + unitOL $ annExpr expr (ADD (OpReg w dst) zero (OpImm (ImmInt 0))) + ) + else do + let use32BitMul = w <= W32 && width_x <= W32 && width_y <= W32 + nonSense = OpImm (ImmInt 0) + if use32BitMul + then do + narrowedReg <- getNewRegNat II64 + pure $ + Any + (intFormat w) + ( \dst -> + code_x + `appOL` signExtend (formatToWidth format_x) W32 reg_x reg_x + `appOL` code_y + `appOL` signExtend (formatToWidth format_y) W32 reg_y reg_y + `snocOL` annExpr expr (MUL (OpReg W32 dst) (OpReg W32 reg_x) (OpReg W32 reg_y)) + `appOL` signExtendAdjustPrecission W32 w dst narrowedReg + `appOL` toOL + [ ann + (text "Check if the multiplied value fits in the narrowed register") + (SUB (OpReg w dst) (OpReg w dst) (OpReg w narrowedReg)), + CSET (OpReg w dst) (OpReg w dst) nonSense NE + ] + ) + else + pure $ + Any + (intFormat w) + ( \dst -> + -- Do not handle this unlikely case. 
Just tell that it may overflow. + unitOL $ annExpr expr (ADD (OpReg w dst) zero (OpImm (ImmInt 1))) + ) + +-- | Instructions to sign-extend the value in the given register from width @w@ +-- up to width @w'@. +signExtendReg :: Width -> Width -> Reg -> NatM (Reg, OrdList Instr) +signExtendReg w _w' r | w == W64 = pure (r, nilOL) +signExtendReg w w' r = do + r' <- getNewRegNat (intFormat w') + let instrs = signExtend w w' r r' + pure (r', instrs) + +-- | Sign extends to 64bit, if needed +-- +-- Source `Reg` @r@ stays untouched, while the conversion happens on destination +-- `Reg` @r'@. +signExtend :: Width -> Width -> Reg -> Reg -> OrdList Instr +signExtend w w' _r _r' | w > w' = pprPanic "This is not a sign extension, but a truncation." $ ppr w <> text "->" <+> ppr w' +signExtend w w' _r _r' | w > W64 || w' > W64 = pprPanic "Unexpected width (max is 64bit):" $ ppr w <> text "->" <+> ppr w' +signExtend w w' r r' | w == W64 && w' == W64 && r == r' = nilOL +signExtend w w' r r' | w == W64 && w' == W64 = unitOL $ MOV (OpReg w' r') (OpReg w r) +signExtend w w' r r' + | w == W32 && w' == W64 = + unitOL $ + ann + (text "sign-extend register (SEXT.W)" <+> ppr r <+> ppr w <> text "->" <> ppr w') + -- `ADDIW r r 0` is the pseudo-op SEXT.W + (ADD (OpReg w' r') (OpReg w r) (OpImm (ImmInt 0))) +signExtend w w' r r' = + toOL + [ ann + (text "narrow register signed" <+> ppr r <> char ':' <> ppr w <> text "->" <> ppr r <> char ':' <> ppr w') + (LSL (OpReg w' r') (OpReg w r) (OpImm (ImmInt shift))), + -- signed (arithmetic) right shift + ASR (OpReg w' r') (OpReg w' r') (OpImm (ImmInt shift)) + ] + where + shift = 64 - widthInBits w + +-- | Sign extends to 64bit, if needed and reduces the precission to the target `Width` (@w'@) +-- +-- Source `Reg` @r@ stays untouched, while the conversion happens on destination +-- `Reg` @r'@. 
+signExtendAdjustPrecission :: Width -> Width -> Reg -> Reg -> OrdList Instr +signExtendAdjustPrecission w w' _r _r' | w > W64 || w' > W64 = pprPanic "Unexpected width (max is 64bit):" $ ppr w <> text "->" <+> ppr w' +signExtendAdjustPrecission w w' r r' | w == W64 && w' == W64 && r == r' = nilOL +signExtendAdjustPrecission w w' r r' | w == W64 && w' == W64 = unitOL $ MOV (OpReg w' r') (OpReg w r) +signExtendAdjustPrecission w w' r r' + | w == W32 && w' == W64 = + unitOL $ + ann + (text "sign-extend register (SEXT.W)" <+> ppr r <+> ppr w <> text "->" <> ppr w') + -- `ADDIW r r 0` is the pseudo-op SEXT.W + (ADD (OpReg w' r') (OpReg w r) (OpImm (ImmInt 0))) +signExtendAdjustPrecission w w' r r' | w > w' = + toOL + [ ann + (text "narrow register signed" <+> ppr r <> char ':' <> ppr w <> text "->" <> ppr r <> char ':' <> ppr w') + (LSL (OpReg w' r') (OpReg w r) (OpImm (ImmInt shift))), + -- signed (arithmetic) right shift + ASR (OpReg w' r') (OpReg w' r') (OpImm (ImmInt shift)) + ] + where + shift = 64 - widthInBits w' +signExtendAdjustPrecission w w' r r' = + toOL + [ ann + (text "sign extend register" <+> ppr r <> char ':' <> ppr w <> text "->" <> ppr r <> char ':' <> ppr w') + (LSL (OpReg w' r') (OpReg w r) (OpImm (ImmInt shift))), + -- signed (arithmetic) right shift + ASR (OpReg w' r') (OpReg w' r') (OpImm (ImmInt shift)) + ] + where + shift = 64 - widthInBits w + +-- | Instructions to truncate the value in the given register from width @w@ +-- to width @w'@. +-- +-- In other words, it just cuts the width out of the register. N.B.: This +-- ignores signedness (no sign extension takes place)! 
+truncateReg :: Width -> Width -> Reg -> OrdList Instr +truncateReg _w w' _r | w' == W64 = nilOL +truncateReg _w w' r | w' > W64 = pprPanic "Cannot truncate to width bigger than register size (max is 64bit):" $ text (show r) <> char ':' <+> ppr w' +truncateReg w _w' r | w > W64 = pprPanic "Unexpected register size (max is 64bit):" $ text (show r) <> char ':' <+> ppr w +truncateReg w w' r = + toOL + [ ann + (text "truncate register" <+> ppr r <+> ppr w <> text "->" <> ppr w') + (LSL (OpReg w' r) (OpReg w r) (OpImm (ImmInt shift))), + -- SHL ignores signedness! + LSR (OpReg w' r) (OpReg w r) (OpImm (ImmInt shift)) + ] + where + shift = 64 - widthInBits w' + +-- | Given a 'Register', produce a new 'Register' with an instruction block +-- which will check the value for alignment. Used for @-falignment-sanitisation@. +addAlignmentCheck :: Int -> Width -> Register -> NatM Register +addAlignmentCheck align wordWidth reg = do + jumpReg <- getNewRegNat II64 + cmpReg <- getNewRegNat II64 + okayLblId <- getBlockIdNat + + pure $ case reg of + Fixed fmt reg code -> Fixed fmt reg (code `appOL` check fmt jumpReg cmpReg okayLblId reg) + Any fmt f -> Any fmt (\reg -> f reg `appOL` check fmt jumpReg cmpReg okayLblId reg) + where + -- TODO: Reduce amount of parameters by making this a let binding + check :: Format -> Reg -> Reg -> BlockId -> Reg -> InstrBlock + check fmt jumpReg cmpReg okayLblId reg = + let width = formatToWidth fmt + in assert (not $ isFloatFormat fmt) + $ toOL + [ ann + (text "Alignment check - alignment: " <> int align <> text ", word width: " <> text (show wordWidth)) + (AND (OpReg width cmpReg) (OpReg width reg) (OpImm $ ImmInt $ align - 1)) + , BCOND EQ (OpReg width cmpReg) zero (TBlock okayLblId) + , COMMENT (text "Alignment check failed") + , LDR II64 (OpReg W64 jumpReg) (OpImm $ ImmCLbl mkBadAlignmentLabel) + , J (TReg jumpReg) + , NEWBLOCK okayLblId + ] + +-- ----------------------------------------------------------------------------- +-- The 'Amode' type: 
Memory addressing modes passed up the tree. +data Amode = Amode AddrMode InstrBlock + +-- | Provide the value of a `CmmExpr` with an `Amode` +-- +-- N.B. this function should be used to provide operands to load and store +-- instructions with signed 12bit wide immediates (S & I types). For other +-- immediate sizes and formats (e.g. B type uses multiples of 2) this function +-- would need to be adjusted. +getAmode :: Platform + -> Width -- ^ width of loaded value + -> CmmExpr + -> NatM Amode +-- TODO: Specialize stuff we can destructure here. + +-- LDR/STR: Immediate can be represented with 12bits +getAmode platform w (CmmRegOff reg off) + | w <= W64, fitsIn12bitImm off + = return $ Amode (AddrRegImm reg' off') nilOL + where reg' = getRegisterReg platform reg + off' = ImmInt off + +-- For Stores we often see something like this: +-- CmmStore (CmmMachOp (MO_Add w) [CmmLoad expr, CmmLit (CmmInt n w')]) (expr2) +-- E.g. a CmmStoreOff really. This can be translated to `str $expr2, [$expr, #n ] +-- for `n` in range. +getAmode _platform _ (CmmMachOp (MO_Add _w) [expr, CmmLit (CmmInt off _w')]) + | fitsIn12bitImm off + = do (reg, _format, code) <- getSomeReg expr + return $ Amode (AddrRegImm reg (ImmInteger off)) code + +getAmode _platform _ (CmmMachOp (MO_Sub _w) [expr, CmmLit (CmmInt off _w')]) + | fitsIn12bitImm (-off) + = do (reg, _format, code) <- getSomeReg expr + return $ Amode (AddrRegImm reg (ImmInteger (-off))) code + +-- Generic case +getAmode _platform _ expr + = do (reg, _format, code) <- getSomeReg expr + return $ Amode (AddrReg reg) code + +-- ----------------------------------------------------------------------------- +-- Generating assignments + +-- Assignments are really at the heart of the whole code generation +-- business. Almost all top-level nodes of any real importance are +-- assignments, which correspond to loads, stores, or register +-- transfers. 
If we're really lucky, some of the register transfers +-- will go away, because we can use the destination register to +-- complete the code generation for the right hand side. This only +-- fails when the right hand side is forced into a fixed register +-- (e.g. the result of a call). + +assignMem_IntCode :: Format -> CmmExpr -> CmmExpr -> NatM InstrBlock +assignReg_IntCode :: Format -> CmmReg -> CmmExpr -> NatM InstrBlock + +assignMem_FltCode :: Format -> CmmExpr -> CmmExpr -> NatM InstrBlock +assignReg_FltCode :: Format -> CmmReg -> CmmExpr -> NatM InstrBlock + +assignMem_IntCode rep addrE srcE + = do + (src_reg, _format, code) <- getSomeReg srcE + platform <- getPlatform + let w = formatToWidth rep + Amode addr addr_code <- getAmode platform w addrE + return $ COMMENT (text "CmmStore" <+> parens (text (show addrE)) <+> parens (text (show srcE))) + `consOL` (code + `appOL` addr_code + `snocOL` STR rep (OpReg w src_reg) (OpAddr addr)) + +assignReg_IntCode _ reg src + = do + platform <- getPlatform + let dst = getRegisterReg platform reg + r <- getRegister src + return $ case r of + Any _ code -> COMMENT (text "CmmAssign" <+> parens (text (show reg)) <+> parens (text (show src))) `consOL` code dst + Fixed format freg fcode -> COMMENT (text "CmmAssign" <+> parens (text (show reg)) <+> parens (text (show src))) `consOL` (fcode `snocOL` MOV (OpReg (formatToWidth format) dst) (OpReg (formatToWidth format) freg)) + +-- Let's treat Floating point stuff +-- as integer code for now. Opaque. +assignMem_FltCode = assignMem_IntCode +assignReg_FltCode = assignReg_IntCode + +-- ----------------------------------------------------------------------------- +-- Jumps +-- AArch64 has 26bits for targets, whereas RiscV only has 20. +-- Thus we need to distinguish between far (outside of the) +-- current compilation unit. And regular branches. +-- RiscV has ±2MB of displacement, whereas AArch64 has ±128MB. 
+-- Thus for most branches we can get away with encoding it +-- directly in the instruction rather than always loading the +-- address into a register and then using that to jump. +-- Under the assumption that our linked build product is less than +-- ~2*128MB of TEXT, and there are no jump that span the whole +-- TEXT segment. +-- Something where riscv's compressed instruction might come in +-- handy. +genJump :: CmmExpr{-the branch target-} -> NatM InstrBlock +genJump expr = do + (target, _format, code) <- getSomeReg expr + return (code `appOL` unitOL (annExpr expr (J (TReg target)))) + +-- ----------------------------------------------------------------------------- +-- Unconditional branches +genBranch :: BlockId -> NatM InstrBlock +genBranch = return . toOL . mkJumpInstr + +-- ----------------------------------------------------------------------------- +-- Conditional branches +genCondJump + :: BlockId + -> CmmExpr + -> NatM InstrBlock +genCondJump bid expr = do + case expr of + -- Optimized == 0 case. + CmmMachOp (MO_Eq w) [x, CmmLit (CmmInt 0 _)] -> do + (reg_x, _format_x, code_x) <- getSomeReg x + return $ code_x `snocOL` annExpr expr (BCOND EQ zero (OpReg w reg_x) (TBlock bid)) + + -- Optimized /= 0 case. + CmmMachOp (MO_Ne w) [x, CmmLit (CmmInt 0 _)] -> do + (reg_x, _format_x, code_x) <- getSomeReg x + return $ code_x `snocOL` annExpr expr (BCOND NE zero (OpReg w reg_x) (TBlock bid)) + + -- Generic case. + CmmMachOp mop [x, y] -> do + + let ubcond w cmp = do + -- compute both sides. 
+ (reg_x, format_x, code_x) <- getSomeReg x + (reg_y, format_y, code_y) <- getSomeReg y + let x' = OpReg w reg_x + y' = OpReg w reg_y + return $ case w of + w | w == W8 || w == W16 -> code_x `appOL` + truncateReg (formatToWidth format_x) w reg_x `appOL` + code_y `appOL` + truncateReg (formatToWidth format_y) w reg_y `appOL` + code_y `snocOL` + annExpr expr (BCOND cmp x' y' (TBlock bid)) + _ -> code_x `appOL` code_y `snocOL` annExpr expr (BCOND cmp x' y' (TBlock bid)) + + sbcond w cmp = do + -- compute both sides. + (reg_x, format_x, code_x) <- getSomeReg x + (reg_y, format_y, code_y) <- getSomeReg y + let x' = OpReg w reg_x + y' = OpReg w reg_y + return $ case w of + w | w `elem` [W8, W16, W32] -> + code_x + `appOL` signExtend (formatToWidth format_x) W64 reg_x reg_x + `appOL` code_y + `appOL` signExtend (formatToWidth format_y) W64 reg_y reg_y + `appOL` unitOL (annExpr expr (BCOND cmp x' y' (TBlock bid))) + _ -> code_x `appOL` code_y `appOL` unitOL (annExpr expr (BCOND cmp x' y' (TBlock bid))) + + fbcond w cmp = do + -- ensure we get float regs + (reg_fx, _format_fx, code_fx) <- getFloatReg x + (reg_fy, _format_fy, code_fy) <- getFloatReg y + oneReg <- getNewRegNat II64 + return $ code_fx `appOL` + code_fy `snocOL` + annExpr expr (CSET ip (OpReg w reg_fx) (OpReg w reg_fy) cmp) `snocOL` + MOV (OpReg W64 oneReg) (OpImm (ImmInt 1)) `snocOL` + BCOND EQ ip (OpReg w oneReg) (TBlock bid) + + case mop of + MO_F_Eq w -> fbcond w EQ + MO_F_Ne w -> fbcond w NE + + MO_F_Gt w -> fbcond w FGT + MO_F_Ge w -> fbcond w FGE + MO_F_Lt w -> fbcond w FLT + MO_F_Le w -> fbcond w FLE + + MO_Eq w -> sbcond w EQ + MO_Ne w -> sbcond w NE + + MO_S_Gt w -> sbcond w SGT + MO_S_Ge w -> sbcond w SGE + MO_S_Lt w -> sbcond w SLT + MO_S_Le w -> sbcond w SLE + MO_U_Gt w -> ubcond w UGT + MO_U_Ge w -> ubcond w UGE + MO_U_Lt w -> ubcond w ULT + MO_U_Le w -> ubcond w ULE + _ -> pprPanic "RV64.genCondJump:case mop: " (text $ show expr) + _ -> pprPanic "RV64.genCondJump: " (text $ show expr) + + 
+genCondBranch + :: BlockId -- the source of the jump + -> BlockId -- the true branch target + -> BlockId -- the false branch target + -> CmmExpr -- the condition on which to branch + -> NatM InstrBlock -- Instructions + +genCondBranch _ true false expr = do + b1 <- genCondJump true expr + b2 <- genBranch false + return (b1 `appOL` b2) + +-- ----------------------------------------------------------------------------- +-- Generating C calls + +-- Now the biggest nightmare---calls. Most of the nastiness is buried in +-- @get_arg@, which moves the arguments to the correct registers/stack +-- locations. Apart from that, the code is easy. +-- +-- As per *convention*: +-- x0-x7: (volatile) argument registers +-- x8: (volatile) indirect result register / Linux syscall no +-- x9-x15: (volatile) caller saved regs +-- x16,x17: (volatile) intra-procedure-call registers +-- x18: (volatile) platform register. don't use for portability +-- x19-x28: (non-volatile) callee save regs +-- x29: (non-volatile) frame pointer +-- x30: link register +-- x31: stack pointer / zero reg +-- +-- Thus, this is what a c function will expect. Find the arguments in x0-x7, +-- anything above that on the stack. We'll ignore c functions with more than +-- 8 arguments for now. Sorry. +-- +-- We need to make sure we preserve x9-x15, don't want to touch x16, x17. + +-- Note [PLT vs GOT relocations] +-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +-- When linking objects together, we may need to lookup foreign references. That +-- is symbolic references to functions or values in other objects. When +-- compiling the object, we can not know where those elements will end up in +-- memory (relative to the current location). Thus the use of symbols. There +-- are two types of items we are interested, code segments we want to jump to +-- and continue execution there (functions, ...), and data items we want to look +-- up (strings, numbers, ...). 
For functions we can use the fact that we can use +-- an intermediate jump without visibility to the programs execution. If we +-- want to jump to a function that is simply too far away to reach for the B/BL +-- instruction, we can create a small piece of code that loads the full target +-- address and jumps to that on demand. Say f wants to call g, however g is out +-- of range for a direct jump, we can create a function h in range for f, that +-- will load the address of g, and jump there. The area where we construct h +-- is called the Procedure Linking Table (PLT), we have essentially replaced +-- f -> g with f -> h -> g. This is fine for function calls. However if we +-- want to lookup values, this trick doesn't work, so we need something else. +-- We will instead reserve a slot in memory, and have a symbol pointing to that +-- slot. Now what we essentially do is, we reference that slot, and expect that +-- slot to hold the final resting address of the data we are interested in. +-- Thus what that symbol really points to is the location of the final data. +-- The block of memory where we hold all those slots is the Global Offset Table +-- (GOT). Instead of x <- $foo, we now do y <- $fooPtr, and x <- [$y]. +-- +-- FIXME: Update for RISCV, the below is still AArch64. +-- For JUMP/CALLs we have 26bits (+/- 128MB), for conditional branches we only +-- have 19bits (+/- 1MB). Symbol lookups are also within +/- 1MB, thus for most +-- of the LOAD/STOREs we'd want to use adrp, and add to compute a value within +-- 4GB of the PC, and load that. For anything outside of that range, we'd have +-- to go through the GOT. +-- +-- adrp x0, <symbol> +-- add x0, :lo:<symbol> +-- +-- will compute the address of <symbol> int x0 if <symbol> is within 4GB of the +-- PC. 
+-- +-- If we want to get the slot in the global offset table (GOT), we can do this: +-- +-- adrp x0, #:got:<symbol> +-- ldr x0, [x0, #:got_lo12:<symbol>] +-- +-- this will compute the address anywhere in the addressable 64bit space into +-- x0, by loading the address from the GOT slot. +-- +-- To actually get the value of <symbol>, we'd need to ldr x0, x0 still, which +-- for the first case can be optimized to use ldr x0, [x0, #:lo12:<symbol>] +-- instead of the add instruction. +-- +-- As the memory model for AArch64 for PIC is considered to be +/- 4GB, we do +-- not need to go through the GOT, unless we want to address the full address +-- range within 64bit. + +genCCall + :: ForeignTarget -- function to call + -> [CmmFormal] -- where to put the result + -> [CmmActual] -- arguments (of mixed type) + -> BlockId -- The block we are in + -> NatM (InstrBlock, Maybe BlockId) +-- TODO: Specialize where we can. +-- Generic impl +genCCall target dest_regs arg_regs bid = do + -- we want to pass arg_regs into allArgRegs + -- pprTraceM "genCCall target" (ppr target) + -- pprTraceM "genCCall formal" (ppr dest_regs) + -- pprTraceM "genCCall actual" (ppr arg_regs) + + case target of + -- The target :: ForeignTarget call can either + -- be a foreign procedure with an address expr + -- and a calling convention. + ForeignTarget expr _cconv -> do + (call_target, call_target_code) <- case expr of + -- if this is a label, let's just directly to it. This will produce the + -- correct CALL relocation for BL... + -- While this works on aarch64, for _most_ labels, it will fall short + -- where label branching only works for shoter distances (e.g. riscv) + -- (CmmLit (CmmLabel lbl)) -> pure (TLabel lbl, nilOL) + -- ... if it's not a label--well--let's compute the expression into a + -- register and jump to that. See Note [PLT vs GOT relocations] + _ -> do (reg, _format, reg_code) <- getSomeReg expr + pure (TReg reg, reg_code) + -- compute the code and register logic for all arg_regs. 
+ -- this will give us the format information to match on. + arg_regs' <- mapM getSomeReg arg_regs + + -- Now this is stupid. Our Cmm expressions doesn't carry the proper sizes + -- so while in Cmm we might get W64 incorrectly for an int, that is W32 in + -- STG; this thenn breaks packing of stack arguments, if we need to pack + -- for the pcs, e.g. darwinpcs. Option one would be to fix the Int type + -- in Cmm proper. Option two, which we choose here is to use extended Hint + -- information to contain the size information and use that when packing + -- arguments, spilled onto the stack. + let (_res_hints, arg_hints) = foreignTargetHints target + arg_regs'' = zipWith (\(r, f, c) h -> (r,f,h,c)) arg_regs' arg_hints + + (stackSpace', passRegs, passArgumentsCode) <- passArguments allGpArgRegs allFpArgRegs arg_regs'' 0 [] nilOL + + -- if we pack the stack, we may need to adjust to multiple of 8byte. + -- if we don't pack the stack, it will always be multiple of 8. + let stackSpace = if stackSpace' `mod` 8 /= 0 + then 8 * (stackSpace' `div` 8 + 1) + else stackSpace' + + (returnRegs, readResultsCode) <- readResults allGpArgRegs allFpArgRegs dest_regs [] nilOL + + let moveStackDown 0 = toOL [ PUSH_STACK_FRAME + , DELTA (-16) ] + moveStackDown i | odd i = moveStackDown (i + 1) + moveStackDown i = toOL [ PUSH_STACK_FRAME + , SUB (OpReg W64 (regSingle 2)) (OpReg W64 (regSingle 2)) (OpImm (ImmInt (8 * i))) + , DELTA (-8 * i - 16) ] + moveStackUp 0 = toOL [ POP_STACK_FRAME + , DELTA 0 ] + moveStackUp i | odd i = moveStackUp (i + 1) + moveStackUp i = toOL [ ADD (OpReg W64 (regSingle 2)) (OpReg W64 (regSingle 2)) (OpImm (ImmInt (8 * i))) + , POP_STACK_FRAME + , DELTA 0 ] + + let code = call_target_code -- compute the label (possibly into a register) + `appOL` moveStackDown (stackSpace `div` 8) + `appOL` passArgumentsCode -- put the arguments into x0, ... + `snocOL` BL call_target passRegs returnRegs -- branch and link. 
+ `appOL` readResultsCode -- parse the results into registers + `appOL` moveStackUp (stackSpace `div` 8) + return (code, Nothing) + + PrimTarget MO_F32_Fabs + | [arg_reg] <- arg_regs, [dest_reg] <- dest_regs -> + unaryFloatOp W32 (\d x -> unitOL $ FABS d x) arg_reg dest_reg + PrimTarget MO_F64_Fabs + | [arg_reg] <- arg_regs, [dest_reg] <- dest_regs -> + unaryFloatOp W64 (\d x -> unitOL $ FABS d x) arg_reg dest_reg + + -- or a possibly side-effecting machine operation + -- mop :: CallishMachOp (see GHC.Cmm.MachOp) + PrimTarget mop -> do + -- We'll need config to construct forien targets + case mop of + -- 64 bit float ops + MO_F64_Pwr -> mkCCall "pow" + + MO_F64_Sin -> mkCCall "sin" + MO_F64_Cos -> mkCCall "cos" + MO_F64_Tan -> mkCCall "tan" + + MO_F64_Sinh -> mkCCall "sinh" + MO_F64_Cosh -> mkCCall "cosh" + MO_F64_Tanh -> mkCCall "tanh" + + MO_F64_Asin -> mkCCall "asin" + MO_F64_Acos -> mkCCall "acos" + MO_F64_Atan -> mkCCall "atan" + + MO_F64_Asinh -> mkCCall "asinh" + MO_F64_Acosh -> mkCCall "acosh" + MO_F64_Atanh -> mkCCall "atanh" + + MO_F64_Log -> mkCCall "log" + MO_F64_Log1P -> mkCCall "log1p" + MO_F64_Exp -> mkCCall "exp" + MO_F64_ExpM1 -> mkCCall "expm1" + MO_F64_Fabs -> mkCCall "fabs" + MO_F64_Sqrt -> mkCCall "sqrt" + + -- 32 bit float ops + MO_F32_Pwr -> mkCCall "powf" + + MO_F32_Sin -> mkCCall "sinf" + MO_F32_Cos -> mkCCall "cosf" + MO_F32_Tan -> mkCCall "tanf" + MO_F32_Sinh -> mkCCall "sinhf" + MO_F32_Cosh -> mkCCall "coshf" + MO_F32_Tanh -> mkCCall "tanhf" + MO_F32_Asin -> mkCCall "asinf" + MO_F32_Acos -> mkCCall "acosf" + MO_F32_Atan -> mkCCall "atanf" + MO_F32_Asinh -> mkCCall "asinhf" + MO_F32_Acosh -> mkCCall "acoshf" + MO_F32_Atanh -> mkCCall "atanhf" + MO_F32_Log -> mkCCall "logf" + MO_F32_Log1P -> mkCCall "log1pf" + MO_F32_Exp -> mkCCall "expf" + MO_F32_ExpM1 -> mkCCall "expm1f" + MO_F32_Fabs -> mkCCall "fabsf" + MO_F32_Sqrt -> mkCCall "sqrtf" + + -- 64-bit primops + MO_I64_ToI -> mkCCall "hs_int64ToInt" + MO_I64_FromI -> mkCCall "hs_intToInt64" 
+ MO_W64_ToW -> mkCCall "hs_word64ToWord" + MO_W64_FromW -> mkCCall "hs_wordToWord64" + MO_x64_Neg -> mkCCall "hs_neg64" + MO_x64_Add -> mkCCall "hs_add64" + MO_x64_Sub -> mkCCall "hs_sub64" + MO_x64_Mul -> mkCCall "hs_mul64" + MO_I64_Quot -> mkCCall "hs_quotInt64" + MO_I64_Rem -> mkCCall "hs_remInt64" + MO_W64_Quot -> mkCCall "hs_quotWord64" + MO_W64_Rem -> mkCCall "hs_remWord64" + MO_x64_And -> mkCCall "hs_and64" + MO_x64_Or -> mkCCall "hs_or64" + MO_x64_Xor -> mkCCall "hs_xor64" + MO_x64_Not -> mkCCall "hs_not64" + MO_x64_Shl -> mkCCall "hs_uncheckedShiftL64" + MO_I64_Shr -> mkCCall "hs_uncheckedIShiftRA64" + MO_W64_Shr -> mkCCall "hs_uncheckedShiftRL64" + MO_x64_Eq -> mkCCall "hs_eq64" + MO_x64_Ne -> mkCCall "hs_ne64" + MO_I64_Ge -> mkCCall "hs_geInt64" + MO_I64_Gt -> mkCCall "hs_gtInt64" + MO_I64_Le -> mkCCall "hs_leInt64" + MO_I64_Lt -> mkCCall "hs_ltInt64" + MO_W64_Ge -> mkCCall "hs_geWord64" + MO_W64_Gt -> mkCCall "hs_gtWord64" + MO_W64_Le -> mkCCall "hs_leWord64" + MO_W64_Lt -> mkCCall "hs_ltWord64" + + -- Conversion + MO_UF_Conv w -> mkCCall (word2FloatLabel w) + + -- Optional MachOps + -- These are enabled/disabled by backend flags: GHC.StgToCmm.Config + MO_S_Mul2 _w -> unsupported mop + MO_S_QuotRem _w -> unsupported mop + MO_U_QuotRem _w -> unsupported mop + MO_U_QuotRem2 _w -> unsupported mop + MO_Add2 _w -> unsupported mop + MO_AddWordC _w -> unsupported mop + MO_SubWordC _w -> unsupported mop + MO_AddIntC _w -> unsupported mop + MO_SubIntC _w -> unsupported mop + MO_U_Mul2 _w -> unsupported mop + + -- Memory Ordering + -- The concrete encoding is copied from load_load_barrier() and write_barrier() (SMP.h) + -- TODO: This needs to be changed for https://gitlab.haskell.org/ghc/ghc/-/merge_requests/10628 + -- The related C functions are: + -- atomic_thread_fence(memory_order_acquire); + -- atomic_thread_fence(memory_order_release); + MO_ReadBarrier -> return (unitOL (DMBSY DmbRead DmbRead), Nothing) + MO_WriteBarrier -> return (unitOL (DMBSY DmbWrite 
DmbWrite), Nothing) + MO_Touch -> return (nilOL, Nothing) -- Keep variables live (when using interior pointers) + -- Prefetch + MO_Prefetch_Data _n -> return (nilOL, Nothing) -- Prefetch hint. + + -- Memory copy/set/move/cmp, with alignment for optimization + + -- TODO Optimize and use e.g. quad registers to move memory around instead + -- of offloading this to memcpy. For small memcpys we can utilize + -- the 128bit quad registers in NEON to move block of bytes around. + -- Might also make sense of small memsets? Use xzr? What's the function + -- call overhead? + MO_Memcpy _align -> mkCCall "memcpy" + MO_Memset _align -> mkCCall "memset" + MO_Memmove _align -> mkCCall "memmove" + MO_Memcmp _align -> mkCCall "memcmp" + + MO_SuspendThread -> mkCCall "suspendThread" + MO_ResumeThread -> mkCCall "resumeThread" + + MO_PopCnt w -> mkCCall (popCntLabel w) + MO_Pdep w -> mkCCall (pdepLabel w) + MO_Pext w -> mkCCall (pextLabel w) + MO_Clz w -> mkCCall (clzLabel w) + MO_Ctz w -> mkCCall (ctzLabel w) + MO_BSwap w -> mkCCall (bSwapLabel w) + MO_BRev w -> mkCCall (bRevLabel w) + + -- Atomic read-modify-write. + mo@(MO_AtomicRead w ord) + | [p_reg] <- arg_regs + , [dst_reg] <- dest_regs -> do + (p, _fmt_p, code_p) <- getSomeReg p_reg + platform <- getPlatform + -- See __atomic_load_n (in C) + let instrs = case ord of + MemOrderRelaxed -> unitOL $ ann moDescr (LDR (intFormat w) (OpReg w dst) (OpAddr $ AddrReg p)) + MemOrderAcquire -> toOL [ + ann moDescr (LDR (intFormat w) (OpReg w dst) (OpAddr $ AddrReg p)), + DMBSY DmbRead DmbReadWrite + ] + MemOrderSeqCst -> toOL [ + ann moDescr (DMBSY DmbReadWrite DmbReadWrite), + LDR (intFormat w) (OpReg w dst) (OpAddr $ AddrReg p), + DMBSY DmbRead DmbReadWrite + ] + MemOrderRelease -> panic $ "Unexpected MemOrderRelease on an AtomicRead: " ++ show mo + dst = getRegisterReg platform (CmmLocal dst_reg) + moDescr = (text . 
show) mo + code = + code_p `appOL` + instrs + return (code, Nothing) + | otherwise -> panic "mal-formed AtomicRead" + mo@(MO_AtomicWrite w ord) + | [p_reg, val_reg] <- arg_regs -> do + (p, _fmt_p, code_p) <- getSomeReg p_reg + (val, fmt_val, code_val) <- getSomeReg val_reg + -- See __atomic_store_n (in C) + let instrs = case ord of + MemOrderRelaxed -> unitOL $ ann moDescr (STR fmt_val (OpReg w val) (OpAddr $ AddrReg p)) + MemOrderSeqCst -> toOL [ + ann moDescr (DMBSY DmbReadWrite DmbWrite), + STR fmt_val (OpReg w val) (OpAddr $ AddrReg p) + ] + MemOrderRelease -> toOL [ + ann moDescr (DMBSY DmbReadWrite DmbWrite), + STR fmt_val (OpReg w val) (OpAddr $ AddrReg p) + ] + MemOrderAcquire -> panic $ "Unexpected MemOrderAcquire on an AtomicWrite" ++ show mo + moDescr = (text . show) mo + code = + code_p `appOL` + code_val `appOL` + instrs + return (code, Nothing) + | otherwise -> panic "mal-formed AtomicWrite" + MO_AtomicRMW w amop -> mkCCall (atomicRMWLabel w amop) + MO_Cmpxchg w -> mkCCall (cmpxchgLabel w) + -- -- Should be an AtomicRMW variant eventually. + -- -- Sequential consistent. + -- TODO: this should be implemented properly! + MO_Xchg w -> mkCCall (xchgLabel w) + + where + unsupported :: Show a => a -> b + unsupported mop = panic ("outOfLineCmmOp: " ++ show mop + ++ " not supported here") + mkCCall :: FastString -> NatM (InstrBlock, Maybe BlockId) + mkCCall name = do + config <- getConfig + target <- cmmMakeDynamicReference config CallReference $ + mkForeignLabel name Nothing ForeignLabelInThisPackage IsFunction + let cconv = ForeignConvention CCallConv [NoHint] [NoHint] CmmMayReturn + genCCall (ForeignTarget target cconv) dest_regs arg_regs bid + + -- Implementiation of the RISCV ABI calling convention. 
+ -- https://github.com/riscv-non-isa/riscv-elf-psabi-doc/blob/948463cd5dbebea7c1869e20146b17a2cc8fda2f/riscv-cc.adoc#integer-calling-convention + passArguments :: [Reg] -> [Reg] -> [(Reg, Format, ForeignHint, InstrBlock)] -> Int -> [Reg] -> InstrBlock -> NatM (Int, [Reg], InstrBlock) + -- Base case: no more arguments to pass (left) + passArguments _ _ [] stackSpace accumRegs accumCode = return (stackSpace, accumRegs, accumCode) + + -- Still have GP regs, and we want to pass an GP argument. + passArguments (gpReg:gpRegs) fpRegs ((r, format, hint, code_r):args) stackSpace accumRegs accumCode | isIntFormat format = do + -- RISCV64 Integer Calling Convention: "When passed in registers or on the + -- stack, integer scalars narrower than XLEN bits are widened according to + -- the sign of their type up to 32 bits, then sign-extended to XLEN bits." + let w = formatToWidth format + assignArg = if hint == SignedHint then + COMMENT (text "Pass gp argument sign-extended (SignedHint): " <> ppr r) `consOL` + signExtend w W64 r gpReg + + else toOL [COMMENT (text "Pass gp argument sign-extended (SignedHint): " <> ppr r) + , MOV (OpReg w gpReg) (OpReg w r)] + accumCode' = accumCode `appOL` + code_r `appOL` + assignArg + passArguments gpRegs fpRegs args stackSpace (gpReg:accumRegs) accumCode' + + -- Still have FP regs, and we want to pass an FP argument. + passArguments gpRegs (fpReg:fpRegs) ((r, format, _hint, code_r):args) stackSpace accumRegs accumCode | isFloatFormat format = do + let w = formatToWidth format + mov = MOV (OpReg w fpReg) (OpReg w r) + accumCode' = accumCode `appOL` + code_r `snocOL` + ann (text "Pass fp argument: " <> ppr r) mov + passArguments gpRegs fpRegs args stackSpace (fpReg:accumRegs) accumCode' + + -- No mor regs left to pass. Must pass on stack. 
+ passArguments [] [] ((r, format, hint, code_r) : args) stackSpace accumRegs accumCode = do + let w = formatToWidth format + space = 8 + str = STR format (OpReg w r) (OpAddr (AddrRegImm (regSingle 2) (ImmInt stackSpace))) + stackCode = + if hint == SignedHint + then + code_r + `appOL` signExtend w W64 r ipReg + `snocOL` ann (text "Pass signed argument (size " <> ppr w <> text ") on the stack: " <> ppr ipReg) str + else + code_r + `snocOL` ann (text "Pass unsigned argument (size " <> ppr w <> text ") on the stack: " <> ppr r) str + passArguments [] [] args (stackSpace + space) accumRegs (stackCode `appOL` accumCode) + + -- Still have fpRegs left, but want to pass a GP argument. Must be passed on the stack then. + passArguments [] fpRegs ((r, format, _hint, code_r):args) stackSpace accumRegs accumCode | isIntFormat format = do + let w = formatToWidth format + space = 8 + str = STR format (OpReg w r) (OpAddr (AddrRegImm (regSingle 2) (ImmInt stackSpace))) + stackCode = code_r `snocOL` + ann (text "Pass argument (size " <> ppr w <> text ") on the stack: " <> ppr r) str + passArguments [] fpRegs args (stackSpace+space) accumRegs (stackCode `appOL` accumCode) + + -- Still have gpRegs left, but want to pass a FP argument. Must be passed in gpReg then. 
+ passArguments (gpReg:gpRegs) [] ((r, format, _hint, code_r):args) stackSpace accumRegs accumCode | isFloatFormat format = do + let w = formatToWidth format + mov = MOV (OpReg w gpReg) (OpReg w r) + accumCode' = accumCode `appOL` + code_r `snocOL` + ann (text "Pass fp argument in gpReg: " <> ppr r) mov + passArguments gpRegs [] args stackSpace (gpReg:accumRegs) accumCode' + + passArguments _ _ _ _ _ _ = pprPanic "passArguments" (text "invalid state") + + readResults :: [Reg] -> [Reg] -> [LocalReg] -> [Reg]-> InstrBlock -> NatM ([Reg], InstrBlock) + readResults _ _ [] accumRegs accumCode = return (accumRegs, accumCode) + readResults [] _ _ _ _ = do + platform <- getPlatform + pprPanic "genCCall, out of gp registers when reading results" (pdoc platform target) + readResults _ [] _ _ _ = do + platform <- getPlatform + pprPanic "genCCall, out of fp registers when reading results" (pdoc platform target) + readResults (gpReg:gpRegs) (fpReg:fpRegs) (dst:dsts) accumRegs accumCode = do + -- gp/fp reg -> dst + platform <- getPlatform + let rep = cmmRegType (CmmLocal dst) + format = cmmTypeFormat rep + w = cmmRegWidth (CmmLocal dst) + r_dst = getRegisterReg platform (CmmLocal dst) + if isFloatFormat format + then readResults (gpReg:gpRegs) fpRegs dsts (fpReg:accumRegs) (accumCode `snocOL` MOV (OpReg w r_dst) (OpReg w fpReg)) + else readResults gpRegs (fpReg:fpRegs) dsts (gpReg:accumRegs) $ + accumCode `snocOL` + MOV (OpReg w r_dst) (OpReg w gpReg) `appOL` + -- truncate, otherwise an unexpectedly big value might be used in upfollowing calculations + truncateReg W64 w r_dst + + unaryFloatOp w op arg_reg dest_reg = do + platform <- getPlatform + (reg_fx, _format_x, code_fx) <- getFloatReg arg_reg + let dst = getRegisterReg platform (CmmLocal dest_reg) + let code = code_fx `appOL` op (OpReg w dst) (OpReg w reg_fx) + return (code, Nothing) Index: ghc-9.8.2/compiler/GHC/CmmToAsm/RV64/Cond.hs =================================================================== --- /dev/null +++ 
ghc-9.8.2/compiler/GHC/CmmToAsm/RV64/Cond.hs @@ -0,0 +1,39 @@ +module GHC.CmmToAsm.RV64.Cond where + +import GHC.Prelude + +-- | Condition codes. +-- +-- Used in conditional branches and bit setters. According to the available +-- instruction set, some conditions are encoded as their negated opposites. I.e. +-- these are logical things that don't necessarily map 1:1 to hardware/ISA. +data Cond + = -- | int and float + EQ + | -- | int and float + NE + | -- | signed less than + SLT + | -- | signed less than or equal + SLE + | -- | signed greater than or equal + SGE + | -- | signed greater than + SGT + | -- | unsigned less than + ULT + | -- | unsigned less than or equal + ULE + | -- | unsigned greater than or equal + UGE + | -- | unsigned greater than + UGT + | -- | floating point instruction @flt@ + FLT + | -- | floating point instruction @fle@ + FLE + | -- | floating point instruction @fge@ + FGE + | -- | floating point instruction @fgt@ + FGT + deriving (Eq, Show) Index: ghc-9.8.2/compiler/GHC/CmmToAsm/RV64/Instr.hs =================================================================== --- /dev/null +++ ghc-9.8.2/compiler/GHC/CmmToAsm/RV64/Instr.hs @@ -0,0 +1,798 @@ +{-# OPTIONS_GHC -fno-warn-orphans #-} + +module GHC.CmmToAsm.RV64.Instr + +where + +import GHC.Prelude + +import GHC.CmmToAsm.RV64.Cond +import GHC.CmmToAsm.RV64.Regs + +import GHC.CmmToAsm.Instr (RegUsage(..)) +import GHC.CmmToAsm.Format +import GHC.CmmToAsm.Types +import GHC.CmmToAsm.Utils +import GHC.CmmToAsm.Config +import GHC.Platform.Reg + +import GHC.Platform.Regs +import GHC.Cmm.BlockId +import GHC.Cmm.Dataflow.Collections +import GHC.Cmm.Dataflow.Label +import GHC.Cmm +import GHC.Cmm.CLabel +import GHC.Utils.Outputable +import GHC.Platform +import GHC.Types.Unique.Supply + +import GHC.Utils.Panic + +import Data.Maybe + +import GHC.Stack + +-- | Stack frame header size in bytes. +-- +-- The stack frame header is made of the values that are always saved +-- (regardless of the context.) 
It consists of the saved return address and a +-- pointer to the previous frame. Thus, its size is two stack frame slots which +-- equals two addresses/words (2 * 8 byte). +stackFrameHeaderSize :: Int +stackFrameHeaderSize = 2 * spillSlotSize + +-- | All registers are 8 byte wide. +spillSlotSize :: Int +spillSlotSize = 8 + +-- | The number of bytes that the stack pointer should be aligned +-- to. +stackAlign :: Int +stackAlign = 16 + +-- | The number of spill slots available without allocating more. +maxSpillSlots :: NCGConfig -> Int +maxSpillSlots config + = ((ncgSpillPreallocSize config - stackFrameHeaderSize) + `div` spillSlotSize) - 1 + +-- | Convert a spill slot number to a *byte* offset, with no sign. +spillSlotToOffset :: Int -> Int +spillSlotToOffset slot + = stackFrameHeaderSize + spillSlotSize * slot + +-- | Get the registers that are being used by this instruction. +-- regUsage doesn't need to do any trickery for jumps and such. +-- Just state precisely the regs read and written by that insn. +-- The consequences of control flow transfers, as far as register +-- allocation goes, are taken care of by the register allocator. +-- +-- RegUsage = RU [<read regs>] [<write regs>] + +instance Outputable RegUsage where + ppr (RU reads writes) = text "RegUsage(reads:" <+> ppr reads <> comma <+> text "writes:" <+> ppr writes <> char ')' + +regUsageOfInstr :: Platform -> Instr -> RegUsage +regUsageOfInstr platform instr = case instr of + ANN _ i -> regUsageOfInstr platform i + COMMENT{} -> usage ([], []) + MULTILINE_COMMENT{} -> usage ([], []) + PUSH_STACK_FRAME -> usage ([], []) + POP_STACK_FRAME -> usage ([], []) + DELTA{} -> usage ([], []) + + -- 1. 
Arithmetic Instructions ------------------------------------------------ + ADD dst src1 src2 -> usage (regOp src1 ++ regOp src2, regOp dst) + MUL dst src1 src2 -> usage (regOp src1 ++ regOp src2, regOp dst) + NEG dst src -> usage (regOp src, regOp dst) + SMULH dst src1 src2 -> usage (regOp src1 ++ regOp src2, regOp dst) + DIV dst src1 src2 -> usage (regOp src1 ++ regOp src2, regOp dst) + REM dst src1 src2 -> usage (regOp src1 ++ regOp src2, regOp dst) + REMU dst src1 src2 -> usage (regOp src1 ++ regOp src2, regOp dst) + SUB dst src1 src2 -> usage (regOp src1 ++ regOp src2, regOp dst) + DIVU dst src1 src2 -> usage (regOp src1 ++ regOp src2, regOp dst) + + -- 2. Bit Manipulation Instructions ------------------------------------------ + -- 3. Logical and Move Instructions ------------------------------------------ + AND dst src1 src2 -> usage (regOp src1 ++ regOp src2, regOp dst) + OR dst src1 src2 -> usage (regOp src1 ++ regOp src2, regOp dst) + ASR dst src1 src2 -> usage (regOp src1 ++ regOp src2, regOp dst) + XOR dst src1 src2 -> usage (regOp src1 ++ regOp src2, regOp dst) + LSL dst src1 src2 -> usage (regOp src1 ++ regOp src2, regOp dst) + LSR dst src1 src2 -> usage (regOp src1 ++ regOp src2, regOp dst) + MOV dst src -> usage (regOp src, regOp dst) + -- ORI's third operand is always an immediate + ORI dst src1 _ -> usage (regOp src1, regOp dst) + XORI dst src1 _ -> usage (regOp src1, regOp dst) + -- 4. Branch Instructions ---------------------------------------------------- + J t -> usage (regTarget t, []) + J_TBL _ _ t -> usage ([t], []) + B t -> usage (regTarget t, []) + BCOND _ l r t -> usage (regTarget t ++ regOp l ++ regOp r, []) + BL t ps _rs -> usage (regTarget t ++ ps, callerSavedRegisters) + + -- 5. Atomic Instructions ---------------------------------------------------- + -- 6. Conditional Instructions ----------------------------------------------- + CSET dst l r _ -> usage (regOp l ++ regOp r, regOp dst) + -- 7. 
Load and Store Instructions -------------------------------------------- + STR _ src dst -> usage (regOp src ++ regOp dst, []) + -- STLR _ src dst L -> usage (regOp src ++ regOp dst, []) + LDR _ dst src -> usage (regOp src, regOp dst) + LDRU _ dst src -> usage (regOp src, regOp dst) + + -- 8. Synchronization Instructions ------------------------------------------- + DMBSY _ _ -> usage ([], []) + + -- 9. Floating Point Instructions -------------------------------------------- + FCVT dst src -> usage (regOp src, regOp dst) + SCVTF dst src -> usage (regOp src, regOp dst) + FCVTZS dst src -> usage (regOp src, regOp dst) + FABS dst src -> usage (regOp src, regOp dst) + + _ -> panic $ "regUsageOfInstr: " ++ instrCon instr + + where + -- filtering the usage is necessary, otherwise the register + -- allocator will try to allocate pre-defined fixed stg + -- registers as well, as they show up. + usage (src, dst) = RU (filter (interesting platform) src) + (filter (interesting platform) dst) + + regAddr :: AddrMode -> [Reg] + regAddr (AddrRegImm r1 _) = [r1] + regAddr (AddrReg r1) = [r1] + regOp :: Operand -> [Reg] + regOp (OpReg _ r1) = [r1] + regOp (OpAddr a) = regAddr a + regOp (OpImm _) = [] + regTarget :: Target -> [Reg] + regTarget (TBlock _) = [] + regTarget (TReg r1) = [r1] + + -- Is this register interesting for the register allocator? + interesting :: Platform -> Reg -> Bool + interesting _ (RegVirtual _) = True + interesting _ (RegReal (RealRegSingle (-1))) = False + interesting platform (RegReal (RealRegSingle i)) = freeReg platform i + +-- Save caller save registers +-- This is x0-x18 +-- +-- For SIMD/FP Registers: +-- Registers v8-v15 must be preserved by a callee across subroutine calls; +-- the remaining registers (v0-v7, v16-v31) do not need to be preserved (or +-- should be preserved by the caller). 
Additionally, only the bottom 64 bits +-- of each value stored in v8-v15 need to be preserved [7]; it is the +-- responsibility of the caller to preserve larger values. +-- +-- .---------------------------------------------------------------------------------------------------------------------------------------------------------------. +-- | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | +-- | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 42 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | +-- |== General Purpose registers ==================================================================================================================================| +-- | ZR | RA | SP | GP | TP | <- tmp r. -> | FP | <- | <---- argument passing -------------> | -- callee saved ------------------------------> | <--- tmp regs --> | +-- | -- | -- | -- | -- | -- | <- free r. 
> | -- | BR | <---- free registers ---------------> | SP | HP | R1 | R2 | R3 | R4 | R5 | R6 | R7 | SL | <-- free regs --> | +-- |== SIMD/FP Registers ==========================================================================================================================================| +-- | <--- temporary registers -----------> | <------ | <---- argument passing -------------> | -- callee saved ------------------------------> | <--- tmp regs --> | +-- | <---------- free registers ---------> | F1 | F2 | <---- free registers ---------------> | F3 | F4 | F5 | F6 | D1 | D2 | D3 | D4 | D5 | D6 | -- | -- | -- | -- | +-- '---------------------------------------------------------------------------------------------------------------------------------------------------------------' +-- ZR: Zero, RA: Return Address, SP: Stack Pointer, GP: Global Pointer, TP: Thread Pointer, FP: Frame Pointer +-- BR: Base, SL: SpLim +callerSavedRegisters :: [Reg] +callerSavedRegisters = + map regSingle [t0RegNo .. t2RegNo] + ++ map regSingle [a0RegNo .. a7RegNo] + ++ map regSingle [t3RegNo .. t6RegNo] + ++ map regSingle [ft0RegNo .. ft7RegNo] + ++ map regSingle [fa0RegNo .. fa7RegNo] + +-- | Apply a given mapping to all the register references in this +-- instruction. +patchRegsOfInstr :: Instr -> (Reg -> Reg) -> Instr +patchRegsOfInstr instr env = case instr of + -- 0. Meta Instructions + ANN d i -> ANN d (patchRegsOfInstr i env) + COMMENT{} -> instr + MULTILINE_COMMENT{} -> instr + PUSH_STACK_FRAME -> instr + POP_STACK_FRAME -> instr + DELTA{} -> instr + -- 1. 
Arithmetic Instructions ---------------------------------------------- + ADD o1 o2 o3 -> ADD (patchOp o1) (patchOp o2) (patchOp o3) + MUL o1 o2 o3 -> MUL (patchOp o1) (patchOp o2) (patchOp o3) + NEG o1 o2 -> NEG (patchOp o1) (patchOp o2) + SMULH o1 o2 o3 -> SMULH (patchOp o1) (patchOp o2) (patchOp o3) + DIV o1 o2 o3 -> DIV (patchOp o1) (patchOp o2) (patchOp o3) + REM o1 o2 o3 -> REM (patchOp o1) (patchOp o2) (patchOp o3) + REMU o1 o2 o3 -> REMU (patchOp o1) (patchOp o2) (patchOp o3) + SUB o1 o2 o3 -> SUB (patchOp o1) (patchOp o2) (patchOp o3) + DIVU o1 o2 o3 -> DIVU (patchOp o1) (patchOp o2) (patchOp o3) + + -- 2. Bit Manipulation Instructions ---------------------------------------- + + -- 3. Logical and Move Instructions ---------------------------------------- + AND o1 o2 o3 -> AND (patchOp o1) (patchOp o2) (patchOp o3) + OR o1 o2 o3 -> OR (patchOp o1) (patchOp o2) (patchOp o3) + ASR o1 o2 o3 -> ASR (patchOp o1) (patchOp o2) (patchOp o3) + XOR o1 o2 o3 -> XOR (patchOp o1) (patchOp o2) (patchOp o3) + LSL o1 o2 o3 -> LSL (patchOp o1) (patchOp o2) (patchOp o3) + LSR o1 o2 o3 -> LSR (patchOp o1) (patchOp o2) (patchOp o3) + MOV o1 o2 -> MOV (patchOp o1) (patchOp o2) + -- o3 cannot be a register for ORI (always an immediate) + ORI o1 o2 o3 -> ORI (patchOp o1) (patchOp o2) (patchOp o3) + XORI o1 o2 o3 -> XORI (patchOp o1) (patchOp o2) (patchOp o3) + + -- 4. Branch Instructions -------------------------------------------------- + J t -> J (patchTarget t) + J_TBL ids mbLbl t -> J_TBL ids mbLbl (env t) + B t -> B (patchTarget t) + BL t rs ts -> BL (patchTarget t) rs ts + BCOND c o1 o2 t -> BCOND c (patchOp o1) (patchOp o2) (patchTarget t) + + -- 5. Atomic Instructions -------------------------------------------------- + -- 6. Conditional Instructions --------------------------------------------- + CSET o l r c -> CSET (patchOp o) (patchOp l) (patchOp r) c + -- 7. 
Load and Store Instructions ------------------------------------------ + STR f o1 o2 -> STR f (patchOp o1) (patchOp o2) + -- STLR f o1 o2 -> STLR f (patchOp o1) (patchOp o2) + LDR f o1 o2 -> LDR f (patchOp o1) (patchOp o2) + LDRU f o1 o2 -> LDRU f (patchOp o1) (patchOp o2) + + -- 8. Synchronization Instructions ----------------------------------------- + DMBSY o1 o2 -> DMBSY o1 o2 + + -- 9. Floating Point Instructions ------------------------------------------ + FCVT o1 o2 -> FCVT (patchOp o1) (patchOp o2) + SCVTF o1 o2 -> SCVTF (patchOp o1) (patchOp o2) + FCVTZS o1 o2 -> FCVTZS (patchOp o1) (patchOp o2) + FABS o1 o2 -> FABS (patchOp o1) (patchOp o2) + _ -> panic $ "patchRegsOfInstr: " ++ instrCon instr + where + patchOp :: Operand -> Operand + patchOp (OpReg w r) = OpReg w (env r) + patchOp (OpAddr a) = OpAddr (patchAddr a) + patchOp op = op + patchTarget :: Target -> Target + patchTarget (TReg r) = TReg (env r) + patchTarget t = t + patchAddr :: AddrMode -> AddrMode + patchAddr (AddrRegImm r1 i) = AddrRegImm (env r1) i + patchAddr (AddrReg r) = AddrReg (env r) +-------------------------------------------------------------------------------- + +-- | Checks whether this instruction is a jump/branch instruction. +-- +-- One that can change the flow of control in a way that the +-- register allocator needs to worry about. 
+isJumpishInstr :: Instr -> Bool +isJumpishInstr instr = case instr of + ANN _ i -> isJumpishInstr i + J {} -> True + J_TBL {} -> True + B {} -> True + BL {} -> True + BCOND {} -> True + _ -> False + +-- | Get the `BlockId`s of the jump destinations (if any) +jumpDestsOfInstr :: Instr -> [BlockId] +jumpDestsOfInstr (ANN _ i) = jumpDestsOfInstr i +jumpDestsOfInstr (J t) = [id | TBlock id <- [t]] +jumpDestsOfInstr (J_TBL ids _mbLbl _r) = catMaybes ids +jumpDestsOfInstr (B t) = [id | TBlock id <- [t]] +jumpDestsOfInstr (BL t _ _) = [id | TBlock id <- [t]] +jumpDestsOfInstr (BCOND _ _ _ t) = [id | TBlock id <- [t]] +jumpDestsOfInstr _ = [] + +-- | Change the destination of this (potential) jump instruction. +-- +-- Used in the linear allocator when adding fixup blocks for join +-- points. +patchJumpInstr :: Instr -> (BlockId -> BlockId) -> Instr +patchJumpInstr instr patchF = + case instr of + ANN d i -> ANN d (patchJumpInstr i patchF) + J (TBlock bid) -> J (TBlock (patchF bid)) + J_TBL ids mbLbl r -> J_TBL (map (fmap patchF) ids) mbLbl r + B (TBlock bid) -> B (TBlock (patchF bid)) + BL (TBlock bid) ps rs -> BL (TBlock (patchF bid)) ps rs + BCOND c o1 o2 (TBlock bid) -> BCOND c o1 o2 (TBlock (patchF bid)) + _ -> panic $ "patchJumpInstr: " ++ instrCon instr + +-- ----------------------------------------------------------------------------- +-- Note [Spills and Reloads] +-- ~~~~~~~~~~~~~~~~~~~~~~~~~ +-- We reserve @RESERVED_C_STACK_BYTES@ on the C stack for spilling and reloading +-- registers. AArch64s maximum displacement for SP relative spills and reloads +-- is essentially [-256,255], or [0, 0xFFF]*8 = [0, 32760] for 64bits. +-- +-- The @RESERVED_C_STACK_BYTES@ is 16k, so we can't address any location in a +-- single instruction. The idea is to use the Inter Procedure 0 (ip) register +-- to perform the computations for larger offsets. 
+-- +-- Using sp to compute the offset will violate assumptions about the stack pointer +-- pointing to the top of the stack during signal handling. As we can't force +-- every signal to use its own stack, we have to ensure that the stack pointer +-- always points to the top of the stack, and we can't use it for computation. +-- +-- | An instruction to spill a register into a spill slot. +mkSpillInstr :: + HasCallStack => + NCGConfig -> + Reg -> -- ^ register to spill + Int -> -- ^ current stack delta + Int -> -- ^ spill slot to use + [Instr] +mkSpillInstr _config reg delta slot = + case off - delta of + imm | fitsIn12bitImm imm -> [mkStrSpImm imm] + imm -> + [ movImmToIp imm, + addSpToIp, + mkStrIp + ] + where + fmt = case reg of + RegReal (RealRegSingle n) | n < d0RegNo -> II64 + _ -> FF64 + mkStrSpImm imm = ANN (text "Spill@" <> int (off - delta)) $ STR fmt (OpReg W64 reg) (OpAddr (AddrRegImm spMachReg (ImmInt imm))) + movImmToIp imm = ANN (text "Spill: IP <- " <> int imm) $ MOV ip (OpImm (ImmInt imm)) + addSpToIp = ANN (text "Spill: IP <- SP + IP ") $ ADD ip ip sp + mkStrIp = ANN (text "Spill@" <> int (off - delta)) $ STR fmt (OpReg W64 reg) (OpAddr (AddrReg ipReg)) + + off = spillSlotToOffset slot + +mkLoadInstr + :: NCGConfig + -> Reg -- ^ register to load + -> Int -- ^ current stack delta + -> Int -- ^ spill slot to use + -> [Instr] + +mkLoadInstr _config reg delta slot = + case off - delta of + imm | fitsIn12bitImm imm -> [mkLdrSpImm imm] + imm -> + [ movImmToIp imm, + addSpToIp, + mkLdrIp + ] + where + fmt = case reg of + RegReal (RealRegSingle n) | n < d0RegNo -> II64 + _ -> FF64 + mkLdrSpImm imm = ANN (text "Reload@" <> int (off - delta)) $ LDR fmt (OpReg W64 reg) (OpAddr (AddrRegImm spMachReg (ImmInt imm))) + movImmToIp imm = ANN (text "Reload: IP <- " <> int imm) $ MOV ip (OpImm (ImmInt imm)) + addSpToIp = ANN (text "Reload: IP <- SP + IP ") $ ADD ip ip sp + mkLdrIp = ANN (text "Reload@" <> int (off - delta)) $ LDR fmt (OpReg W64 reg) (OpAddr (AddrReg 
ipReg)) + + off = spillSlotToOffset slot + +-- | See if this instruction is telling us the current C stack delta +takeDeltaInstr :: Instr -> Maybe Int +takeDeltaInstr (ANN _ i) = takeDeltaInstr i +takeDeltaInstr (DELTA i) = Just i +takeDeltaInstr _ = Nothing + +-- | Not real instructions. Just meta data +isMetaInstr :: Instr -> Bool +isMetaInstr instr = + case instr of + ANN _ i -> isMetaInstr i + COMMENT {} -> True + MULTILINE_COMMENT {} -> True + LOCATION {} -> True + LDATA {} -> True + NEWBLOCK {} -> True + DELTA {} -> True + PUSH_STACK_FRAME -> True + POP_STACK_FRAME -> True + _ -> False + +-- | Copy the value in a register to another one. +-- +-- Must work for all register classes. +mkRegRegMoveInstr :: Reg -> Reg -> Instr +mkRegRegMoveInstr src dst = ANN desc instr + where + desc = text "Reg->Reg Move: " <> ppr src <> text " -> " <> ppr dst + instr = MOV (operandFromReg dst) (operandFromReg src) + +-- | Take the source and destination from this (potential) reg -> reg move instruction +-- +-- We have to be a bit careful here: A `MOV` can also mean an implicit +-- conversion. This case is filtered out. +takeRegRegMoveInstr :: Instr -> Maybe (Reg, Reg) +takeRegRegMoveInstr (MOV (OpReg width dst) (OpReg width' src)) + | width == width' && (isFloatReg dst == isFloatReg src) = pure (src, dst) +takeRegRegMoveInstr _ = Nothing + +-- | Make an unconditional jump instruction. +mkJumpInstr :: BlockId -> [Instr] +mkJumpInstr = pure . B . TBlock + +-- | Decrement @sp@ to allocate stack space. +-- +-- The stack grows downwards, so we decrement the stack pointer by @n@ (bytes). +-- This is dual to `mkStackDeallocInstr`. @sp@ is the RISCV stack pointer, not +-- to be confused with the STG stack pointer. +mkStackAllocInstr :: Platform -> Int -> [Instr] +mkStackAllocInstr _platform = moveSp . negate + +-- | Increment SP to deallocate stack space. +-- +-- The stack grows downwards, so we increment the stack pointer by @n@ (bytes). +-- This is dual to `mkStackAllocInstr`. 
@sp@ is the RISCV stack pointer, not to +-- be confused with the STG stack pointer. +mkStackDeallocInstr :: Platform -> Int -> [Instr] +mkStackDeallocInstr _platform = moveSp + +moveSp :: Int -> [Instr] +moveSp n + | n == 0 = [] + | n /= 0 && fitsIn12bitImm n = pure . ANN desc $ ADD sp sp (OpImm (ImmInt n)) + | otherwise = + -- This ends up in three effective instructions. We could get away with + -- two for intMax12bit < n < 3 * intMax12bit by recursing once. However, + -- this way is likely less surprising. + [ ANN desc (MOV ip (OpImm (ImmInt n))), + ADD sp sp ip + ] + where + desc = text "Move SP:" <+> int n + +-- +-- See Note [extra spill slots] in X86/Instr.hs +-- +allocMoreStack + :: Platform + -> Int + -> NatCmmDecl statics GHC.CmmToAsm.RV64.Instr.Instr + -> UniqSM (NatCmmDecl statics GHC.CmmToAsm.RV64.Instr.Instr, [(BlockId,BlockId)]) + +allocMoreStack _ _ top@(CmmData _ _) = return (top,[]) +allocMoreStack platform slots proc@(CmmProc info lbl live (ListGraph code)) = do + let entries = entryBlocks proc + + uniqs <- getUniquesM + + let + delta = ((x + stackAlign - 1) `quot` stackAlign) * stackAlign -- round up + where x = slots * spillSlotSize -- sp delta + + alloc = mkStackAllocInstr platform delta + dealloc = mkStackDeallocInstr platform delta + + retargetList = zip entries (map mkBlockId uniqs) + + new_blockmap :: LabelMap BlockId + new_blockmap = mapFromList retargetList + + insert_stack_insn (BasicBlock id insns) + | Just new_blockid <- mapLookup id new_blockmap + = [ BasicBlock id $ alloc ++ [ B (TBlock new_blockid) ] + , BasicBlock new_blockid block' ] + | otherwise + = [ BasicBlock id block' ] + where + block' = foldr insert_dealloc [] insns + + insert_dealloc insn r = case insn of + J _ -> dealloc ++ (insn : r) + J_TBL {} -> dealloc ++ (insn : r) + ANN _ e -> insert_dealloc e r + _other | jumpDestsOfInstr insn /= [] + -> patchJumpInstr insn retarget : r + _other -> insn : r + + where retarget b = fromMaybe b (mapLookup b new_blockmap) + + new_code 
= concatMap insert_stack_insn code + return (CmmProc info lbl live (ListGraph new_code), retargetList) + +-- ----------------------------------------------------------------------------- +-- Machine's assembly language + +-- We have a few common "instructions" (nearly all the pseudo-ops) but +-- mostly all of 'Instr' is machine-specific. + +-- RV64 reference card: https://cs61c.org/sp23/pdfs/resources/reference-card.pdf +-- RV64 pseudo instructions: https://github.com/riscv-non-isa/riscv-asm-manual/blob/master/riscv-asm.md#-a-listing-of-standard-risc-v-pseudoinstructions +-- We will target: RV64G(C). That is G = I+A+F+S+D +-- I: Integer Multiplication and Division +-- A: Atomic Instructions +-- F: Single Precision +-- D: Double Precision +-- C: Compressed (though we won't use that). + +-- This most notably leaves out B. (Bit Manipulation) instructions. + +data Instr + -- comment pseudo-op + = COMMENT SDoc + | MULTILINE_COMMENT SDoc + + -- Annotated instruction. Should print <instr> # <doc> + | ANN SDoc Instr + + -- location pseudo-op (file, line, col, name) + | LOCATION Int Int Int String + + -- some static data spat out during code + -- generation. Will be extracted before + -- pretty-printing. + | LDATA Section RawCmmStatics + + -- start a new basic block. Useful during + -- codegen, removed later. Preceding + -- instruction should be a jump, as per the + -- invariants for a BasicBlock (see Cmm). + | NEWBLOCK BlockId + + -- specify current stack offset for + -- benefit of subsequent passes + | DELTA Int + + -- 0. Pseudo Instructions -------------------------------------------------- + | PUSH_STACK_FRAME + | POP_STACK_FRAME + + -- == Base Instructions (I) ================================================ + -- 1. Arithmetic Instructions ---------------------------------------------- + -- all of these instructions can also take an immediate, in which case they + -- hafe a suffix I (except for U suffix, where it's IU then. E.g. SLTIU). 
+ | ADD Operand Operand Operand -- rd = rs1 + rs2 + | SUB Operand Operand Operand -- rd = rs1 - rs2 + + | AND Operand Operand Operand -- rd = rs1 & rs2 + | OR Operand Operand Operand -- rd = rs1 | rs2 + -- | XOR Operand Operand Operand -- rd = rs1 ^ rs2 + | LSL {- SLL -} Operand Operand Operand -- rd = rs1 << rs2 (zero ext) + | LSR {- SRL -} Operand Operand Operand -- rd = rs1 >> rs2 (zero ext) + | ASR {- SRA -} Operand Operand Operand -- rd = rs1 >> rs2 (sign ext) + + -- 2. Memory Load/Store Instructions --------------------------------------- + -- Unlike arm, we don't have register shorthands for size. + -- We do however have {L,S}{B,H,W,D}[U] instructions for Load/Store, Byte, Half, Word, Double, (Unsigned). + -- Reusing the arm logic with the _format_ specifier will hopefully work. + | STR Format Operand Operand -- str Xn, address-mode // Xn -> *addr + | LDR Format Operand Operand -- ldr Xn, address-mode // Xn <- *addr (sign-extended) + | LDRU Format Operand Operand -- ldr Xn, address-mode // Xn <- *addr (unsigned) + + -- 3. Control Flow --------------------------------------------------------- + -- B{EQ,GE,GEU,LT,LTU}, these are effectively BCOND from AArch64; + -- however, AArch64 desugars them into CMP + BCOND. So these are a bit more + -- powerful. + -- JAL / JARL are effectively the BL instruction from AArch64. + + | MUL Operand Operand Operand -- rd = rn × rm + + + -- Pseudo/synthesized: + | NEG Operand Operand -- rd = -op2 + + | DIV Operand Operand Operand -- rd = rn ÷ rm + | REM Operand Operand Operand -- rd = rn % rm (signed) + | REMU Operand Operand Operand -- rd = rn % rm (unsigned) + + -- TODO: Rename: MULH + | SMULH Operand Operand Operand + | DIVU Operand Operand Operand -- rd = rn ÷ rm + + -- 2. Bit Manipulation Instructions ---------------------------------------- + + -- 3. 
Logical and Move Instructions ---------------------------------------- + -- | AND Operand Operand Operand -- rd = rn & op2 + -- | ANDS Operand Operand Operand -- rd = rn & op2 + -- | ASR Operand Operand Operand -- rd = rn ≫ rm or rd = rn ≫ #i, i is 6 bits + | XOR Operand Operand Operand -- rd = rn ⊕ op2 + -- | LSL Operand Operand Operand -- rd = rn ≪ rm or rd = rn ≪ #i, i is 6 bits + -- | LSR Operand Operand Operand -- rd = rn ≫ rm or rd = rn ≫ #i, i is 6 bits + | MOV Operand Operand -- rd = rn or rd = #i + | ORI Operand Operand Operand -- rd = rn | op2 + | XORI Operand Operand Operand -- rd = rn `xor` imm + -- Load and stores. + + -- Conditional instructions + -- This is a synthetic operation. + | CSET Operand Operand Operand Cond -- if(o2 cond o3) op <- 1 else op <- 0 + + -- Branching. + -- TODO: Unused + | J Target -- like B, but only generated from genJump. Used to distinguish genJumps from others. + -- | A `J` instruction with data for switch jump tables + | J_TBL [Maybe BlockId] (Maybe CLabel) Reg + | B Target -- unconditional branching b/br. (To a blockid, label or register) + | BL Target [Reg] [Reg] -- branch and link (e.g. set x30 to next pc, and branch) + | BCOND Cond Operand Operand Target -- branch with condition. b.<cond> + + -- 8. Synchronization Instructions ----------------------------------------- + | DMBSY DmbType DmbType + -- 9. 
Floating Point Instructions + -- Float ConVerT + | FCVT Operand Operand + -- Signed ConVerT Float + | SCVTF Operand Operand + -- Float ConVerT to Zero Signed + | FCVTZS Operand Operand + -- Float ABSolute value + | FABS Operand Operand + +data DmbType = DmbRead | DmbWrite | DmbReadWrite + +instrCon :: Instr -> String +instrCon i = + case i of + COMMENT{} -> "COMMENT" + MULTILINE_COMMENT{} -> "COMMENT" + ANN{} -> "ANN" + LOCATION{} -> "LOCATION" + LDATA{} -> "LDATA" + NEWBLOCK{} -> "NEWBLOCK" + DELTA{} -> "DELTA" + PUSH_STACK_FRAME{} -> "PUSH_STACK_FRAME" + POP_STACK_FRAME{} -> "POP_STACK_FRAME" + ADD{} -> "ADD" + OR{} -> "OR" + MUL{} -> "MUL" + NEG{} -> "NEG" + DIV{} -> "DIV" + REM{} -> "REM" + REMU{} -> "REMU" + SMULH{} -> "SMULH" + SUB{} -> "SUB" + DIVU{} -> "DIVU" + AND{} -> "AND" + ASR{} -> "ASR" + XOR{} -> "XOR" + LSL{} -> "LSL" + LSR{} -> "LSR" + MOV{} -> "MOV" + ORI{} -> "ORI" + XORI{} -> "ORI" + STR{} -> "STR" + LDR{} -> "LDR" + LDRU{} -> "LDRU" + CSET{} -> "CSET" + J{} -> "J" + J_TBL{} -> "J_TBL" + B{} -> "B" + BL{} -> "BL" + BCOND{} -> "BCOND" + DMBSY{} -> "DMBSY" + FCVT{} -> "FCVT" + SCVTF{} -> "SCVTF" + FCVTZS{} -> "FCVTZS" + FABS{} -> "FABS" + +data Target + = TBlock BlockId + | TReg Reg + +data Operand + = -- | register + OpReg Width Reg + | -- | immediate value + OpImm Imm + | -- | memory reference + OpAddr AddrMode + deriving (Eq, Show) + +operandFromReg :: Reg -> Operand +operandFromReg = OpReg W64 + +operandFromRegNo :: RegNo -> Operand +operandFromRegNo = operandFromReg . 
regSingle + +zero, ra, sp, gp, tp, fp, ip :: Operand +zero = operandFromReg zeroReg +ra = operandFromReg raReg +sp = operandFromReg spMachReg +gp = operandFromRegNo 3 +tp = operandFromRegNo 4 +fp = operandFromRegNo 8 +ip = operandFromReg ipReg + +x0, x1, x2, x3, x4, x5, x6, x7 :: Operand +x8, x9, x10, x11, x12, x13, x14, x15 :: Operand +x16, x17, x18, x19, x20, x21, x22, x23 :: Operand +x24, x25, x26, x27, x28, x29, x30, x31 :: Operand +x0 = operandFromRegNo x0RegNo +x1 = operandFromRegNo 1 +x2 = operandFromRegNo 2 +x3 = operandFromRegNo 3 +x4 = operandFromRegNo 4 +x5 = operandFromRegNo x5RegNo +x6 = operandFromRegNo 6 +x7 = operandFromRegNo x7RegNo +x8 = operandFromRegNo 8 +x9 = operandFromRegNo 9 +x10 = operandFromRegNo x10RegNo +x11 = operandFromRegNo 11 +x12 = operandFromRegNo 12 +x13 = operandFromRegNo 13 +x14 = operandFromRegNo 14 +x15 = operandFromRegNo 15 +x16 = operandFromRegNo 16 +x17 = operandFromRegNo x17RegNo +x18 = operandFromRegNo 18 +x19 = operandFromRegNo 19 +x20 = operandFromRegNo 20 +x21 = operandFromRegNo 21 +x22 = operandFromRegNo 22 +x23 = operandFromRegNo 23 +x24 = operandFromRegNo 24 +x25 = operandFromRegNo 25 +x26 = operandFromRegNo 26 +x27 = operandFromRegNo 27 +x28 = operandFromRegNo x28RegNo +x29 = operandFromRegNo 29 +x30 = operandFromRegNo 30 +x31 = operandFromRegNo x31RegNo + +d0, d1, d2, d3, d4, d5, d6, d7 :: Operand +d8, d9, d10, d11, d12, d13, d14, d15 :: Operand +d16, d17, d18, d19, d20, d21, d22, d23 :: Operand +d24, d25, d26, d27, d28, d29, d30, d31 :: Operand +d0 = operandFromRegNo d0RegNo +d1 = operandFromRegNo 33 +d2 = operandFromRegNo 34 +d3 = operandFromRegNo 35 +d4 = operandFromRegNo 36 +d5 = operandFromRegNo 37 +d6 = operandFromRegNo 38 +d7 = operandFromRegNo d7RegNo +d8 = operandFromRegNo 40 +d9 = operandFromRegNo 41 +d10 = operandFromRegNo d10RegNo +d11 = operandFromRegNo 43 +d12 = operandFromRegNo 44 +d13 = operandFromRegNo 45 +d14 = operandFromRegNo 46 +d15 = operandFromRegNo 47 +d16 = operandFromRegNo 48 +d17 = 
operandFromRegNo d17RegNo +d18 = operandFromRegNo 50 +d19 = operandFromRegNo 51 +d20 = operandFromRegNo 52 +d21 = operandFromRegNo 53 +d22 = operandFromRegNo 54 +d23 = operandFromRegNo 55 +d24 = operandFromRegNo 56 +d25 = operandFromRegNo 57 +d26 = operandFromRegNo 58 +d27 = operandFromRegNo 59 +d28 = operandFromRegNo 60 +d29 = operandFromRegNo 61 +d30 = operandFromRegNo 62 +d31 = operandFromRegNo d31RegNo + +fitsIn12bitImm :: (Num a, Ord a) => a -> Bool +fitsIn12bitImm off = off >= intMin12bit && off <= intMax12bit + +intMin12bit :: Num a => a +intMin12bit = -2048 + +intMax12bit :: Num a => a +intMax12bit = 2047 + +fitsIn32bits :: (Num a, Ord a, Bits a) => a -> Bool +fitsIn32bits i = (-1 `shiftL` 31) <= i && i <= (1 `shiftL` 31 -1) + +isNbitEncodeable :: Int -> Integer -> Bool +isNbitEncodeable n i = let shift = n - 1 in (-1 `shiftL` shift) <= i && i < (1 `shiftL` shift) + +isEncodeableInWidth :: Width -> Integer -> Bool +isEncodeableInWidth = isNbitEncodeable . widthInBits + +isIntOp :: Operand -> Bool +isIntOp = not . 
isFloatOp + +isFloatOp :: Operand -> Bool +isFloatOp (OpReg _ reg) | isFloatReg reg = True +isFloatOp _ = False + +isFloatReg :: Reg -> Bool +isFloatReg (RegReal (RealRegSingle i)) | i > 31 = True +isFloatReg (RegVirtual (VirtualRegF _)) = True +isFloatReg (RegVirtual (VirtualRegD _)) = True +isFloatReg _ = False Index: ghc-9.8.2/compiler/GHC/CmmToAsm/RV64/Ppr.hs =================================================================== --- /dev/null +++ ghc-9.8.2/compiler/GHC/CmmToAsm/RV64/Ppr.hs @@ -0,0 +1,662 @@ +module GHC.CmmToAsm.RV64.Ppr (pprNatCmmDecl, pprInstr) where + +import GHC.Prelude hiding (EQ) + +import GHC.CmmToAsm.RV64.Instr +import GHC.CmmToAsm.RV64.Regs +import GHC.CmmToAsm.RV64.Cond +import GHC.CmmToAsm.Ppr +import GHC.CmmToAsm.Format +import GHC.Platform.Reg +import GHC.CmmToAsm.Config +import GHC.CmmToAsm.Types +import GHC.CmmToAsm.Utils + +import GHC.Cmm hiding (topInfoTable) +import GHC.Cmm.Dataflow.Collections +import GHC.Cmm.Dataflow.Label +import GHC.Types.Basic (Alignment, mkAlignment, alignmentBytes) + +import GHC.Cmm.BlockId +import GHC.Cmm.CLabel + +import GHC.Types.Unique ( pprUniqueAlways, getUnique ) +import GHC.Platform +import GHC.Utils.Outputable + +import GHC.Utils.Panic + +-- TODO: Move function down to where it is used. +pprProcAlignment :: IsDoc doc => NCGConfig -> doc +pprProcAlignment config = maybe empty (pprAlign platform . 
mkAlignment) (ncgProcAlignment config) + where + platform = ncgPlatform config + +pprNatCmmDecl :: IsDoc doc => NCGConfig -> NatCmmDecl RawCmmStatics Instr -> doc +pprNatCmmDecl config (CmmData section dats) = + pprSectionAlign config section $$ pprDatas config dats + +pprNatCmmDecl config proc@(CmmProc top_info lbl _ (ListGraph blocks)) = + let platform = ncgPlatform config in + pprProcAlignment config $$ + case topInfoTable proc of + Nothing -> + -- special case for code without info table: + pprSectionAlign config (Section Text lbl) $$ + -- do not + -- pprProcAlignment config $$ + pprLabel platform lbl $$ -- blocks guaranteed not null, so label needed + vcat (map (pprBasicBlock config top_info) blocks) $$ + ppWhen (ncgDwarfEnabled config) + (line (pprBlockEndLabel platform lbl) $$ line (pprProcEndLabel platform lbl)) $$ + pprSizeDecl platform lbl + + Just (CmmStaticsRaw info_lbl _) -> + pprSectionAlign config (Section Text info_lbl) $$ + -- pprProcAlignment config $$ + (if platformHasSubsectionsViaSymbols platform + then line (pprAsmLabel platform (mkDeadStripPreventer info_lbl) <> char ':') + else empty) $$ + vcat (map (pprBasicBlock config top_info) blocks) $$ + ppWhen (ncgDwarfEnabled config) (line (pprProcEndLabel platform info_lbl)) $$ + -- above: Even the first block gets a label, because with branch-chain + -- elimination, it might be the target of a goto. 
+ (if platformHasSubsectionsViaSymbols platform + then -- See Note [Subsections Via Symbols] + line + $ text "\t.long " + <+> pprAsmLabel platform info_lbl + <+> char '-' + <+> pprAsmLabel platform (mkDeadStripPreventer info_lbl) + else empty) $$ + pprSizeDecl platform info_lbl +{-# SPECIALIZE pprNatCmmDecl :: NCGConfig -> NatCmmDecl RawCmmStatics Instr -> SDoc #-} +{-# SPECIALIZE pprNatCmmDecl :: NCGConfig -> NatCmmDecl RawCmmStatics Instr -> HDoc #-} -- see Note [SPECIALIZE to HDoc] in GHC.Utils.Outputable + +pprLabel :: IsDoc doc => Platform -> CLabel -> doc +pprLabel platform lbl = + pprGloblDecl platform lbl + $$ pprTypeDecl platform lbl + $$ line (pprAsmLabel platform lbl <> char ':') + +-- TODO: Delete unused parameter. +pprAlign :: IsDoc doc => Platform -> Alignment -> doc +pprAlign _platform alignment + = line $ text "\t.balign " <> int (alignmentBytes alignment) + +-- TODO: Delete unused parameters. +-- | Print appropriate alignment for the given section type. +pprAlignForSection :: IsDoc doc => Platform -> SectionType -> doc +pprAlignForSection _platform _seg + -- .balign is stable, whereas .align is platform dependent. + = line (text "\t.balign 8") -- always 8 + +-- | Print section header and appropriate alignment for that section. 
+-- +-- This one will emit the header: +-- +-- .section .text +-- .balign 8 +-- +pprSectionAlign :: IsDoc doc => NCGConfig -> Section -> doc +pprSectionAlign _config (Section (OtherSection _) _) = + panic "RV64.Ppr.pprSectionAlign: unknown section" +pprSectionAlign config sec@(Section seg _) = + line (pprSectionHeader config sec) + $$ pprAlignForSection (ncgPlatform config) seg + +pprProcEndLabel :: IsLine doc => Platform -> CLabel -- ^ Procedure name + -> doc +pprProcEndLabel platform lbl = + pprAsmLabel platform (mkAsmTempProcEndLabel lbl) <> colon + +pprBlockEndLabel :: IsLine doc => Platform -> CLabel -- ^ Block name + -> doc +pprBlockEndLabel platform lbl = + pprAsmLabel platform (mkAsmTempEndLabel lbl) <> colon + +-- | Output the ELF .size directive. +pprSizeDecl :: IsDoc doc => Platform -> CLabel -> doc +pprSizeDecl platform lbl + = if osElfTarget (platformOS platform) + then line (text "\t.size" <+> pprAsmLabel platform lbl <> text ", .-" <> pprAsmLabel platform lbl) + else empty + +pprBasicBlock :: IsDoc doc => NCGConfig -> LabelMap RawCmmStatics -> NatBasicBlock Instr + -> doc +pprBasicBlock config info_env (BasicBlock blockid instrs) + = maybe_infotable $ + pprLabel platform asmLbl $$ + vcat (map (pprInstr platform) (id {-detectTrivialDeadlock-} optInstrs)) $$ + ppWhen (ncgDwarfEnabled config) ( + -- Emit both end labels since this may end up being a standalone + -- top-level block + line (pprBlockEndLabel platform asmLbl + <> pprProcEndLabel platform asmLbl) + ) + where + -- TODO: Check if we can filter more instructions here. + -- TODO: Shouldn't this be a more general check on a higher level? + -- Filter out identity moves. E.g. mov x18, x18 will be dropped. 
+ optInstrs = filter f instrs + where f (MOV o1 o2) | o1 == o2 = False + f _ = True + + asmLbl = blockLbl blockid + platform = ncgPlatform config + maybe_infotable c = case mapLookup blockid info_env of + Nothing -> c + Just (CmmStaticsRaw info_lbl info) -> + -- pprAlignForSection platform Text $$ + infoTableLoc $$ + vcat (map (pprData config) info) $$ + pprLabel platform info_lbl $$ + c $$ + ppWhen (ncgDwarfEnabled config) + (line (pprBlockEndLabel platform info_lbl)) + -- Make sure the info table has the right .loc for the block + -- coming right after it. See Note [Info Offset] + infoTableLoc = case instrs of + (l@LOCATION{} : _) -> pprInstr platform l + _other -> empty + +pprDatas :: IsDoc doc => NCGConfig -> RawCmmStatics -> doc +-- See Note [emit-time elimination of static indirections] in "GHC.Cmm.CLabel". +pprDatas config (CmmStaticsRaw alias [CmmStaticLit (CmmLabel lbl), CmmStaticLit ind, _, _]) + | lbl == mkIndStaticInfoLabel + , let labelInd (CmmLabelOff l _) = Just l + labelInd (CmmLabel l) = Just l + labelInd _ = Nothing + , Just ind' <- labelInd ind + , alias `mayRedirectTo` ind' + = pprGloblDecl (ncgPlatform config) alias + $$ line (text ".equiv" <+> pprAsmLabel (ncgPlatform config) alias <> comma <> pprAsmLabel (ncgPlatform config) ind') + +pprDatas config (CmmStaticsRaw lbl dats) + = vcat (pprLabel platform lbl : map (pprData config) dats) + where + platform = ncgPlatform config + +-- TODO: Unused parameter. +pprData :: IsDoc doc => NCGConfig -> CmmStatic -> doc +pprData _config (CmmString str) = line (pprString str) +pprData _config (CmmFileEmbed path _) = line (pprFileEmbed path) + +-- TODO: AFAIK there no Darwin for RISCV, so we may consider to simplify this. 
+pprData config (CmmUninitialised bytes) + = line $ let platform = ncgPlatform config + in if platformOS platform == OSDarwin + then text ".space " <> int bytes + else text ".skip " <> int bytes + +pprData config (CmmStaticLit lit) = pprDataItem config lit + +pprGloblDecl :: IsDoc doc => Platform -> CLabel -> doc +pprGloblDecl platform lbl + | not (externallyVisibleCLabel lbl) = empty + | otherwise = line (text "\t.globl " <> pprAsmLabel platform lbl) + +-- Note [Always use objects for info tables] +-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +-- See discussion in X86.Ppr for why this is necessary. Essentially we need to +-- ensure that we never pass function symbols when we might want to lookup the +-- info table. If we did, we could end up with procedure linking tables +-- (PLT)s, and thus the lookup wouldn't point to the function, but into the +-- jump table. +-- +-- Fun fact: The LLVMMangler exists to patch this issue su on the LLVM side as +-- well. +pprLabelType' :: IsLine doc => Platform -> CLabel -> doc +pprLabelType' platform lbl = + if isCFunctionLabel lbl || functionOkInfoTable then + text "@function" + else + text "@object" + where + functionOkInfoTable = platformTablesNextToCode platform && + isInfoTableLabel lbl && not (isCmmInfoTableLabel lbl) && not (isConInfoTableLabel lbl) + +-- this is called pprTypeAndSizeDecl in PPC.Ppr +pprTypeDecl :: IsDoc doc => Platform -> CLabel -> doc +pprTypeDecl platform lbl + = if osElfTarget (platformOS platform) && externallyVisibleCLabel lbl + then line (text ".type " <> pprAsmLabel platform lbl <> text ", " <> pprLabelType' platform lbl) + else empty + +pprDataItem :: IsDoc doc => NCGConfig -> CmmLit -> doc +pprDataItem config lit + = lines_ (ppr_item (cmmTypeFormat $ cmmLitType platform lit) lit) + where + platform = ncgPlatform config + + imm = litToImm lit + + ppr_item II8 _ = [text "\t.byte\t" <> pprDataImm platform imm] + ppr_item II16 _ = [text "\t.short\t" <> pprDataImm platform imm] + ppr_item II32 _ = 
[text "\t.long\t" <> pprDataImm platform imm] + ppr_item II64 _ = [text "\t.quad\t" <> pprDataImm platform imm] + + ppr_item FF32 (CmmFloat r _) + = let bs = floatToBytes (fromRational r) + in map (\b -> text "\t.byte\t" <> int (fromIntegral b)) bs + + ppr_item FF64 (CmmFloat r _) + = let bs = doubleToBytes (fromRational r) + in map (\b -> text "\t.byte\t" <> int (fromIntegral b)) bs + + ppr_item _ _ = pprPanic "pprDataItem:ppr_item" (text $ show lit) + +-- | Pretty print an immediate value in the @data@ section +-- +-- This does not include any checks. We rely on the Assembler to check for +-- errors. Use `pprOpImm` for immediates in instructions (operands.) +pprDataImm :: IsLine doc => Platform -> Imm -> doc +pprDataImm _ (ImmInt i) = int i +pprDataImm _ (ImmInteger i) = integer i +pprDataImm p (ImmCLbl l) = pprAsmLabel p l +pprDataImm p (ImmIndex l i) = pprAsmLabel p l <> char '+' <> int i +pprDataImm _ (ImmLit s) = ftext s +pprDataImm _ (ImmFloat f) = float (fromRational f) +pprDataImm _ (ImmDouble d) = double (fromRational d) + +pprDataImm p (ImmConstantSum a b) = pprDataImm p a <> char '+' <> pprDataImm p b +pprDataImm p (ImmConstantDiff a b) = pprDataImm p a <> char '-' + <> lparen <> pprDataImm p b <> rparen + +asmComment :: SDoc -> SDoc +asmComment c = text "#" <+> c + +asmDoubleslashComment :: SDoc -> SDoc +asmDoubleslashComment c = text "//" <+> c + +asmMultilineComment :: SDoc -> SDoc +asmMultilineComment c = text "/*" $+$ c $+$ text "*/" + +-- | Pretty print an immediate operand of an instruction +-- +-- The kinds of immediates we can use here is pretty limited: RISCV doesn't +-- support index expressions (as e.g. Aarch64 does.) Floating points need to +-- fit in range. As we don't need them, forbit them to save us from future +-- troubles. 
+pprOpImm :: (IsLine doc) => Platform -> Imm -> doc +pprOpImm platform im = case im of + ImmInt i -> int i + ImmInteger i -> integer i + ImmCLbl l -> char '=' <> pprAsmLabel platform l + _ -> pprPanic "RV64.Ppr.pprOpImm" (text "Unsupported immediate for instruction operands" <> colon <+> (text . show) im) + +negOp :: Operand -> Operand +negOp (OpImm (ImmInt i)) = OpImm (ImmInt (negate i)) +negOp (OpImm (ImmInteger i)) = OpImm (ImmInteger (negate i)) +negOp op = pprPanic "RV64.negOp" (text $ show op) + +pprOp :: IsLine doc => Platform -> Operand -> doc +pprOp plat op = case op of + OpReg w r -> pprReg w r + OpImm im -> pprOpImm plat im + OpAddr (AddrRegImm r1 im) -> pprOpImm plat im <> char '(' <> pprReg W64 r1 <> char ')' + OpAddr (AddrReg r1) -> text "0(" <+> pprReg W64 r1 <+> char ')' + +pprReg :: forall doc. IsLine doc => Width -> Reg -> doc +pprReg w r = case r of + RegReal (RealRegSingle i) -> ppr_reg_no i + -- virtual regs should not show up, but this is helpful for debugging. + RegVirtual (VirtualRegI u) -> text "%vI_" <> pprUniqueAlways u + RegVirtual (VirtualRegF u) -> text "%vF_" <> pprUniqueAlways u + RegVirtual (VirtualRegD u) -> text "%vD_" <> pprUniqueAlways u + _ -> pprPanic "RiscV64.pprReg" (text (show r) <+> ppr w) + + where + ppr_reg_no :: Int -> doc + -- General Purpose Registers + ppr_reg_no 0 = text "zero" + ppr_reg_no 1 = text "ra" + ppr_reg_no 2 = text "sp" + ppr_reg_no 3 = text "gp" + ppr_reg_no 4 = text "tp" + ppr_reg_no 5 = text "t0" + ppr_reg_no 6 = text "t1" + ppr_reg_no 7 = text "t2" + ppr_reg_no 8 = text "s0" + ppr_reg_no 9 = text "s1" + ppr_reg_no 10 = text "a0" + ppr_reg_no 11 = text "a1" + ppr_reg_no 12 = text "a2" + ppr_reg_no 13 = text "a3" + ppr_reg_no 14 = text "a4" + ppr_reg_no 15 = text "a5" + ppr_reg_no 16 = text "a6" + ppr_reg_no 17 = text "a7" + ppr_reg_no 18 = text "s2" + ppr_reg_no 19 = text "s3" + ppr_reg_no 20 = text "s4" + ppr_reg_no 21 = text "s5" + ppr_reg_no 22 = text "s6" + ppr_reg_no 23 = text "s7" + ppr_reg_no 24 
= text "s8" + ppr_reg_no 25 = text "s9" + ppr_reg_no 26 = text "s10" + ppr_reg_no 27 = text "s11" + ppr_reg_no 28 = text "t3" + ppr_reg_no 29 = text "t4" + ppr_reg_no 30 = text "t5" + ppr_reg_no 31 = text "t6" + + -- Floating Point Registers + ppr_reg_no 32 = text "ft0" + ppr_reg_no 33 = text "ft1" + ppr_reg_no 34 = text "ft2" + ppr_reg_no 35 = text "ft3" + ppr_reg_no 36 = text "ft4" + ppr_reg_no 37 = text "ft5" + ppr_reg_no 38 = text "ft6" + ppr_reg_no 39 = text "ft7" + ppr_reg_no 40 = text "fs0" + ppr_reg_no 41 = text "fs1" + ppr_reg_no 42 = text "fa0" + ppr_reg_no 43 = text "fa1" + ppr_reg_no 44 = text "fa2" + ppr_reg_no 45 = text "fa3" + ppr_reg_no 46 = text "fa4" + ppr_reg_no 47 = text "fa5" + ppr_reg_no 48 = text "fa6" + ppr_reg_no 49 = text "fa7" + ppr_reg_no 50 = text "fs2" + ppr_reg_no 51 = text "fs3" + ppr_reg_no 52 = text "fs4" + ppr_reg_no 53 = text "fs5" + ppr_reg_no 54 = text "fs6" + ppr_reg_no 55 = text "fs7" + ppr_reg_no 56 = text "fs8" + ppr_reg_no 57 = text "fs9" + ppr_reg_no 58 = text "fs10" + ppr_reg_no 59 = text "fs11" + ppr_reg_no 60 = text "ft8" + ppr_reg_no 61 = text "ft9" + ppr_reg_no 62 = text "ft10" + ppr_reg_no 63 = text "ft11" + + ppr_reg_no i + | i < 0 = pprPanic "Unexpected register number (min is 0)" (ppr w <+> int i) + | i > 63 = pprPanic "Unexpected register number (max is 63)" (ppr w <+> int i) + -- no support for widths > W64. 
+ | otherwise = pprPanic "Unsupported width in register (max is 64)" (ppr w <+> int i) + +isSingleOp :: Operand -> Bool +isSingleOp (OpReg W32 _) = True +isSingleOp _ = False + +isDoubleOp :: Operand -> Bool +isDoubleOp (OpReg W64 _) = True +isDoubleOp _ = False + +isImmOp :: Operand -> Bool +isImmOp (OpImm _) = True +isImmOp _ = False + +isImmZero :: Operand -> Bool +isImmZero (OpImm (ImmFloat 0)) = True +isImmZero (OpImm (ImmDouble 0)) = True +isImmZero (OpImm (ImmInt 0)) = True +isImmZero _ = False + +isLabel :: Target -> Bool +isLabel (TBlock _) = True +isLabel _ = False + +getLabel :: IsLine doc => Platform -> Target -> doc +getLabel platform (TBlock bid) = pprAsmLabel platform (mkLocalBlockLabel (getUnique bid)) +getLabel _platform _other = panic "Cannot turn this into a label" + +pprInstr :: IsDoc doc => Platform -> Instr -> doc +pprInstr platform instr = case instr of + -- Meta Instructions --------------------------------------------------------- + -- see Note [dualLine and dualDoc] in GHC.Utils.Outputable + COMMENT s -> dualDoc (asmComment s) empty + MULTILINE_COMMENT s -> dualDoc (asmMultilineComment s) empty + ANN d i -> dualDoc (pprInstr platform i <+> asmDoubleslashComment d) (pprInstr platform i) + + LOCATION file line' col _name + -> line (text "\t.loc" <+> int file <+> int line' <+> int col) + DELTA d -> dualDoc (asmComment $ text "\tdelta = " <> int d) empty + NEWBLOCK _ -> panic "PprInstr: NEWBLOCK" + LDATA _ _ -> panic "pprInstr: LDATA" + + -- Pseudo Instructions ------------------------------------------------------- + + PUSH_STACK_FRAME -> lines_ [ text "\taddi sp, sp, -16" + , text "\tsd x1, 8(sp)" -- store RA + , text "\tsd x8, 0(sp)" -- store FP/s0 + , text "\taddi x8, sp, 16"] + + POP_STACK_FRAME -> lines_ [ text "\tld x8, 0(sp)" -- restore FP/s0 + , text "\tld x1, 8(sp)" -- restore RA + , text "\taddi sp, sp, 16" ] + -- =========================================================================== + -- AArch64 Instruction Set + -- 1. 
Arithmetic Instructions ------------------------------------------------ + ADD o1 o2 o3 + | isFloatOp o1 && isFloatOp o2 && isFloatOp o3 -> op3 (text "\tfadd." <> if isSingleOp o1 then text "s" else text "d") o1 o2 o3 + -- This case is used for sign extension: SEXT.W op + | OpReg W64 _ <- o1 , OpReg W32 _ <- o2, isImmOp o3 -> op3 (text "\taddiw") o1 o2 o3 + | otherwise -> op3 (text "\tadd") o1 o2 o3 + MUL o1 o2 o3 + | isFloatOp o1 && isFloatOp o2 && isFloatOp o3 -> op3 (text "\tfmul." <> if isSingleOp o1 then text "s" else text "d") o1 o2 o3 + | otherwise -> op3 (text "\tmul") o1 o2 o3 + SMULH o1 o2 o3 -> op3 (text "\tmulh") o1 o2 o3 + NEG o1 o2 | isFloatOp o1 && isFloatOp o2 && isSingleOp o2 -> op2 (text "\tfneg.s") o1 o2 + NEG o1 o2 | isFloatOp o1 && isFloatOp o2 && isDoubleOp o2 -> op2 (text "\tfneg.d") o1 o2 + NEG o1 o2 -> op2 (text "\tneg") o1 o2 + DIV o1 o2 o3 | isFloatOp o1 && isFloatOp o2 && isFloatOp o3 + -- TODO: This must (likely) be refined regarding width + -> op3 (text "\tfdiv." <> if isSingleOp o1 then text "s" else text "d") o1 o2 o3 + DIV o1 o2 o3 -> op3 (text "\tdiv") o1 o2 o3 + REM o1 o2 o3 | isFloatOp o1 && isFloatOp o2 && isFloatOp o3 + -> panic "pprInstr - REM not implemented for floats (yet)" + REM o1 o2 o3 -> op3 (text "\trem") o1 o2 o3 + REMU o1 o2 o3 -> op3 (text "\tremu") o1 o2 o3 + + SUB o1 o2 o3 + | isFloatOp o1 && isFloatOp o2 && isFloatOp o3 -> op3 (text "\tfsub." <> if isSingleOp o1 then text "s" else text "d") o1 o2 o3 + | isImmOp o3 -> op3 (text "\taddi") o1 o2 (negOp o3) + | otherwise -> op3 (text "\tsub") o1 o2 o3 + DIVU o1 o2 o3 -> op3 (text "\tdivu") o1 o2 o3 + + -- 2. Bit Manipulation Instructions ------------------------------------------ + + -- 3. 
Logical and Move Instructions ------------------------------------------ + AND o1 o2 o3 | isImmOp o3 -> op3 (text "\tandi") o1 o2 o3 + | otherwise -> op3 (text "\tand") o1 o2 o3 + OR o1 o2 o3 -> op3 (text "\tor") o1 o2 o3 + ASR o1 o2 o3 | isImmOp o3 -> op3 (text "\tsrai") o1 o2 o3 + ASR o1 o2 o3 -> op3 (text "\tsra") o1 o2 o3 + XOR o1 o2 o3 -> op3 (text "\txor") o1 o2 o3 + LSL o1 o2 o3 -> op3 (text "\tsll") o1 o2 o3 + LSR o1 o2 o3 -> op3 (text "\tsrl") o1 o2 o3 + MOV o1 o2 + | isFloatOp o1 && isFloatOp o2 && isDoubleOp o2 -> op2 (text "\tfmv.d") o1 o2 -- fmv.d rd, rs is pseudo op fsgnj.d rd, rs, rs + | isFloatOp o1 && isFloatOp o2 && isSingleOp o2 -> op2 (text "\tfmv.s") o1 o2 -- fmv.s rd, rs is pseudo op fsgnj.s rd, rs, rs + | isFloatOp o1 && isImmZero o2 && isDoubleOp o1 -> op2 (text "\tfcvt.d.w") o1 zero + | isFloatOp o1 && isImmZero o2 && isSingleOp o1 -> op2 (text "\tfcvt.s.w") o1 zero + | isFloatOp o1 && not (isFloatOp o2) && isSingleOp o1 -> op2 (text "\tfmv.w.x") o1 o2 + | isFloatOp o1 && not (isFloatOp o2) && isDoubleOp o1 -> op2 (text "\tfmv.d.x") o1 o2 + | not (isFloatOp o1) && isFloatOp o2 && isSingleOp o2 -> op2 (text "\tfmv.x.w") o1 o2 + | not (isFloatOp o1) && isFloatOp o2 && isDoubleOp o2 -> op2 (text "\tfmv.x.d") o1 o2 + | (OpImm (ImmInteger i)) <- o2 + , fitsIn12bitImm i + -> lines_ [ text "\taddi" <+> pprOp platform o1 <> comma <+> pprOp platform x0 <> comma <+> pprOp platform o2 ] + | (OpImm (ImmInt i)) <- o2 + , fitsIn12bitImm i + -> lines_ [ text "\taddi" <+> pprOp platform o1 <> comma <+> pprOp platform x0 <> comma <+> pprOp platform o2 ] + | (OpImm (ImmInteger i)) <- o2 + , fitsIn32bits i + -> lines_ [ text "\tlui" <+> pprOp platform o1 <> comma <+> text "%hi(" <> pprOp platform o2 <> text ")" + , text "\taddw" <+> pprOp platform o1 <> comma <+> pprOp platform o1 <> comma <+> text "%lo(" <> pprOp platform o2 <> text ")" ] + | (OpImm (ImmInt i)) <- o2 + , fitsIn32bits i + -> lines_ [ text "\tlui" <+> pprOp platform o1 <> comma <+> text "%hi(" 
<> pprOp platform o2 <> text ")" + , text "\taddw" <+> pprOp platform o1 <> comma <+> pprOp platform o1 <> comma <+> text "%lo(" <> pprOp platform o2 <> text ")" ] + | isImmOp o2 + -- Surrender! Let the assembler figure out the right expressions with pseudo-op LI. + -> lines_ [ text "\tli" <+> pprOp platform o1 <> comma <+> pprOp platform o2 ] + | otherwise -> op3 (text "\taddi") o1 o2 (OpImm (ImmInt 0)) + ORI o1 o2 o3 -> op3 (text "\tori") o1 o2 o3 + XORI o1 o2 o3 -> op3 (text "\txori") o1 o2 o3 + + -- 4. Branch Instructions ---------------------------------------------------- + J t -> pprInstr platform (B t) + J_TBL _ _ r -> pprInstr platform (J (TReg r)) + -- TODO: This is odd: (B)ranch and branch and link (BL) do the same: branch and link + B l | isLabel l -> lines_ [ text "\tla" <+> pprOp platform ip <> comma <+> getLabel platform l + , text "\tjalr" <+> text "x0" <> comma <+> pprOp platform ip <> comma <+> text "0" ] + B (TReg r) -> line $ text "\tjalr" <+> text "x0" <> comma <+> pprReg W64 r <> comma <+> text "0" + + BL l _ _ | isLabel l-> line $ text "\tcall" <+> getLabel platform l + BL (TReg r) _ _ -> line $ text "\tjalr" <+> text "x1" <> comma <+> pprReg W64 r <> comma <+> text "0" + + BCOND c l r t | isLabel t -> + line $ text "\t" <> pprBcond c <+> pprOp platform l <> comma <+> pprOp platform r <> comma <+> getLabel platform t + + BCOND _ _ _ (TReg _) -> panic "RV64.ppr: No conditional branching to registers!" + + -- 5. Atomic Instructions ---------------------------------------------------- + -- 6. Conditional Instructions ----------------------------------------------- + CSET o l r c -> case c of + EQ | isIntOp l && isIntOp r -> lines_ [ subFor l r + , text "\tseqz" <+> pprOp platform o <> comma <+> pprOp platform o] + EQ | isFloatOp l && isFloatOp r -> line $ binOp ("\tfeq." 
++ floatOpPrecision platform l r) + NE | isIntOp l && isIntOp r -> lines_ [ subFor l r + , text "\tsnez" <+> pprOp platform o <> comma <+> pprOp platform o] + -- feq.s a0,fa0,fa1 + -- xori a0,a0,1 + NE | isFloatOp l && isFloatOp r -> lines_ [binOp ("\tfeq." ++ floatOpPrecision platform l r) + , text "\txori" <+> pprOp platform o <> comma <+> pprOp platform o <> comma <+> text "1"] + SLT -> lines_ [ sltFor l r <+> pprOp platform o <> comma <+> pprOp platform l <> comma <+> pprOp platform r ] + SLE -> lines_ [ sltFor l r <+> pprOp platform o <> comma <+> pprOp platform r <> comma <+> pprOp platform l + , text "\txori" <+> pprOp platform o <> comma <+> pprOp platform o <> comma <+> text "1" ] + SGE -> lines_ [ sltFor l r <+> pprOp platform o <> comma <+> pprOp platform l <> comma <+> pprOp platform r + , text "\txori" <+> pprOp platform o <> comma <+> pprOp platform o <> comma <+> text "1" ] + SGT -> lines_ [ sltFor l r <+> pprOp platform o <> comma <+> pprOp platform r <> comma <+> pprOp platform l ] + ULT -> lines_ [ sltuFor l r <+> pprOp platform o <> comma <+> pprOp platform l <> comma <+> pprOp platform r ] + ULE -> lines_ [ sltuFor l r <+> pprOp platform o <> comma <+> pprOp platform r <> comma <+> pprOp platform l + , text "\txori" <+> pprOp platform o <> comma <+> pprOp platform o <> comma <+> text "1" ] + UGE -> lines_ [ sltuFor l r <+> pprOp platform o <> comma <+> pprOp platform l <> comma <+> pprOp platform r + , text "\txori" <+> pprOp platform o <> comma <+> pprOp platform o <> comma <+> text "1" ] + UGT -> lines_ [ sltuFor l r <+> pprOp platform o <> comma <+> pprOp platform r <> comma <+> pprOp platform l ] + FLT | isFloatOp l && isFloatOp r -> line $ binOp ("\tflt." ++ floatOpPrecision platform l r) + FLE | isFloatOp l && isFloatOp r -> line $ binOp ("\tfle." ++ floatOpPrecision platform l r) + FGT | isFloatOp l && isFloatOp r -> line $ binOp ("\tfgt." ++ floatOpPrecision platform l r) + FGE | isFloatOp l && isFloatOp r -> line $ binOp ("\tfge." 
++ floatOpPrecision platform l r) + x -> pprPanic "RV64.ppr: unhandled CSET conditional" (text (show x) <+> pprOp platform o <> comma <+> pprOp platform r <> comma <+> pprOp platform l) + where + subFor l r | (OpImm _) <- r = text "\taddi" <+> pprOp platform o <> comma <+> pprOp platform l <> comma <+> pprOp platform (negOp r) + | (OpImm _) <- l = panic "RV64.ppr: Cannot SUB IMM _" + | otherwise = text "\tsub" <+> pprOp platform o <> comma <+> pprOp platform l <> comma <+> pprOp platform r + sltFor l r | (OpImm _) <- r = text "\tslti" + | (OpImm _) <- l = panic "PV64.ppr: Cannot SLT IMM _" + | otherwise = text "\tslt" + sltuFor l r| (OpImm _) <- r = text "\tsltui" + | (OpImm _) <- l = panic "PV64.ppr: Cannot SLTU IMM _" + | otherwise = text "\tsltu" + binOp :: (IsLine doc) => String -> doc + binOp op = text op <+> pprOp platform o <> comma <+> pprOp platform l <> comma <+> pprOp platform r + + -- 7. Load and Store Instructions -------------------------------------------- + -- NOTE: GHC may do whacky things where it only load the lower part of an + -- address. Not observing the correct size when loading will lead + -- inevitably to crashes. + STR II8 o1 o2 -> op2 (text "\tsb") o1 o2 + STR II16 o1 o2 -> op2 (text "\tsh") o1 o2 + STR II32 o1 o2 -> op2 (text "\tsw") o1 o2 + STR II64 o1 o2 -> op2 (text "\tsd") o1 o2 + STR FF32 o1 o2 -> op2 (text "\tfsw") o1 o2 + STR FF64 o1 o2 -> op2 (text "\tfsd") o1 o2 + + LDR _f o1 (OpImm (ImmIndex lbl off)) -> + lines_ [ text "\tla" <+> pprOp platform o1 <> comma <+> pprAsmLabel platform lbl + , text "\taddi" <+> pprOp platform o1 <> comma <+> pprOp platform o1 <> comma <+> int off + ] + + LDR _f o1 (OpImm (ImmCLbl lbl)) -> + -- fixing this is _really_ annoyin we need to generate code like: + -- 1: auipc x16, %pcrel_hi(<lbl>) + -- addi x16, x16, %pcrel_lo(1b) + -- I really dislike this (refer back to label 1 syntax from the assembler.) + -- + -- So we'll go with pseudo ops. la and li it is. 
+ -- op_adrp o1 (text "%pcrel_hi(" <> pprAsmLabel platform lbl <> text ")") $$ + -- op_add o1 (text "%pcrel_lo(" <> pprAsmLabel platform lbl <> text ")") + line $ text "\tla" <+> pprOp platform o1 <> comma <+> pprAsmLabel platform lbl + + LDR II8 o1 o2 -> op2 (text "\tlb") o1 o2 + LDR II16 o1 o2 -> op2 (text "\tlh") o1 o2 + LDR II32 o1 o2 -> op2 (text "\tlw") o1 o2 + LDR II64 o1 o2 -> op2 (text "\tld") o1 o2 + LDR FF32 o1 o2 -> op2 (text "\tflw") o1 o2 + LDR FF64 o1 o2 -> op2 (text "\tfld") o1 o2 + + LDRU II8 o1 o2 -> op2 (text "\tlbu") o1 o2 + LDRU II16 o1 o2 -> op2 (text "\tlhu") o1 o2 + LDRU II32 o1 o2 -> op2 (text "\tlwu") o1 o2 + -- double words (64bit) cannot be sign extended by definition + LDRU II64 o1 o2 -> op2 (text "\tld") o1 o2 + LDRU FF32 o1 o2@(OpAddr (AddrReg _)) -> op2 (text "\tflw") o1 o2 + LDRU FF32 o1 o2@(OpAddr (AddrRegImm _ _)) -> op2 (text "\tflw") o1 o2 + LDRU FF64 o1 o2@(OpAddr (AddrReg _)) -> op2 (text "\tfld") o1 o2 + LDRU FF64 o1 o2@(OpAddr (AddrRegImm _ _)) -> op2 (text "\tfld") o1 o2 + LDRU f o1 o2 -> pprPanic "Unsupported unsigned load" ((text.show) f <+> pprOp platform o1 <+> pprOp platform o2) + + -- 8. Synchronization Instructions ------------------------------------------- + DMBSY r w -> line $ text "\tfence" <+> pprDmbType r <> char ',' <+> pprDmbType w + + -- 9. 
Floating Point Instructions -------------------------------------------- + FCVT o1@(OpReg W32 _) o2@(OpReg W64 _) -> op2 (text "\tfcvt.s.d") o1 o2 + FCVT o1@(OpReg W64 _) o2@(OpReg W32 _) -> op2 (text "\tfcvt.d.s") o1 o2 + FCVT o1 o2 -> pprPanic "RV64.pprInstr - impossible float conversion" $ + line (pprOp platform o1 <> text "->" <> pprOp platform o2) + + SCVTF o1@(OpReg W32 _) o2@(OpReg W32 _) -> op2 (text "\tfcvt.s.w") o1 o2 + SCVTF o1@(OpReg W32 _) o2@(OpReg W64 _) -> op2 (text "\tfcvt.s.l") o1 o2 + SCVTF o1@(OpReg W64 _) o2@(OpReg W32 _) -> op2 (text "\tfcvt.d.w") o1 o2 + SCVTF o1@(OpReg W64 _) o2@(OpReg W64 _) -> op2 (text "\tfcvt.d.l") o1 o2 + SCVTF o1 o2 -> pprPanic "RV64.pprInstr - impossible integer to float conversion" $ + line (pprOp platform o1 <> text "->" <> pprOp platform o2) + + FCVTZS o1@(OpReg W32 _) o2@(OpReg W32 _) -> op2 (text "\tfcvt.w.s") o1 o2 + FCVTZS o1@(OpReg W32 _) o2@(OpReg W64 _) -> op2 (text "\tfcvt.w.d") o1 o2 + FCVTZS o1@(OpReg W64 _) o2@(OpReg W32 _) -> op2 (text "\tfcvt.l.s") o1 o2 + FCVTZS o1@(OpReg W64 _) o2@(OpReg W64 _) -> op2 (text "\tfcvt.l.d") o1 o2 + FCVTZS o1 o2 -> pprPanic "RV64.pprInstr - impossible float to integer conversion" $ + line (pprOp platform o1 <> text "->" <> pprOp platform o2) + + FABS o1 o2 | isSingleOp o2 -> op2 (text "\tfabs.s") o1 o2 + FABS o1 o2 | isDoubleOp o2 -> op2 (text "\tfabs.d") o1 o2 + instr -> panic $ "RV64.pprInstr - Unknown instruction: " ++ instrCon instr + where op2 op o1 o2 = line $ op <+> pprOp platform o1 <> comma <+> pprOp platform o2 + op3 op o1 o2 o3 = line $ op <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3 + pprDmbType DmbRead = text "r" + pprDmbType DmbWrite = text "w" + pprDmbType DmbReadWrite = text "rw" + +floatOpPrecision :: Platform -> Operand -> Operand -> String +floatOpPrecision _p l r | isFloatOp l && isFloatOp r && isSingleOp l && isSingleOp r = "s" -- single precision +floatOpPrecision _p l r | isFloatOp l && isFloatOp r && 
isDoubleOp l && isDoubleOp r = "d" -- double precision +floatOpPrecision p l r = pprPanic "Cannot determine floating point precission" (text "op1" <+> pprOp p l <+> text "op2" <+> pprOp p r) + +pprBcond :: (IsLine doc) => Cond -> doc +pprBcond c = text "b" <> pprCond c + where + pprCond :: (IsLine doc) => Cond -> doc + pprCond c = case c of + EQ -> text "eq" + NE -> text "ne" + SLT -> text "lt" + SLE -> text "le" + SGE -> text "ge" + SGT -> text "gt" + ULT -> text "ltu" + ULE -> text "leu" + UGE -> text "geu" + UGT -> text "gtu" + -- BCOND cannot handle floating point comparisons / registers + _ -> panic $ "RV64.ppr: unhandled BCOND conditional: " ++ show c Index: ghc-9.8.2/compiler/GHC/CmmToAsm/RV64/RegInfo.hs =================================================================== --- /dev/null +++ ghc-9.8.2/compiler/GHC/CmmToAsm/RV64/RegInfo.hs @@ -0,0 +1,31 @@ +module GHC.CmmToAsm.RV64.RegInfo where + +import GHC.Prelude + +import GHC.CmmToAsm.RV64.Instr +import GHC.Cmm.BlockId +import GHC.Cmm + +import GHC.Utils.Outputable + +data JumpDest = DestBlockId BlockId + +-- Debug Instance +instance Outputable JumpDest where + ppr (DestBlockId bid) = text "jd<blk>:" <> ppr bid + +-- TODO: documen what this does. See Ticket 19914 +getJumpDestBlockId :: JumpDest -> Maybe BlockId +getJumpDestBlockId (DestBlockId bid) = Just bid + +-- TODO: document what this does. See Ticket 19914 +canShortcut :: Instr -> Maybe JumpDest +canShortcut _ = Nothing + +-- TODO: document what this does. See Ticket 19914 +shortcutStatics :: (BlockId -> Maybe JumpDest) -> RawCmmStatics -> RawCmmStatics +shortcutStatics _ other_static = other_static + +-- TODO: document what this does. 
See Ticket 19914 +shortcutJump :: (BlockId -> Maybe JumpDest) -> Instr -> Instr +shortcutJump _ other = other \ No newline at end of file Index: ghc-9.8.2/compiler/GHC/CmmToAsm/RV64/Regs.hs =================================================================== --- /dev/null +++ ghc-9.8.2/compiler/GHC/CmmToAsm/RV64/Regs.hs @@ -0,0 +1,226 @@ +module GHC.CmmToAsm.RV64.Regs where + +import GHC.Prelude +import GHC.Data.FastString + +import GHC.Platform.Reg +import GHC.Platform.Reg.Class +import GHC.CmmToAsm.Format + +import GHC.Cmm +import GHC.Cmm.CLabel ( CLabel ) +import GHC.Types.Unique + +import GHC.Platform.Regs +import GHC.Utils.Outputable +import GHC.Utils.Panic +import GHC.Platform + +-- * Registers + +-- | First integer register number. @zero@ register. +x0RegNo :: RegNo +x0RegNo = 0 + +x5RegNo, t0RegNo :: RegNo +x5RegNo = 5 +t0RegNo = x5RegNo + +x7RegNo, t2RegNo :: RegNo +x7RegNo = 7 +t2RegNo = x7RegNo + +x28RegNo, t3RegNo :: RegNo +x28RegNo = 28 +t3RegNo = x28RegNo + +-- | Last integer register number. Used as IP register. +x31RegNo, t6RegNo, ipRegNo :: RegNo +x31RegNo = 31 +t6RegNo = x31RegNo +ipRegNo = x31RegNo + +-- | First floating point register. +d0RegNo, ft0RegNo :: RegNo +d0RegNo = 32 +ft0RegNo = d0RegNo + +d7RegNo, ft7RegNo :: RegNo +d7RegNo = 39 +ft7RegNo = d7RegNo + +-- | Last floating point register. +d31RegNo :: RegNo +d31RegNo = 63 + +a0RegNo, x10RegNo :: RegNo +x10RegNo = 10 +a0RegNo = x10RegNo + +a7RegNo, x17RegNo :: RegNo +x17RegNo = 17 +a7RegNo = x17RegNo + +fa0RegNo, d10RegNo :: RegNo +d10RegNo = 42 +fa0RegNo = d10RegNo + +fa7RegNo, d17RegNo :: RegNo +d17RegNo = 49 +fa7RegNo = d17RegNo + +-- Note [The made-up RISCV64 IP register] +-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +-- +-- RISCV64 has no inter-procedural register in its ABI. However, we need one to +-- make register spills/loads to/from high number slots. I.e. slot numbers that +-- do not fit in a 12bit integer which is used as immediate in the arithmetic +-- operations. 
Thus, we're marking one additional register (x31) as permanently +-- non-free and call it IP. +-- +-- IP can be used as temporary register in all operations. Just be aware that it +-- may be clobbered as soon as you loose direct control over it (i.e. using IP +-- by-passes the register allocation/spilling mechanisms.) It should be fine to +-- use it as temporary register in a MachOp translation as long as you don't +-- rely on its value beyond this limited scope. +-- +-- X31 is a caller-saved register. I.e. there are no guarantees about what the +-- callee does with it. That's exactly what we want here. + +zeroReg, raReg, spMachReg, ipReg :: Reg +zeroReg = regSingle x0RegNo +raReg = regSingle 1 +-- | Not to be confused with the `CmmReg` `spReg` +spMachReg = regSingle 2 +ipReg = regSingle ipRegNo + +-- | All machine register numbers. +allMachRegNos :: [RegNo] +allMachRegNos = intRegs ++ fpRegs + where + intRegs = [x0RegNo .. x31RegNo] + fpRegs = [d0RegNo .. d31RegNo] + +-- | Registers available to the register allocator. +-- +-- These are all registers minus those with a fixed role in RISCV ABI (zero, lr, +-- sp, gp, tp, fp, ip) and GHC RTS (Base, Sp, Hp, HpLim, R1..R8, F1..F6, +-- D1..D6.) +allocatableRegs :: Platform -> [RealReg] +allocatableRegs platform = + let isFree = freeReg platform + in map RealRegSingle $ filter isFree allMachRegNos + +-- | Integer argument registers according to the calling convention +allGpArgRegs :: [Reg] +allGpArgRegs = map regSingle [a0RegNo .. a7RegNo] + +-- | Floating point argument registers according to the calling convention +allFpArgRegs :: [Reg] +allFpArgRegs = map regSingle [fa0RegNo .. fa7RegNo] + +-- * Addressing modes + +-- TODO: AddReg seems to be just a special case of AddrRegImm. Maybe we should +-- replace it with AddrRegImm having an Imm of 0. +-- | Addressing modes +data AddrMode + = -- | A register plus some integer, e.g. @8(sp)@ or @-16(sp)@. The offset + -- needs to fit into 12bits. 
+ AddrRegImm Reg Imm + | -- | A register + AddrReg Reg + deriving (Eq, Show) + +-- * Immediates + +data Imm + = ImmInt Int + | ImmInteger Integer -- Sigh. + | ImmCLbl CLabel -- AbstractC Label (with baggage) + | ImmLit FastString + | ImmIndex CLabel Int + | ImmFloat Rational + | ImmDouble Rational + | ImmConstantSum Imm Imm + | ImmConstantDiff Imm Imm + deriving (Eq, Show) + +litToImm :: CmmLit -> Imm +litToImm (CmmInt i w) = ImmInteger (narrowS w i) +-- narrow to the width: a CmmInt might be out of +-- range, but we assume that ImmInteger only contains +-- in-range values. A signed value should be fine here. +litToImm (CmmFloat f W32) = ImmFloat f +litToImm (CmmFloat f W64) = ImmDouble f +litToImm (CmmLabel l) = ImmCLbl l +litToImm (CmmLabelOff l off) = ImmIndex l off +litToImm (CmmLabelDiffOff l1 l2 off _) = + ImmConstantSum + (ImmConstantDiff (ImmCLbl l1) (ImmCLbl l2)) + (ImmInt off) +litToImm l = panic $ "RV64.Regs.litToImm: no match for " ++ show l + +-- == To satisfy GHC.CmmToAsm.Reg.Target ======================================= + +-- squeese functions for the graph allocator ----------------------------------- +-- | regSqueeze_class reg +-- Calculate the maximum number of register colors that could be +-- denied to a node of this class due to having this reg +-- as a neighbour. 
+-- +{-# INLINE virtualRegSqueeze #-} +virtualRegSqueeze :: RegClass -> VirtualReg -> Int +virtualRegSqueeze cls vr + = case cls of + RcInteger + -> case vr of + VirtualRegI{} -> 1 + VirtualRegHi{} -> 1 + _other -> 0 + + RcDouble + -> case vr of + VirtualRegD{} -> 1 + VirtualRegF{} -> 0 + _other -> 0 + + _other -> 0 + +{-# INLINE realRegSqueeze #-} +realRegSqueeze :: RegClass -> RealReg -> Int +realRegSqueeze cls rr = + case cls of + RcInteger -> + case rr of + RealRegSingle regNo + | regNo < d0RegNo -> 1 + | otherwise -> 0 + RcDouble -> + case rr of + RealRegSingle regNo + | regNo < d0RegNo -> 0 + | otherwise -> 1 + _other -> 0 + +mkVirtualReg :: Unique -> Format -> VirtualReg +mkVirtualReg u format + | not (isFloatFormat format) = VirtualRegI u + | otherwise + = case format of + FF32 -> VirtualRegD u + FF64 -> VirtualRegD u + _ -> panic "RV64.mkVirtualReg" + +{-# INLINE classOfRealReg #-} +classOfRealReg :: RealReg -> RegClass +classOfRealReg (RealRegSingle i) + | i < d0RegNo = RcInteger + | otherwise = RcDouble + +regDotColor :: RealReg -> SDoc +regDotColor reg + = case classOfRealReg reg of + RcInteger -> text "blue" + RcFloat -> text "red" + RcDouble -> text "green" Index: ghc-9.8.2/compiler/GHC/CmmToAsm/Reg/Graph/TrivColorable.hs =================================================================== --- ghc-9.8.2.orig/compiler/GHC/CmmToAsm/Reg/Graph/TrivColorable.hs +++ ghc-9.8.2/compiler/GHC/CmmToAsm/Reg/Graph/TrivColorable.hs @@ -115,7 +115,7 @@ trivColorable platform virtualRegSqueeze ArchMipseb -> panic "trivColorable ArchMipseb" ArchMipsel -> panic "trivColorable ArchMipsel" ArchS390X -> panic "trivColorable ArchS390X" - ArchRISCV64 -> panic "trivColorable ArchRISCV64" + ArchRISCV64 -> 14 ArchLoongArch64->panic "trivColorable ArchLoongArch64" ArchJavaScript-> panic "trivColorable ArchJavaScript" ArchWasm32 -> panic "trivColorable ArchWasm32" @@ -150,7 +150,7 @@ trivColorable platform virtualRegSqueeze ArchMipseb -> panic "trivColorable ArchMipseb" 
ArchMipsel -> panic "trivColorable ArchMipsel" ArchS390X -> panic "trivColorable ArchS390X" - ArchRISCV64 -> panic "trivColorable ArchRISCV64" + ArchRISCV64 -> 0 ArchLoongArch64->panic "trivColorable ArchLoongArch64" ArchJavaScript-> panic "trivColorable ArchJavaScript" ArchWasm32 -> panic "trivColorable ArchWasm32" @@ -184,7 +184,7 @@ trivColorable platform virtualRegSqueeze ArchMipseb -> panic "trivColorable ArchMipseb" ArchMipsel -> panic "trivColorable ArchMipsel" ArchS390X -> panic "trivColorable ArchS390X" - ArchRISCV64 -> panic "trivColorable ArchRISCV64" + ArchRISCV64 -> 20 ArchLoongArch64->panic "trivColorable ArchLoongArch64" ArchJavaScript-> panic "trivColorable ArchJavaScript" ArchWasm32 -> panic "trivColorable ArchWasm32" Index: ghc-9.8.2/compiler/GHC/CmmToAsm/Reg/Linear.hs =================================================================== --- ghc-9.8.2.orig/compiler/GHC/CmmToAsm/Reg/Linear.hs +++ ghc-9.8.2/compiler/GHC/CmmToAsm/Reg/Linear.hs @@ -114,6 +114,7 @@ import qualified GHC.CmmToAsm.Reg.Linear import qualified GHC.CmmToAsm.Reg.Linear.X86 as X86 import qualified GHC.CmmToAsm.Reg.Linear.X86_64 as X86_64 import qualified GHC.CmmToAsm.Reg.Linear.AArch64 as AArch64 +import qualified GHC.CmmToAsm.Reg.Linear.RV64 as RV64 import GHC.CmmToAsm.Reg.Target import GHC.CmmToAsm.Reg.Liveness import GHC.CmmToAsm.Reg.Utils @@ -223,7 +224,7 @@ linearRegAlloc config entry_ids block_li ArchAlpha -> panic "linearRegAlloc ArchAlpha" ArchMipseb -> panic "linearRegAlloc ArchMipseb" ArchMipsel -> panic "linearRegAlloc ArchMipsel" - ArchRISCV64 -> panic "linearRegAlloc ArchRISCV64" + ArchRISCV64 -> go $ (frInitFreeRegs platform :: RV64.FreeRegs) ArchLoongArch64-> panic "linearRegAlloc ArchLoongArch64" ArchJavaScript -> panic "linearRegAlloc ArchJavaScript" ArchWasm32 -> panic "linearRegAlloc ArchWasm32" Index: ghc-9.8.2/compiler/GHC/CmmToAsm/Reg/Linear/FreeRegs.hs =================================================================== --- 
ghc-9.8.2.orig/compiler/GHC/CmmToAsm/Reg/Linear/FreeRegs.hs +++ ghc-9.8.2/compiler/GHC/CmmToAsm/Reg/Linear/FreeRegs.hs @@ -29,10 +29,12 @@ import qualified GHC.CmmToAsm.Reg.Linear import qualified GHC.CmmToAsm.Reg.Linear.X86 as X86 import qualified GHC.CmmToAsm.Reg.Linear.X86_64 as X86_64 import qualified GHC.CmmToAsm.Reg.Linear.AArch64 as AArch64 +import qualified GHC.CmmToAsm.Reg.Linear.RV64 as RV64 import qualified GHC.CmmToAsm.PPC.Instr as PPC.Instr import qualified GHC.CmmToAsm.X86.Instr as X86.Instr import qualified GHC.CmmToAsm.AArch64.Instr as AArch64.Instr +import qualified GHC.CmmToAsm.RV64.Instr as RV64.Instr class Show freeRegs => FR freeRegs where frAllocateReg :: Platform -> RealReg -> freeRegs -> freeRegs @@ -64,6 +66,12 @@ instance FR AArch64.FreeRegs where frInitFreeRegs = AArch64.initFreeRegs frReleaseReg = \_ -> AArch64.releaseReg +instance FR RV64.FreeRegs where + frAllocateReg = \_ -> RV64.allocateReg + frGetFreeRegs = \_ -> RV64.getFreeRegs + frInitFreeRegs = RV64.initFreeRegs + frReleaseReg = \_ -> RV64.releaseReg + maxSpillSlots :: NCGConfig -> Int maxSpillSlots config = case platformArch (ncgPlatform config) of ArchX86 -> X86.Instr.maxSpillSlots config @@ -76,7 +84,7 @@ maxSpillSlots config = case platformArch ArchAlpha -> panic "maxSpillSlots ArchAlpha" ArchMipseb -> panic "maxSpillSlots ArchMipseb" ArchMipsel -> panic "maxSpillSlots ArchMipsel" - ArchRISCV64 -> panic "maxSpillSlots ArchRISCV64" + ArchRISCV64 -> RV64.Instr.maxSpillSlots config ArchLoongArch64->panic "maxSpillSlots ArchLoongArch64" ArchJavaScript-> panic "maxSpillSlots ArchJavaScript" ArchWasm32 -> panic "maxSpillSlots ArchWasm32" Index: ghc-9.8.2/compiler/GHC/CmmToAsm/Reg/Linear/RV64.hs =================================================================== --- /dev/null +++ ghc-9.8.2/compiler/GHC/CmmToAsm/Reg/Linear/RV64.hs @@ -0,0 +1,65 @@ +module GHC.CmmToAsm.Reg.Linear.RV64 where + +import GHC.Prelude + +import GHC.CmmToAsm.RV64.Regs +import GHC.Platform.Reg.Class +import 
GHC.Platform.Reg + +import GHC.Utils.Outputable +import GHC.Utils.Panic +import GHC.Platform + +import Data.Word + +import GHC.Stack + +data FreeRegs = FreeRegs !Word32 !Word32 + +instance Show FreeRegs where + show (FreeRegs g f) = "FreeRegs: " ++ showBits g ++ "; " ++ showBits f + +instance Outputable FreeRegs where + ppr (FreeRegs g f) = text " " <+> foldr (\i x -> pad_int i <+> x) (text "") [0..31] + $$ text "GPR" <+> foldr (\i x -> show_bit g i <+> x) (text "") [0..31] + $$ text "FPR" <+> foldr (\i x -> show_bit f i <+> x) (text "") [0..31] + where pad_int i | i < 10 = char ' ' <> int i + pad_int i = int i + -- remember bit = 1 means it's available. + show_bit bits bit | testBit bits bit = text " " + show_bit _ _ = text " x" + +noFreeRegs :: FreeRegs +noFreeRegs = FreeRegs 0 0 + +showBits :: Word32 -> String +showBits w = map (\i -> if testBit w i then '1' else '0') [0..31] + +-- FR instance implementation (See Linear.FreeRegs) +allocateReg :: HasCallStack => RealReg -> FreeRegs -> FreeRegs +allocateReg (RealRegSingle r) (FreeRegs g f) + | r > 31 && testBit f (r - 32) = FreeRegs g (clearBit f (r - 32)) + | r < 32 && testBit g r = FreeRegs (clearBit g r) f + | r > 31 = panic $ "Linear.RV64.allocReg: double allocation of float reg v" ++ show (r - 32) ++ "; " ++ showBits f + | otherwise = pprPanic "Linear.RV64.allocReg" $ text ("double allocation of gp reg x" ++ show r ++ "; " ++ showBits g) + +-- For LLVM Interop, see https://github.com/llvm/llvm-project/blob/6ab900f8746e7d8e24afafb5886a40801f6799f4/llvm/lib/Target/RISCV/RISCVISelLowering.cpp#L13638-L13685 +getFreeRegs :: RegClass -> FreeRegs -> [RealReg] +getFreeRegs cls (FreeRegs g f) + | RcFloat <- cls = [] -- For now we only support double and integer registers, floats will need to be promoted. + | RcDouble <- cls = go 32 f [0..31] + | RcInteger <- cls = go 0 g ([5..7] ++ [10..17] ++ [28..31]) + where + go _ _ [] = [] + go off x (i:is) | testBit x i = RealRegSingle (off + i) : (go off x $! 
is) + | otherwise = go off x $! is + +initFreeRegs :: Platform -> FreeRegs +initFreeRegs platform = foldl' (flip releaseReg) noFreeRegs (allocatableRegs platform) + +releaseReg :: HasCallStack => RealReg -> FreeRegs -> FreeRegs +releaseReg (RealRegSingle r) (FreeRegs g f) + | r > 31 && testBit f (r - 32) = pprPanic "Linear.RV64.releaseReg" (text "can't release non-allocated reg v" <> int (r - 32)) + | r < 32 && testBit g r = pprPanic "Linear.RV64.releaseReg" (text "can't release non-allocated reg x" <> int r) + | r > 31 = FreeRegs g (setBit f (r - 32)) + | otherwise = FreeRegs (setBit g r) f \ No newline at end of file Index: ghc-9.8.2/compiler/GHC/CmmToAsm/Reg/Target.hs =================================================================== --- ghc-9.8.2.orig/compiler/GHC/CmmToAsm/Reg/Target.hs +++ ghc-9.8.2/compiler/GHC/CmmToAsm/Reg/Target.hs @@ -34,7 +34,7 @@ import qualified GHC.CmmToAsm.X86.Regs import qualified GHC.CmmToAsm.X86.RegInfo as X86 import qualified GHC.CmmToAsm.PPC.Regs as PPC import qualified GHC.CmmToAsm.AArch64.Regs as AArch64 - +import qualified GHC.CmmToAsm.RV64.Regs as RV64 targetVirtualRegSqueeze :: Platform -> RegClass -> VirtualReg -> Int targetVirtualRegSqueeze platform @@ -49,7 +49,7 @@ targetVirtualRegSqueeze platform ArchAlpha -> panic "targetVirtualRegSqueeze ArchAlpha" ArchMipseb -> panic "targetVirtualRegSqueeze ArchMipseb" ArchMipsel -> panic "targetVirtualRegSqueeze ArchMipsel" - ArchRISCV64 -> panic "targetVirtualRegSqueeze ArchRISCV64" + ArchRISCV64 -> RV64.virtualRegSqueeze ArchLoongArch64->panic "targetVirtualRegSqueeze ArchLoongArch64" ArchJavaScript-> panic "targetVirtualRegSqueeze ArchJavaScript" ArchWasm32 -> panic "targetVirtualRegSqueeze ArchWasm32" @@ -69,7 +69,7 @@ targetRealRegSqueeze platform ArchAlpha -> panic "targetRealRegSqueeze ArchAlpha" ArchMipseb -> panic "targetRealRegSqueeze ArchMipseb" ArchMipsel -> panic "targetRealRegSqueeze ArchMipsel" - ArchRISCV64 -> panic "targetRealRegSqueeze ArchRISCV64" + ArchRISCV64 
-> RV64.realRegSqueeze ArchLoongArch64->panic "targetRealRegSqueeze ArchLoongArch64" ArchJavaScript-> panic "targetRealRegSqueeze ArchJavaScript" ArchWasm32 -> panic "targetRealRegSqueeze ArchWasm32" @@ -88,7 +88,7 @@ targetClassOfRealReg platform ArchAlpha -> panic "targetClassOfRealReg ArchAlpha" ArchMipseb -> panic "targetClassOfRealReg ArchMipseb" ArchMipsel -> panic "targetClassOfRealReg ArchMipsel" - ArchRISCV64 -> panic "targetClassOfRealReg ArchRISCV64" + ArchRISCV64 -> RV64.classOfRealReg ArchLoongArch64->panic "targetClassOfRealReg ArchLoongArch64" ArchJavaScript-> panic "targetClassOfRealReg ArchJavaScript" ArchWasm32 -> panic "targetClassOfRealReg ArchWasm32" @@ -107,7 +107,7 @@ targetMkVirtualReg platform ArchAlpha -> panic "targetMkVirtualReg ArchAlpha" ArchMipseb -> panic "targetMkVirtualReg ArchMipseb" ArchMipsel -> panic "targetMkVirtualReg ArchMipsel" - ArchRISCV64 -> panic "targetMkVirtualReg ArchRISCV64" + ArchRISCV64 -> RV64.mkVirtualReg ArchLoongArch64->panic "targetMkVirtualReg ArchLoongArch64" ArchJavaScript-> panic "targetMkVirtualReg ArchJavaScript" ArchWasm32 -> panic "targetMkVirtualReg ArchWasm32" @@ -126,7 +126,7 @@ targetRegDotColor platform ArchAlpha -> panic "targetRegDotColor ArchAlpha" ArchMipseb -> panic "targetRegDotColor ArchMipseb" ArchMipsel -> panic "targetRegDotColor ArchMipsel" - ArchRISCV64 -> panic "targetRegDotColor ArchRISCV64" + ArchRISCV64 -> RV64.regDotColor ArchLoongArch64->panic "targetRegDotColor ArchLoongArch64" ArchJavaScript-> panic "targetRegDotColor ArchJavaScript" ArchWasm32 -> panic "targetRegDotColor ArchWasm32" Index: ghc-9.8.2/compiler/GHC/Driver/Backend.hs =================================================================== --- ghc-9.8.2.orig/compiler/GHC/Driver/Backend.hs +++ ghc-9.8.2/compiler/GHC/Driver/Backend.hs @@ -217,6 +217,7 @@ platformNcgSupported platform = if ArchPPC_64 {} -> True ArchAArch64 -> True ArchWasm32 -> True + ArchRISCV64 -> True _ -> False -- | Is the platform supported by the JS 
backend? Index: ghc-9.8.2/compiler/GHC/Driver/DynFlags.hs =================================================================== --- ghc-9.8.2.orig/compiler/GHC/Driver/DynFlags.hs +++ ghc-9.8.2/compiler/GHC/Driver/DynFlags.hs @@ -1330,6 +1330,11 @@ default_PIC platform = -- always generate PIC. See -- #10597 for more -- information. + + -- On RISC-V, we need to always have PIC enabled. Otherwise, the stack + -- protector emits relocations that cannot be resolved by our linker. The + -- addresses for the canary are too high to fit in 32bits. + (OSLinux, ArchRISCV64) -> [Opt_PIC] _ -> [] -- | The language extensions implied by the various language variants. Index: ghc-9.8.2/compiler/GHC/Platform.hs =================================================================== --- ghc-9.8.2.orig/compiler/GHC/Platform.hs +++ ghc-9.8.2/compiler/GHC/Platform.hs @@ -283,7 +283,6 @@ platformHasRTSLinker p = case archOS_arc ArchPPC_64 ELF_V1 -> False -- powerpc64 ArchPPC_64 ELF_V2 -> False -- powerpc64le ArchS390X -> False - ArchRISCV64 -> False ArchLoongArch64 -> False ArchJavaScript -> False ArchWasm32 -> False Index: ghc-9.8.2/compiler/ghc.cabal.in =================================================================== --- ghc-9.8.2.orig/compiler/ghc.cabal.in +++ ghc-9.8.2/compiler/ghc.cabal.in @@ -278,6 +278,7 @@ Library GHC.CmmToAsm.Reg.Linear.FreeRegs GHC.CmmToAsm.Reg.Linear.JoinToTargets GHC.CmmToAsm.Reg.Linear.PPC + GHC.CmmToAsm.Reg.Linear.RV64 GHC.CmmToAsm.Reg.Linear.StackMap GHC.CmmToAsm.Reg.Linear.State GHC.CmmToAsm.Reg.Linear.Stats @@ -286,6 +287,13 @@ Library GHC.CmmToAsm.Reg.Liveness GHC.CmmToAsm.Reg.Target GHC.CmmToAsm.Reg.Utils + GHC.CmmToAsm.RV64 + GHC.CmmToAsm.RV64.CodeGen + GHC.CmmToAsm.RV64.Cond + GHC.CmmToAsm.RV64.Instr + GHC.CmmToAsm.RV64.Ppr + GHC.CmmToAsm.RV64.RegInfo + GHC.CmmToAsm.RV64.Regs GHC.CmmToAsm.Types GHC.CmmToAsm.Utils GHC.CmmToAsm.X86 Index: ghc-9.8.2/hadrian/bindist/config.mk.in =================================================================== --- 
ghc-9.8.2.orig/hadrian/bindist/config.mk.in +++ ghc-9.8.2/hadrian/bindist/config.mk.in @@ -141,7 +141,7 @@ GhcWithSMP := $(strip $(if $(filter YESN # Whether to include GHCi in the compiler. Depends on whether the RTS linker # has support for this OS/ARCH combination. OsSupportsGHCi=$(strip $(patsubst $(TargetOS_CPP), YES, $(findstring $(TargetOS_CPP), mingw32 linux solaris2 freebsd dragonfly netbsd openbsd darwin kfreebsdgnu))) -ArchSupportsGHCi=$(strip $(patsubst $(TargetArch_CPP), YES, $(findstring $(TargetArch_CPP), i386 x86_64 powerpc powerpc64 powerpc64le sparc sparc64 arm aarch64))) +ArchSupportsGHCi=$(strip $(patsubst $(TargetArch_CPP), YES, $(findstring $(TargetArch_CPP), i386 x86_64 powerpc powerpc64 powerpc64le sparc sparc64 arm aarch64 riscv64))) ifeq "$(OsSupportsGHCi)$(ArchSupportsGHCi)" "YESYES" GhcWithInterpreter=YES Index: ghc-9.8.2/hadrian/src/Settings/Builders/RunTest.hs =================================================================== --- ghc-9.8.2.orig/hadrian/src/Settings/Builders/RunTest.hs +++ ghc-9.8.2/hadrian/src/Settings/Builders/RunTest.hs @@ -116,7 +116,14 @@ inTreeCompilerArgs stg = do os <- setting HostOs arch <- setting TargetArch - let codegen_arches = ["x86_64", "i386", "powerpc", "powerpc64", "powerpc64le", "aarch64", "wasm32"] + let codegen_arches = [ "x86_64" + , "i386" + , "powerpc" + , "powerpc64" + , "powerpc64le" + , "aarch64" + , "wasm32" + , "riscv64" ] let withNativeCodeGen | unregisterised = False | arch `elem` codegen_arches = True Index: ghc-9.8.2/rts/LinkerInternals.h =================================================================== --- ghc-9.8.2.orig/rts/LinkerInternals.h +++ ghc-9.8.2/rts/LinkerInternals.h @@ -208,7 +208,7 @@ typedef struct _Segment { int n_sections; } Segment; -#if defined(powerpc_HOST_ARCH) || defined(x86_64_HOST_ARCH) || defined(arm_HOST_ARCH) || defined(aarch64_HOST_ARCH) +#if defined(powerpc_HOST_ARCH) || defined(x86_64_HOST_ARCH) || defined(arm_HOST_ARCH) || defined(aarch64_HOST_ARCH) || 
defined(riscv64_HOST_ARCH) #define NEED_SYMBOL_EXTRAS 1 #endif @@ -237,6 +237,8 @@ typedef struct { uint8_t jumpIsland[8]; #elif defined(arm_HOST_ARCH) uint8_t jumpIsland[16]; +#elif defined(riscv64_HOST_ARCH) + uint64_t addr; #endif } SymbolExtra; Index: ghc-9.8.2/rts/RtsSymbols.c =================================================================== --- ghc-9.8.2.orig/rts/RtsSymbols.c +++ ghc-9.8.2/rts/RtsSymbols.c @@ -974,6 +974,17 @@ extern char **environ; #define RTS_LIBGCC_SYMBOLS #endif +#if defined(riscv64_HOST_ARCH) +// See https://gcc.gnu.org/onlinedocs/gccint/Integer-library-routines.html as +// reference for the following built-ins. __clzdi2 and __ctzdi2 probably relate +// to __builtin-s in libraries/ghc-prim/cbits/ctz.c. +#define RTS_ARCH_LIBGCC_SYMBOLS \ + SymI_NeedsProto(__clzdi2) \ + SymI_NeedsProto(__ctzdi2) +#else +#define RTS_ARCH_LIBGCC_SYMBOLS +#endif + // Symbols defined by libgcc/compiler-rt for AArch64's outline atomics. #if defined(HAVE_ARM_OUTLINE_ATOMICS) #include "ARMOutlineAtomicsSymbols.h" @@ -1026,6 +1037,7 @@ RTS_DARWIN_ONLY_SYMBOLS RTS_OPENBSD_ONLY_SYMBOLS RTS_LIBC_SYMBOLS RTS_LIBGCC_SYMBOLS +RTS_ARCH_LIBGCC_SYMBOLS RTS_FINI_ARRAY_SYMBOLS RTS_LIBFFI_SYMBOLS RTS_ARM_OUTLINE_ATOMIC_SYMBOLS @@ -1068,6 +1080,7 @@ RtsSymbolVal rtsSyms[] = { RTS_DARWIN_ONLY_SYMBOLS RTS_OPENBSD_ONLY_SYMBOLS RTS_LIBGCC_SYMBOLS + RTS_ARCH_LIBGCC_SYMBOLS RTS_FINI_ARRAY_SYMBOLS RTS_LIBFFI_SYMBOLS RTS_ARM_OUTLINE_ATOMIC_SYMBOLS Index: ghc-9.8.2/rts/adjustor/LibffiAdjustor.c =================================================================== --- ghc-9.8.2.orig/rts/adjustor/LibffiAdjustor.c +++ ghc-9.8.2/rts/adjustor/LibffiAdjustor.c @@ -12,6 +12,7 @@ #include "Adjustor.h" #include "rts/ghc_ffi.h" +#include <stdint.h> #include <string.h> // Note that ffi_alloc_prep_closure is a non-standard libffi closure @@ -187,5 +188,20 @@ createAdjustor (int cconv, barf("createAdjustor: failed to allocate memory"); } - return (void*)code; +#if defined(riscv64_HOST_ARCH) + // 
Synchronize the memory and instruction cache to prevent illegal + // instruction exceptions. + + // We expect two instructions for address loading, one for the jump. + int instrCount = 3; + // On Linux the parameters of __builtin___clear_cache are currently unused. + // Add them anyways for future compatibility. (I.e. the parameters couldn't + // be checked during development.) + __builtin___clear_cache((void *)code, + (void *)code + instrCount * sizeof(uint64_t)); + // Memory barrier to ensure nothing circumvents the fence.i / cache flush. + SEQ_CST_FENCE(); +#endif + + return (void *)code; } Index: ghc-9.8.2/rts/linker/Elf.c =================================================================== --- ghc-9.8.2.orig/rts/linker/Elf.c +++ ghc-9.8.2/rts/linker/Elf.c @@ -101,7 +101,7 @@ # include <elf_abi.h> #endif -#if defined(arm_HOST_ARCH) || defined(aarch64_HOST_ARCH) +#if defined(arm_HOST_ARCH) || defined(aarch64_HOST_ARCH) || defined (riscv64_HOST_ARCH) # define NEED_GOT # define NEED_PLT # include "elf_got.h" @@ -430,10 +430,7 @@ ocVerifyImage_ELF ( ObjectCode* oc ) case EM_AARCH64: IF_DEBUG(linker,debugBelch( "aarch64" )); break; #endif #if defined(EM_RISCV) - case EM_RISCV: IF_DEBUG(linker,debugBelch( "riscv" )); - errorBelch("%s: RTS linker not implemented on riscv", - oc->fileName); - return 0; + case EM_RISCV: IF_DEBUG(linker,debugBelch( "riscv" )); break; #endif #if defined(EM_LOONGARCH) case EM_LOONGARCH: IF_DEBUG(linker,debugBelch( "loongarch64" )); @@ -1995,7 +1992,7 @@ ocResolve_ELF ( ObjectCode* oc ) (void) shdr; #endif /* NEED_GOT */ -#if defined(aarch64_HOST_ARCH) +#if defined(aarch64_HOST_ARCH) || defined(riscv64_HOST_ARCH) /* use new relocation design */ if(relocateObjectCode( oc )) return 0; @@ -2018,6 +2015,8 @@ ocResolve_ELF ( ObjectCode* oc ) #if defined(powerpc_HOST_ARCH) ocFlushInstructionCache( oc ); +#elif defined(riscv64_HOST_ARCH) + flushInstructionCache( oc ); #endif return ocMprotect_Elf(oc); Index: ghc-9.8.2/rts/linker/ElfTypes.h 
=================================================================== --- ghc-9.8.2.orig/rts/linker/ElfTypes.h +++ ghc-9.8.2/rts/linker/ElfTypes.h @@ -150,6 +150,7 @@ typedef struct _Stub { void * addr; void * target; + void* got_addr; /* flags can hold architecture specific information they are used during * lookup of stubs as well. Thus two stubs for the same target with * different flags are considered unequal. Index: ghc-9.8.2/rts/linker/SymbolExtras.c =================================================================== --- ghc-9.8.2.orig/rts/linker/SymbolExtras.c +++ ghc-9.8.2/rts/linker/SymbolExtras.c @@ -153,7 +153,7 @@ void ocProtectExtras(ObjectCode* oc) } -#if defined(powerpc_HOST_ARCH) || defined(x86_64_HOST_ARCH) +#if defined(powerpc_HOST_ARCH) || defined(x86_64_HOST_ARCH) || defined(riscv64_HOST_ARCH) SymbolExtra* makeSymbolExtra( ObjectCode const* oc, unsigned long symbolNumber, unsigned long target ) @@ -189,9 +189,12 @@ SymbolExtra* makeSymbolExtra( ObjectCode extra->addr = target; memcpy(extra->jumpIsland, jmp, 8); #endif /* x86_64_HOST_ARCH */ - +#if defined(riscv64_HOST_ARCH) + // Fake GOT entry (used like GOT, but located in symbol extras) + extra->addr = target; +#endif return extra; } -#endif /* powerpc_HOST_ARCH || x86_64_HOST_ARCH */ +#endif /* powerpc_HOST_ARCH || x86_64_HOST_ARCH || riscv64_HOST_ARCH */ #endif /* !x86_64_HOST_ARCH) || !mingw32_HOST_OS */ #endif // NEED_SYMBOL_EXTRAS Index: ghc-9.8.2/rts/linker/SymbolExtras.h =================================================================== --- ghc-9.8.2.orig/rts/linker/SymbolExtras.h +++ ghc-9.8.2/rts/linker/SymbolExtras.h @@ -16,7 +16,7 @@ SymbolExtra* makeArmSymbolExtra( ObjectC unsigned long target, bool fromThumb, bool toThumb ); -#elif defined(powerpc_HOST_ARCH) || defined(x86_64_HOST_ARCH) +#elif defined(powerpc_HOST_ARCH) || defined(x86_64_HOST_ARCH) || defined(riscv64_HOST_ARCH) SymbolExtra* makeSymbolExtra( ObjectCode const* oc, unsigned long symbolNumber, unsigned long target ); 
Index: ghc-9.8.2/rts/linker/elf_got.c =================================================================== --- ghc-9.8.2.orig/rts/linker/elf_got.c +++ ghc-9.8.2/rts/linker/elf_got.c @@ -9,22 +9,20 @@ * Check if we need a global offset table slot for a * given symbol */ -bool -needGotSlot(Elf_Sym * symbol) { - /* using global here should give an upper bound */ - /* I don't believe we need to relocate STB_LOCAL - * symbols via the GOT; however I'm unsure about - * STB_WEAK. - * - * Any more restrictive filter here would result - * in a smaller GOT, which is preferable. - */ - return ELF_ST_BIND(symbol->st_info) == STB_GLOBAL - || ELF_ST_BIND(symbol->st_info) == STB_WEAK - // Section symbols exist primarily for relocation - // and as such may need a GOT slot. - || ELF_ST_TYPE(symbol->st_info) == STT_SECTION; - +bool needGotSlot(Elf_Sym *symbol) { + /* using global here should give an upper bound */ + /* I don't believe we need to relocate STB_LOCAL + * symbols via the GOT; however I'm unsure about + * STB_WEAK. + * + * Any more restrictive filter here would result + * in a smaller GOT, which is preferable. + */ + return ELF_ST_BIND(symbol->st_info) == STB_GLOBAL || + ELF_ST_BIND(symbol->st_info) == STB_WEAK + // Section symbols exist primarily for relocation + // and as such may need a GOT slot. 
+ || ELF_ST_TYPE(symbol->st_info) == STT_SECTION; } bool Index: ghc-9.8.2/rts/linker/elf_plt.c =================================================================== --- ghc-9.8.2.orig/rts/linker/elf_plt.c +++ ghc-9.8.2/rts/linker/elf_plt.c @@ -5,7 +5,7 @@ #include <stdint.h> #include <stdlib.h> -#if defined(arm_HOST_ARCH) || defined(aarch64_HOST_ARCH) +#if defined(arm_HOST_ARCH) || defined(aarch64_HOST_ARCH) || defined(riscv64_HOST_ARCH) #if defined(OBJFORMAT_ELF) #define STRINGIFY(x) #x @@ -49,11 +49,13 @@ findStub(Section * section, bool makeStub(Section * section, void* * addr, + void* got_addr, uint8_t flags) { Stub * s = calloc(1, sizeof(Stub)); ASSERT(s != NULL); s->target = *addr; + s->got_addr = got_addr; s->flags = flags; s->next = NULL; s->addr = (uint8_t *)section->info->stub_offset + 8 Index: ghc-9.8.2/rts/linker/elf_plt.h =================================================================== --- ghc-9.8.2.orig/rts/linker/elf_plt.h +++ ghc-9.8.2/rts/linker/elf_plt.h @@ -4,8 +4,9 @@ #include "elf_plt_arm.h" #include "elf_plt_aarch64.h" +#include "elf_plt_riscv64.h" -#if defined(arm_HOST_ARCH) || defined(aarch64_HOST_ARCH) +#if defined(arm_HOST_ARCH) || defined(aarch64_HOST_ARCH) || defined (riscv64_HOST_ARCH) #if defined(OBJFORMAT_ELF) @@ -21,6 +22,8 @@ #define __suffix__ Arm #elif defined(__mips__) #define __suffix__ Mips +#elif defined(__riscv) +#define __suffix__ RISCV64 #else #error "unknown architecture" #endif @@ -34,10 +37,10 @@ unsigned numberOfStubsForSection( Objec #define STUB_SIZE ADD_SUFFIX(stubSize) bool findStub(Section * section, void* * addr, uint8_t flags); -bool makeStub(Section * section, void* * addr, uint8_t flags); +bool makeStub(Section * section, void* * addr, void* got_addr, uint8_t flags); void freeStubs(Section * section); #endif // OBJECTFORMAT_ELF -#endif // arm/aarch64_HOST_ARCH +#endif // arm/aarch64_HOST_ARCH/riscv64_HOST_ARCH Index: ghc-9.8.2/rts/linker/elf_plt_riscv64.c 
=================================================================== --- /dev/null +++ ghc-9.8.2/rts/linker/elf_plt_riscv64.c @@ -0,0 +1,84 @@ +#include "Rts.h" +#include "elf_compat.h" +#include "elf_plt_riscv64.h" +#include "rts/Messages.h" + +#include <stdint.h> +#include <stdlib.h> + +#if defined(riscv64_HOST_ARCH) + +#if defined(OBJFORMAT_ELF) + +const size_t instSizeRISCV64 = 4; +const size_t stubSizeRISCV64 = 3 * instSizeRISCV64; + +bool needStubForRelRISCV64(Elf_Rel *rel) { + switch (ELF64_R_TYPE(rel->r_info)) { + case R_RISCV_CALL: + case R_RISCV_CALL_PLT: + return true; + default: + return false; + } +} + +bool needStubForRelaRISCV64(Elf_Rela *rela) { + switch (ELF64_R_TYPE(rela->r_info)) { + case R_RISCV_CALL: + case R_RISCV_CALL_PLT: + return true; + default: + return false; + } +} + +// After the global offset table (GOT) has been set up, we can use these three +// instructions to jump to the target address / function: +// 1. AUIPC ip, %pcrel_hi(addr) +// 2. LD ip, %pcrel_lo(addr)(ip) +// 3. JARL x0, ip, 0 +// +// We could use the absolute address of the target (because we know it), but +// that would require loading a 64-bit constant which is a nightmare to do in +// riscv64 assembly. 
(See + https://github.com/riscv-non-isa/riscv-elf-psabi-doc/blob/5ffe5b5aeedb37b1c1c0c3d94641267d9ad4795a/riscv-elf.adoc#procedure-linkage-table) +bool makeStubRISCV64(Stub *s) { + uint32_t *P = (uint32_t *)s->addr; + int32_t addr = (uint64_t)s->got_addr - (uint64_t)P; + + uint64_t hi = (addr + 0x800) >> 12; + uint64_t lo = addr - (hi << 12); + + IF_DEBUG( + linker, + debugBelch( + "makeStubRISCV64: P = %p, got_addr = %p, target = %p, addr = 0x%x " + ", hi = 0x%lx, lo = 0x%lx\n", + P, s->got_addr, s->target, addr, hi, lo)); + + // AUIPC ip, %pcrel_hi(addr) + uint32_t auipcInst = 0b0010111; // opcode + auipcInst |= 0x1f << 7; // rd = ip (x31) + auipcInst |= hi << 12; // imm[31:12] + + // LD ip, %pcrel_lo(addr)(ip) + uint32_t ldInst = 0b0000011; // opcode + ldInst |= 0x1f << 7; // rd = ip (x31) + ldInst |= 0x1f << 15; // rs = ip (x31) + ldInst |= 0b11 << 12; // funct3 = 0x3 (LD) + ldInst |= lo << 20; // imm[11:0] + + // JALR x0, ip, 0 + uint32_t jalrInst = 0b1100111; // opcode + jalrInst |= 0x1f << 15; // rs = ip (x31) + + P[0] = auipcInst; + P[1] = ldInst; + P[2] = jalrInst; + + return EXIT_SUCCESS; +} + +#endif +#endif Index: ghc-9.8.2/rts/linker/elf_plt_riscv64.h =================================================================== --- /dev/null +++ ghc-9.8.2/rts/linker/elf_plt_riscv64.h @@ -0,0 +1,12 @@ +#pragma once + +#include "LinkerInternals.h" + +#if defined(OBJFORMAT_ELF) + +extern const size_t stubSizeRISCV64; +bool needStubForRelRISCV64(Elf_Rel * rel); +bool needStubForRelaRISCV64(Elf_Rela * rel); +bool makeStubRISCV64(Stub * s); + +#endif Index: ghc-9.8.2/rts/linker/elf_reloc.c =================================================================== --- ghc-9.8.2.orig/rts/linker/elf_reloc.c +++ ghc-9.8.2/rts/linker/elf_reloc.c @@ -4,13 +4,18 @@ #if defined(OBJFORMAT_ELF) -/* we currently only use this abstraction for elf/aarch64 */ +/* we currently only use this abstraction for elf/aarch64 and elf/riscv64 */ -#if defined(aarch64_HOST_ARCH) +#if 
defined(aarch64_HOST_ARCH) || defined(riscv64_HOST_ARCH) bool relocateObjectCode(ObjectCode * oc) { return ADD_SUFFIX(relocateObjectCode)(oc); } + + +void flushInstructionCache(ObjectCode * oc){ + return ADD_SUFFIX(flushInstructionCache)(oc); +} #endif #endif Index: ghc-9.8.2/rts/linker/elf_reloc.h =================================================================== --- ghc-9.8.2.orig/rts/linker/elf_reloc.h +++ ghc-9.8.2/rts/linker/elf_reloc.h @@ -5,9 +5,10 @@ #if defined(OBJFORMAT_ELF) #include "elf_reloc_aarch64.h" +#include "elf_reloc_riscv64.h" bool relocateObjectCode(ObjectCode * oc); - +void flushInstructionCache(ObjectCode *oc); #endif /* OBJETFORMAT_ELF */ Index: ghc-9.8.2/rts/linker/elf_reloc_aarch64.c =================================================================== --- ghc-9.8.2.orig/rts/linker/elf_reloc_aarch64.c +++ ghc-9.8.2/rts/linker/elf_reloc_aarch64.c @@ -240,7 +240,7 @@ computeAddend(Section * section, Elf_Rel /* check if we already have that stub */ if(findStub(section, (void**)&S, 0)) { /* did not find it. Crete a new stub. */ - if(makeStub(section, (void**)&S, 0)) { + if(makeStub(section, (void**)&S, NULL, 0)) { abort(/* could not find or make stub */); } } @@ -339,5 +339,10 @@ relocateObjectCodeAarch64(ObjectCode * o return EXIT_SUCCESS; } +void flushInstructionCacheAarch64(ObjectCode * oc) { + // Looks like we don't need this on Aarch64. 
+ /* no-op */ +} + #endif /* OBJECTFORMAT_ELF */ #endif /* aarch64_HOST_ARCH */ Index: ghc-9.8.2/rts/linker/elf_reloc_aarch64.h =================================================================== --- ghc-9.8.2.orig/rts/linker/elf_reloc_aarch64.h +++ ghc-9.8.2/rts/linker/elf_reloc_aarch64.h @@ -7,4 +7,5 @@ bool relocateObjectCodeAarch64(ObjectCode * oc); +void flushInstructionCacheAarch64(ObjectCode *oc); #endif /* OBJETFORMAT_ELF */ Index: ghc-9.8.2/rts/linker/elf_reloc_riscv64.c =================================================================== --- /dev/null +++ ghc-9.8.2/rts/linker/elf_reloc_riscv64.c @@ -0,0 +1,655 @@ +#include "elf_reloc_riscv64.h" +#include "LinkerInternals.h" +#include "Rts.h" +#include "Stg.h" +#include "SymbolExtras.h" +#include "elf.h" +#include "elf_plt.h" +#include "elf_util.h" +#include "rts/Messages.h" +#include "util.h" + +#include <stdint.h> +#include <stdlib.h> + +#if defined(riscv64_HOST_ARCH) + +#if defined(OBJFORMAT_ELF) + +typedef uint64_t addr_t; + +/* regular instructions are 32bit */ +typedef uint32_t inst_t; + +/* compressed instructions are 16bit */ +typedef uint16_t cinst_t; + +// TODO: These instances could be static. They are not yet, because we might +// need their debugging symbols. 
+char *relocationTypeToString(Elf64_Xword type); +int32_t decodeAddendRISCV64(Section *section, Elf_Rel *rel); +bool encodeAddendRISCV64(Section *section, Elf_Rel *rel, int32_t addend); +int32_t SignExtend32(uint32_t X, unsigned B); +void write8le(uint8_t *p, uint8_t v); +uint8_t read8le(const uint8_t *P); +void write16le(cinst_t *p, uint16_t v); +uint16_t read16le(const cinst_t *P); +uint32_t read32le(const inst_t *P); +void write32le(inst_t *p, uint32_t v); +uint64_t read64le(const uint64_t *P); +void write64le(uint64_t *p, uint64_t v); +uint32_t extractBits(uint64_t v, uint32_t begin, uint32_t end); +void setCJType(cinst_t *loc, uint32_t val); +void setCBType(cinst_t *loc, uint32_t val); +void setBType(inst_t *loc, uint32_t val); +void setSType(inst_t *loc, uint32_t val); +int32_t computeAddend(ElfRelocationATable * relaTab, unsigned relNo, Elf_Rel *rel, ElfSymbol *symbol, + int64_t addend, ObjectCode *oc); +void setJType(inst_t *loc, uint32_t val); +void setIType(inst_t *loc, int32_t val); +void checkInt(inst_t *loc, int32_t v, int n); +uint32_t setLO12_I(uint32_t insn, uint32_t imm); +uint32_t setLO12_S(uint32_t insn, uint32_t imm); +void setUType(inst_t *loc, int32_t val); + + +char *relocationTypeToString(Elf64_Xword type) { + switch (ELF64_R_TYPE(type)) { + case R_RISCV_NONE: + return "R_RISCV_NONE"; + case R_RISCV_32: + return "R_RISCV_32"; + case R_RISCV_64: + return "R_RISCV_64"; + case R_RISCV_RELATIVE: + return "R_RISCV_RELATIVE"; + case R_RISCV_COPY: + return "R_RISCV_COPY"; + case R_RISCV_JUMP_SLOT: + return "R_RISCV_JUMP_SLOT"; + case R_RISCV_TLS_DTPMOD32: + return "R_RISCV_TLS_DTPMOD32"; + case R_RISCV_TLS_DTPMOD64: + return "R_RISCV_TLS_DTPMOD64"; + case R_RISCV_TLS_DTPREL32: + return "R_RISCV_TLS_DTPREL32"; + case R_RISCV_TLS_DTPREL64: + return "R_RISCV_TLS_DTPREL64"; + case R_RISCV_TLS_TPREL32: + return "R_RISCV_TLS_TPREL32"; + case R_RISCV_TLS_TPREL64: + return "R_RISCV_TLS_TPREL64"; + case R_RISCV_BRANCH: + return "R_RISCV_BRANCH"; + case 
R_RISCV_JAL: + return "R_RISCV_JAL"; + case R_RISCV_CALL: + return "R_RISCV_CALL"; + case R_RISCV_CALL_PLT: + return "R_RISCV_CALL_PLT"; + case R_RISCV_GOT_HI20: + return "R_RISCV_GOT_HI20"; + case R_RISCV_PCREL_HI20: + return "R_RISCV_PCREL_HI20"; + case R_RISCV_LO12_I: + return "R_RISCV_LO12_I"; + case R_RISCV_PCREL_LO12_I: + return "R_RISCV_PCREL_LO12_I"; + case R_RISCV_HI20: + return "R_RISCV_HI20"; + case R_RISCV_LO12_S: + return "R_RISCV_LO12_S"; + case R_RISCV_PCREL_LO12_S: + return "R_RISCV_PCREL_LO12_S"; + case R_RISCV_RELAX: + return "R_RISCV_RELAX"; + case R_RISCV_RVC_BRANCH: + return "R_RISCV_RVC_BRANCH"; + case R_RISCV_RVC_JUMP: + return "R_RISCV_RVC_JUMP"; + default: + return "Unknown relocation type"; + } +} + +#define Page(x) ((x) & ~0xFFF) + +int32_t decodeAddendRISCV64(Section *section STG_UNUSED, + Elf_Rel *rel STG_UNUSED) { + debugBelch("decodeAddendRISCV64: Relocations with explicit addend are not " + "supported."); + abort(/* we don't support Rel locations yet. */); +} + +// Sign-extend the number in the bottom B bits of X to a 32-bit integer. +// Requires 0 < B <= 32. (32 bit is sufficient as we can only encode 20 + 12 = +// 32 bit in a relocation pair.) +int32_t SignExtend32(uint32_t X, unsigned B) { + assert(B > 0 && "Bit width can't be 0."); + assert(B <= 32 && "Bit width out of range."); + return (int32_t)(X << (32 - B)) >> (32 - B); +} + +// Make sure that V can be represented as an N bit signed integer. +void checkInt(inst_t *loc, int32_t v, int n) { + if (v != SignExtend32(v, n)) { + debugBelch("Relocation at 0x%x is out of range. value: 0x%x (%d), " + "sign-extended value: 0x%x (%d), max bits 0x%x (%d)\n", + *loc, v, v, SignExtend32(v, n), SignExtend32(v, n), n, n); + } +} + +// RISCV is little-endian by definition. +void write8le(uint8_t *p, uint8_t v) { *p = v; } + +// RISCV is little-endian by definition. +uint8_t read8le(const uint8_t *P) { return *P; } + +// RISCV is little-endian by definition. 
+void write16le(cinst_t *p, uint16_t v) { *p = v; } + +// RISCV is little-endian by definition. +uint16_t read16le(const cinst_t *P) { return *P; } + +// RISCV is little-endian by definition. +uint32_t read32le(const inst_t *P) { return *P; } + +// RISCV is little-endian by definition. +void write32le(inst_t *p, uint32_t v) { *p = v; } + +// RISCV is little-endian by definition. +uint64_t read64le(const uint64_t *P) { return *P; } + +// RISCV is little-endian by definition. +void write64le(uint64_t *p, uint64_t v) { *p = v; } + +uint32_t extractBits(uint64_t v, uint32_t begin, uint32_t end) { + return (v & ((1ULL << (begin + 1)) - 1)) >> end; +} + +uint32_t setLO12_I(uint32_t insn, uint32_t imm) { + IF_DEBUG(linker, debugBelch("setLO12_I: insn 0x%x imm 0x%x (insn & 0xfffff) " + "0x%x (imm << 20) 0x%x \n", + insn, imm, (insn & 0xfffff), (imm << 20))); + return (insn & 0xfffff) | (imm << 20); +} + +uint32_t setLO12_S(uint32_t insn, uint32_t imm) { + return (insn & 0x1fff07f) | (extractBits(imm, 11, 5) << 25) | + (extractBits(imm, 4, 0) << 7); +} + +void setUType(inst_t *loc, int32_t val) { + const unsigned bits = 32; + uint32_t hi = val + 0x800; + checkInt(loc, SignExtend32(hi, bits) >> 12, 20); + IF_DEBUG(linker, debugBelch("setUType: hi 0x%x val 0x%x\n", hi, val)); + write32le(loc, (read32le(loc) & 0xFFF) | (hi & 0xFFFFF000)); +} + +void setIType(inst_t *loc, int32_t val) { + uint64_t hi = (val + 0x800) >> 12; + uint64_t lo = val - (hi << 12); + IF_DEBUG(linker, debugBelch("setIType: hi 0x%lx lo 0x%lx\n", hi, lo)); + IF_DEBUG(linker, debugBelch("setIType: loc %p *loc 0x%x val 0x%x\n", loc, + *loc, val)); + uint32_t insn = setLO12_I(read32le(loc), lo & 0xfff); + IF_DEBUG(linker, debugBelch("setIType: insn 0x%x\n", insn)); + write32le(loc, insn); + IF_DEBUG(linker, debugBelch("setIType: loc %p *loc' 0x%x val 0x%x\n", loc, + *loc, val)); +} + +void setSType(inst_t *loc, uint32_t val) { + uint64_t hi = (val + 0x800) >> 12; + uint64_t lo = val - (hi << 12); + 
write32le(loc, setLO12_S(read32le(loc), lo)); +} + +void setJType(inst_t *loc, uint32_t val) { + checkInt(loc, val, 21); + + uint32_t insn = read32le(loc) & 0xFFF; + uint32_t imm20 = extractBits(val, 20, 20) << 31; + uint32_t imm10_1 = extractBits(val, 10, 1) << 21; + uint32_t imm11 = extractBits(val, 11, 11) << 20; + uint32_t imm19_12 = extractBits(val, 19, 12) << 12; + insn |= imm20 | imm10_1 | imm11 | imm19_12; + + write32le(loc, insn); +} + +void setBType(inst_t *loc, uint32_t val) { + checkInt(loc, val, 13); + + uint32_t insn = read32le(loc) & 0x1FFF07F; + uint32_t imm12 = extractBits(val, 12, 12) << 31; + uint32_t imm10_5 = extractBits(val, 10, 5) << 25; + uint32_t imm4_1 = extractBits(val, 4, 1) << 8; + uint32_t imm11 = extractBits(val, 11, 11) << 7; + insn |= imm12 | imm10_5 | imm4_1 | imm11; + + write32le(loc, insn); +} + +void setCBType(cinst_t *loc, uint32_t val) { + checkInt((inst_t *)loc, val, 9); + uint16_t insn = read16le(loc) & 0xE383; + uint16_t imm8 = extractBits(val, 8, 8) << 12; + uint16_t imm4_3 = extractBits(val, 4, 3) << 10; + uint16_t imm7_6 = extractBits(val, 7, 6) << 5; + uint16_t imm2_1 = extractBits(val, 2, 1) << 3; + uint16_t imm5 = extractBits(val, 5, 5) << 2; + insn |= imm8 | imm4_3 | imm7_6 | imm2_1 | imm5; + + write16le(loc, insn); +} + +void setCJType(cinst_t *loc, uint32_t val) { + checkInt((inst_t *)loc, val, 12); + uint16_t insn = read16le(loc) & 0xE003; + uint16_t imm11 = extractBits(val, 11, 11) << 12; + uint16_t imm4 = extractBits(val, 4, 4) << 11; + uint16_t imm9_8 = extractBits(val, 9, 8) << 9; + uint16_t imm10 = extractBits(val, 10, 10) << 8; + uint16_t imm6 = extractBits(val, 6, 6) << 7; + uint16_t imm7 = extractBits(val, 7, 7) << 6; + uint16_t imm3_1 = extractBits(val, 3, 1) << 3; + uint16_t imm5 = extractBits(val, 5, 5) << 2; + insn |= imm11 | imm4 | imm9_8 | imm10 | imm6 | imm7 | imm3_1 | imm5; + + write16le(loc, insn); +} + +bool encodeAddendRISCV64(Section *section, Elf_Rel *rel, int32_t addend) { + addr_t P = 
(addr_t)((uint8_t *)section->start + rel->r_offset); + IF_DEBUG(linker, + debugBelch( + "Relocation type %s 0x%lx (%lu) symbol 0x%lx addend 0x%x (%u / " + "%d) P 0x%lx\n", + relocationTypeToString(rel->r_info), ELF64_R_TYPE(rel->r_info), + ELF64_R_TYPE(rel->r_info), ELF64_R_SYM(rel->r_info), addend, + addend, addend, P)); + switch (ELF64_R_TYPE(rel->r_info)) { + case R_RISCV_32_PCREL: + case R_RISCV_32: + write32le((inst_t *)P, addend); + break; + case R_RISCV_64: + write64le((uint64_t *)P, addend); + break; + case R_RISCV_GOT_HI20: + case R_RISCV_PCREL_HI20: + case R_RISCV_HI20: { + setUType((inst_t *)P, addend); + break; + } + case R_RISCV_PCREL_LO12_I: + case R_RISCV_LO12_I: { + setIType((inst_t *)P, addend); + break; + } + case R_RISCV_RVC_JUMP: { + setCJType((cinst_t *)P, addend); + break; + } + case R_RISCV_RVC_BRANCH: { + setCBType((cinst_t *)P, addend); + break; + } + case R_RISCV_BRANCH: { + setBType((inst_t *)P, addend); + break; + } + case R_RISCV_CALL: + case R_RISCV_CALL_PLT: { + // We could relax more (in some cases) but right now most important is to + // make it work. 
+ setUType((inst_t *)P, addend); + setIType(((inst_t *)P) + 1, addend); + break; + } + case R_RISCV_JAL: { + setJType((inst_t *)P, addend); + break; + } + case R_RISCV_ADD8: + write8le((uint8_t *)P, read8le((uint8_t *)P) + addend); + break; + case R_RISCV_ADD16: + write16le((cinst_t *)P, read16le((cinst_t *)P) + addend); + break; + case R_RISCV_ADD32: + write32le((inst_t *)P, read32le((inst_t *)P) + addend); + break; + case R_RISCV_ADD64: + write64le((uint64_t *)P, read64le((uint64_t *)P) + addend); + break; + case R_RISCV_SUB6: { + uint8_t keep = *((uint8_t *)P) & 0xc0; + uint8_t imm = (((*(uint8_t *)P) & 0x3f) - addend) & 0x3f; + + write8le((uint8_t *)P, keep | imm); + break; + } + case R_RISCV_SUB8: + write8le((uint8_t *)P, read8le((uint8_t *)P) - addend); + break; + case R_RISCV_SUB16: + write16le((cinst_t *)P, read16le((cinst_t *)P) - addend); + break; + case R_RISCV_SUB32: + write32le((inst_t *)P, read32le((inst_t *)P) - addend); + break; + case R_RISCV_SUB64: + write64le((uint64_t *)P, read64le((uint64_t *)P) - addend); + break; + case R_RISCV_SET6: { + uint8_t keep = *((uint8_t *)P) & 0xc0; + uint8_t imm = (addend & 0x3f) & 0x3f; + + write8le((uint8_t *)P, keep | imm); + break; + } + case R_RISCV_SET8: + write8le((uint8_t *)P, addend); + break; + case R_RISCV_SET16: + write16le((cinst_t *)P, addend); + break; + case R_RISCV_SET32: + write32le((inst_t *)P, addend); + break; + case R_RISCV_PCREL_LO12_S: + case R_RISCV_TPREL_LO12_S: + case R_RISCV_LO12_S: { + setSType((inst_t *)P, addend); + break; + } + case R_RISCV_RELAX: + case R_RISCV_ALIGN: + // I guess we don't need to implement these relaxations (optimizations). + break; + default: + debugBelch("Missing relocation 0x%lx\n", ELF64_R_TYPE(rel->r_info)); + abort(); + } + return EXIT_SUCCESS; +} + +/** + * Compute the *new* addend for a relocation, given a pre-existing addend. + * @param section The section the relocation is in. + * @param rel The Relocation struct. + * @param symbol The target symbol. 
+ * @param addend The existing addend. Either explicit or implicit. + * @return The new computed addend. + */ +int32_t computeAddend(ElfRelocationATable * relaTab, unsigned relNo, Elf_Rel *rel, ElfSymbol *symbol, + int64_t addend, ObjectCode *oc) { + Section * section = &oc->sections[relaTab->targetSectionIndex]; + + /* Position where something is relocated */ + addr_t P = (addr_t)((uint8_t *)section->start + rel->r_offset); + + CHECK(0x0 != P); + CHECK((uint64_t)section->start <= P); + CHECK(P <= (uint64_t)section->start + section->size); + /* Address of the symbol */ + addr_t S = (addr_t)symbol->addr; + /* GOT slot for the symbol */ + addr_t GOT_S = (addr_t)symbol->got_addr; + + int64_t A = addend; + + IF_DEBUG(linker, debugBelch("%s: P 0x%lx S 0x%lx %s GOT_S 0x%lx A 0x%lx relNo %u\n", + relocationTypeToString(rel->r_info), P, S, + symbol->name, GOT_S, A, relNo)); + switch (ELF64_R_TYPE(rel->r_info)) { + case R_RISCV_32: + return S + A; + case R_RISCV_64: + return S + A; + case R_RISCV_HI20: + return S + A; + case R_RISCV_JUMP_SLOT: + return S; + case R_RISCV_JAL: + return S + A - P; + case R_RISCV_PCREL_HI20: + return S + A - P; + case R_RISCV_LO12_I: + return S + A; + // Quoting LLVM docs: For R_RISCV_PC_INDIRECT (R_RISCV_PCREL_LO12_{I,S}), + // the symbol actually points the corresponding R_RISCV_PCREL_HI20 + // relocation, and the target VA is calculated using PCREL_HI20's symbol. + case R_RISCV_PCREL_LO12_S: + FALLTHROUGH; + case R_RISCV_PCREL_LO12_I: { + // Lookup related HI20 relocation and use that value. I'm still confused why + // relocations aren't pure, but this is how LLVM does it. And, calculating + // the lower 12 bit without any relation ship to the GOT entry's address + // makes no sense either. 
+ for (unsigned i = relNo + 1; i-- > 0 ;) { + Elf_Rela *rel_prime = &relaTab->relocations[i]; + + addr_t P_prime = + (addr_t)((uint8_t *)section->start + rel_prime->r_offset); + + if (P_prime != S) { + // S points to the P of the corresponding *_HI20 relocation. + continue; + } + + ElfSymbol *symbol_prime = + findSymbol(oc, relaTab->sectionHeader->sh_link, + ELF64_R_SYM((Elf64_Xword)rel_prime->r_info)); + + CHECK(0x0 != symbol_prime); + + /* take explicit addend */ + int64_t addend_prime = rel_prime->r_addend; + + uint64_t type_prime = ELF64_R_TYPE(rel_prime->r_info); + + if (type_prime == R_RISCV_PCREL_HI20 || + type_prime == R_RISCV_GOT_HI20 || + type_prime == R_RISCV_TLS_GD_HI20 || + type_prime == R_RISCV_TLS_GOT_HI20) { + IF_DEBUG(linker, + debugBelch( + "Found matching relocation: %s (P: 0x%lx, S: 0x%lx, " + "sym-name: %s) -> %s (P: 0x%lx, S: 0x%lx, sym-name: %s, relNo: %u)", + relocationTypeToString(rel->r_info), P, S, symbol->name, + relocationTypeToString(rel_prime->r_info), P_prime, + symbol_prime->addr, symbol_prime->name, i)); + int32_t result = computeAddend(relaTab, i, (Elf_Rel *)rel_prime, + symbol_prime, addend_prime, oc); + IF_DEBUG(linker, debugBelch("Result of computeAddend: 0x%x (%d)\n", + result, result)); + return result; + } + } + debugBelch("Missing HI relocation for %s: P 0x%lx S 0x%lx %s\n", + relocationTypeToString(rel->r_info), P, S, symbol->name); + abort(); + } + + case R_RISCV_RVC_JUMP: + return S + A - P; + case R_RISCV_RVC_BRANCH: + return S + A - P; + case R_RISCV_BRANCH: + return S + A - P; + case R_RISCV_CALL: + case R_RISCV_CALL_PLT: { + addr_t GOT_Target; + if (GOT_S != 0) { + // 1. Public symbol with GOT entry. + GOT_Target = GOT_S; + } else { + // 2. Fake GOT entry with symbol extra entry.
+ SymbolExtra *symbolExtra = makeSymbolExtra(oc, ELF_R_SYM(rel->r_info), S); + addr_t* FAKE_GOT_S = &symbolExtra->addr; + IF_DEBUG(linker, debugBelch("R_RISCV_CALL_PLT w/ SymbolExtra = %p , " + "entry = 0x%lx\n", + symbolExtra, FAKE_GOT_S)); + GOT_Target = (addr_t) FAKE_GOT_S; + } + + if (findStub(section, (void **)&S, 0)) { + /* did not find it. Crete a new stub. */ + if (makeStub(section, (void **)&S, (void *)GOT_Target, 0)) { + abort(/* could not find or make stub */); + } + } + IF_DEBUG(linker, debugBelch("R_RISCV_CALL_PLT: S = 0x%lx A = 0x%lx P = " + "0x%lx (S + A) - P = 0x%lx \n", + S, A, P, (S + A) - P)); + return (S + A) - P; + } + case R_RISCV_ADD8: + FALLTHROUGH; + case R_RISCV_ADD16: + FALLTHROUGH; + case R_RISCV_ADD32: + FALLTHROUGH; + case R_RISCV_ADD64: + FALLTHROUGH; + return S + A; // Add V when the value is set + case R_RISCV_SUB6: + FALLTHROUGH; + case R_RISCV_SUB8: + FALLTHROUGH; + case R_RISCV_SUB16: + FALLTHROUGH; + case R_RISCV_SUB32: + FALLTHROUGH; + case R_RISCV_SUB64: + FALLTHROUGH; + return S + A; // Subtract from V when value is set + case R_RISCV_SET6: + FALLTHROUGH; + case R_RISCV_SET8: + FALLTHROUGH; + case R_RISCV_SET16: + FALLTHROUGH; + case R_RISCV_SET32: + FALLTHROUGH; + return S + A; + case R_RISCV_RELAX: + case R_RISCV_ALIGN: + // I guess we don't need to implement this relaxation. Otherwise, this + // should return the number of blank bytes to insert via NOPs. + return 0; + case R_RISCV_32_PCREL: + return S + A - P; + case R_RISCV_GOT_HI20: { + // TODO: Allocating extra memory for every symbol just to play this trick + // seems to be a bit obscene. (GOT relocations hitting local symbols + // happens, but not very often.) It would be better to allocate only what we + // really need. + + // There are two cases here: 1. The symbol is public and has an entry in the + // GOT. 2. It's local and has no corresponding GOT entry. The first case is + // easy: We simply calculate the addend with the GOT address. 
In the second + // case we create a symbol extra entry and pretend it's the GOT. + if (GOT_S != 0) { + // 1. Public symbol with GOT entry. + return GOT_S + A - P; + } else { + // 2. Fake GOT entry with symbol extra entry. + SymbolExtra *symbolExtra = makeSymbolExtra(oc, ELF_R_SYM(rel->r_info), S); + addr_t* FAKE_GOT_S = &symbolExtra->addr; + addr_t res = (addr_t) FAKE_GOT_S + A - P; + IF_DEBUG(linker, debugBelch("R_RISCV_GOT_HI20 w/ SymbolExtra = %p , " + "entry = 0x%lx , reloc-addend = 0x%lu ", + symbolExtra, FAKE_GOT_S, res)); + return res; + } + } + default: + debugBelch("Unimplemented relocation: 0x%lx\n (%lu)", + ELF64_R_TYPE(rel->r_info), ELF64_R_TYPE(rel->r_info)); + abort(/* unhandled rel */); + } + debugBelch("This should never happen!"); + abort(/* unhandled rel */); +} + +// TODO: This is duplicated from elf_reloc_aarch64.c +bool relocateObjectCodeRISCV64(ObjectCode *oc) { + for (ElfRelocationTable *relTab = oc->info->relTable; relTab != NULL; + relTab = relTab->next) { + /* only relocate interesting sections */ + if (SECTIONKIND_OTHER == oc->sections[relTab->targetSectionIndex].kind) + continue; + + Section *targetSection = &oc->sections[relTab->targetSectionIndex]; + + for (unsigned i = 0; i < relTab->n_relocations; i++) { + Elf_Rel *rel = &relTab->relocations[i]; + + ElfSymbol *symbol = findSymbol(oc, relTab->sectionHeader->sh_link, + ELF64_R_SYM((Elf64_Xword)rel->r_info)); + + CHECK(0x0 != symbol); + + // TODO: This always fails, because we don't support Rel locations: Do + // we need this case? 
+ /* decode implicit addend */ + int64_t addend = decodeAddendRISCV64(targetSection, rel); + + addend = computeAddend((ElfRelocationATable*) relTab, i, rel, symbol, addend, oc); + encodeAddendRISCV64(targetSection, rel, addend); + } + } + for (ElfRelocationATable *relaTab = oc->info->relaTable; relaTab != NULL; + relaTab = relaTab->next) { + /* only relocate interesting sections */ + if (SECTIONKIND_OTHER == oc->sections[relaTab->targetSectionIndex].kind) + continue; + + Section *targetSection = &oc->sections[relaTab->targetSectionIndex]; + + for (unsigned i = 0; i < relaTab->n_relocations; i++) { + + Elf_Rela *rel = &relaTab->relocations[i]; + + ElfSymbol *symbol = findSymbol(oc, relaTab->sectionHeader->sh_link, + ELF64_R_SYM((Elf64_Xword)rel->r_info)); + + CHECK(0x0 != symbol); + + /* take explicit addend */ + int64_t addend = rel->r_addend; + + addend = computeAddend(relaTab, i, (Elf_Rel *)rel, symbol, addend, oc); + encodeAddendRISCV64(targetSection, (Elf_Rel *)rel, addend); + } + } + return EXIT_SUCCESS; +} + +void flushInstructionCacheRISCV64(ObjectCode *oc) { + // Synchronize the memory and instruction cache to prevent illegal + // instruction exceptions. On Linux the parameters of + // __builtin___clear_cache are currently unused. Add them anyways for future + // compatibility. (I.e. the parameters couldn't be checked during + // development.) + + /* The main object code */ + void *codeBegin = oc->image + oc->misalignment; + __builtin___clear_cache(codeBegin, codeBegin + oc->fileSize); + + /* Jump Islands */ + __builtin___clear_cache((void *)oc->symbol_extras, + (void *)oc->symbol_extras + + sizeof(SymbolExtra) * oc->n_symbol_extras); + + // Memory barrier to ensure nothing circumvents the fence.i / cache flushes. 
+ SEQ_CST_FENCE(); +} + +#endif /* OBJFORMAT_ELF */ +#endif /* riscv64_HOST_ARCH */ Index: ghc-9.8.2/rts/linker/elf_reloc_riscv64.h =================================================================== --- /dev/null +++ ghc-9.8.2/rts/linker/elf_reloc_riscv64.h @@ -0,0 +1,11 @@ +#pragma once + +#include "LinkerInternals.h" + +#if defined(OBJFORMAT_ELF) + +bool +relocateObjectCodeRISCV64(ObjectCode * oc); + +void flushInstructionCacheRISCV64(ObjectCode *oc); +#endif /* OBJFORMAT_ELF */ Index: ghc-9.8.2/rts/linker/macho/plt.h =================================================================== --- ghc-9.8.2.orig/rts/linker/macho/plt.h +++ ghc-9.8.2/rts/linker/macho/plt.h @@ -25,7 +25,7 @@ unsigned numberOfStubsForSection( Object #define STUB_SIZE ADD_SUFFIX(stubSize) bool findStub(Section * section, void* * addr, uint8_t flags); -bool makeStub(Section * section, void* * addr, uint8_t flags); +bool makeStub(Section * section, void* * addr, void* got_addr, uint8_t flags); void freeStubs(Section * section); Index: ghc-9.8.2/rts/rts.cabal.in =================================================================== --- ghc-9.8.2.orig/rts/rts.cabal.in +++ ghc-9.8.2/rts/rts.cabal.in @@ -634,9 +634,11 @@ library linker/elf_got.c linker/elf_plt.c linker/elf_plt_aarch64.c + linker/elf_plt_riscv64.c linker/elf_plt_arm.c linker/elf_reloc.c linker/elf_reloc_aarch64.c + linker/elf_reloc_riscv64.c linker/elf_tlsgd.c linker/elf_util.c sm/BlockAlloc.c Index: ghc-9.8.2/tests/compiler/cmm/README.md =================================================================== --- /dev/null +++ ghc-9.8.2/tests/compiler/cmm/README.md @@ -0,0 +1,62 @@ +# The Cmm tests + +These tests are primarily targeted towards writing a code generation backend. The +`driver.sh` will pick up all `.cmm` files and run them by looking for the `RUN:` +marker in them. The commands will then be executed in sequence.
+ +A typical file will contain the following header: +``` +// RUN: "$HC" -cpp -dcmm-lint -keep-s-file -c "$1" && cat "${1%%.*}.s" | FileCheck "$1" -check-prefix=CHECK-A64 +``` +this instructs the driver to compile the cmm file `$1`; and pipe the assembly +output into the `FileCheck` tool from the llvm project, which allows us to write +basic assertions about the generated assembly. The `FileCheck` tool will scan +the file for `CHECK-A64:` marker, as well as `CHECK-A64-NEXT:` marker and verify +those in the output (see the `-check-prefix` argument to `FileCheck`). The +`FileCheck` tool has many more options that might come in handy. + +Some files will also verify that they execute with a zero exit code, or print +some value to stdout. Again we can use the `RUN:` marker for this as the following +example demonstrates: +``` +// RUN: "$HC" -cpp -dcmm-lint -keep-s-file -c "$1" && cat "${1%%.*}.s" | FileCheck "$1" -check-prefix=CHECK-A64 +// RUN: "$CC" "${1%%.*}.o" -o "${1%%.*}.exe" +// RUN: "$EXEC" "${1%%.cmm}.exe" +``` +this will not only compile the assembly, verify that some instructions are +contained, but also convert the produced object file into an executable using +a c compiler, and then run the executable. + +You will most likely also want to include the following: +``` +#include "Cmm.h" +#include "Types.h" +``` +to be able to copy the cmm dumps ghc produces more closely. Those will still +need quite a bit of munging, as the pretty-printer does not roundtrip. + +## Environment Variables: + +- `HC` the haskell compiler to use. +- `CC` the c compiler to use, should be targeting the same architecture as `HC`. +- `EXEC` if we are cross compiling, we might need a tool to execute the foreign + executable. E.g. `wine` or `qemu-<arch>`. + +That is, to execute the whole test-suite (when cross compiling) you may run +``` +EXEC=qemu-aarch64 CC="$TARGET_CC" bash ./driver.sh +``` + +## Missing features + +- [ ] Run selective tests.
+- [ ] Pipe command output into log, and cat log on failure. +- [x] Better reporting on which command failed if multiple commands are given. +- [ ] RUN commands should have pipefail set. + +## Cmm ppr/parser bugs +- [ ] [Parser] Cannot assign to a fixed value (F1) = ... fails to parse +- [ ] [Parser] F1 + F2, will produce MO_Add F1 F2 instead of MO_F_Add F1 F2 +- [ ] [Parser] Can not construct f(y(x)), the parser will turn that into z = y(x); f(z). +- [ ] [Ppr] keeps printing types all over the place even though the parser doesn't allow for those. +- [ ] [Ppr] will print MO_Add, ... instead of the symbols or primitives understood by the parser. \ No newline at end of file Index: ghc-9.8.2/tests/compiler/cmm/Types.h =================================================================== --- /dev/null +++ ghc-9.8.2/tests/compiler/cmm/Types.h @@ -0,0 +1,4 @@ +#define F32 float32 +#define F64 float64 +#define W32 bits32 +#define W64 bits64 Index: ghc-9.8.2/tests/compiler/cmm/assert.cmm =================================================================== --- /dev/null +++ ghc-9.8.2/tests/compiler/cmm/assert.cmm @@ -0,0 +1,26 @@ +// RUN: "$HC" -cpp -dcmm-lint -keep-s-file -c "$1" && cat "${1%%.*}.s" | FileCheck "$1" -check-prefix=CHECK-A64 +// We need ghc to link this +// RUN: "$HC" -no-hs-main -debug "${1%%.*}.o" -o "${1%%.*}.exe" +// RUN: "$EXEC" "${1%%.cmm}.exe" | FileCheck "$1" -check-prefix=CHECK-RUN-A64 + +#define DEBUG + +#include "Cmm.h" + +main () { + // CHECK-RUN-A64: Sp: 0; SpLim: 0 + foreign "C" printf("Sp: %x; SpLim: %x\n", Sp, SpLim); + + Sp = 0x105368; + SpLim = 0x1050c0; + + // XXX: This might just work with qemu, which provides predictable memory + // locations.
+ // CHECK-RUN-A64: Sp: 105368; SpLim: 1050c0 + foreign "C" printf("Sp: %x; SpLim: %x\n", Sp, SpLim); + + // CHECK-A64: x18, x18, :lo12:_assertFail + ASSERT(SpLim - WDS(RESERVED_STACK_WORDS) <= Sp); + + foreign "C" exit(0::I64); +} \ No newline at end of file Index: ghc-9.8.2/tests/compiler/cmm/driver.sh =================================================================== --- /dev/null +++ ghc-9.8.2/tests/compiler/cmm/driver.sh @@ -0,0 +1,47 @@ +#!/bin/bash +# +# A simple test driver for the cmm tests. Using FileCheck, a little less +# sofisticated than the LLVM regression test driver. + +# If __PRINT=echo, we'll see the output. +__PRINT="" + +function run_test() { + bash <<CODE +function _run () { + $__PRINT $1 +} +_run $2 +CODE +} + +# Colors! See https://stackoverflow.com/a/5947802 +# .---------- constant part! +# vvvv vvvv-- the code from above +RED='\033[0;31m' +GREEN='\033[0;32m' +GRAY='\033[1;30m' +NC='\033[0m' # No Color + +for cmm in *.cmm; do + if grep "RUN: " $cmm > /dev/null ; then + cmds=$(grep "RUN: " $cmm |sed "s|.*RUN: ||g") + if run_test "$cmds" "$cmm" > /dev/null 2>&1 ; then + echo -e "[${GREEN}\u2713${NC}] $cmm ${GREEN}succeded${NC}" + rm -f "${cmm%%.*}.s" "${cmm%%.*}.o" "${cmm%%.*}.exe" "${cmm%.*}.c" "${cmm%%.*}_stub.o" + else + echo -e "[${RED}\u2717${NC}] $cmm ${RED}failed${NC}" + IFS=$'\n'; arrCmds=($cmds); unset IFS; + for cmd in "${arrCmds[@]}"; do + expandedCmd=$(__PRINT=echo run_test "\"$cmd\"" "$cmm") + if run_test "$cmd" "$cmm" > /dev/null 2>&1 ; then + echo -e "\t[${GREEN}\u2713${NC}] $expandedCmd" + else + echo -e "\t[${RED}\u2717${NC}] $expandedCmd" + fi + done + fi + else + echo -e "[${GRAY}?${NC}] $cmm ${GRAY}has no RUN: marker${NC}" + fi +done \ No newline at end of file Index: ghc-9.8.2/tests/compiler/cmm/encode_values.cmm =================================================================== --- /dev/null +++ ghc-9.8.2/tests/compiler/cmm/encode_values.cmm @@ -0,0 +1,30 @@ +// RUN: "$HC" -cpp -dcmm-lint -keep-s-file -c "$1" && cat 
"${1%%.*}.s" | FileCheck "$1" -check-prefix=CHECK-A64 +// RUN: "$CC" "${1%%.*}.o" -o "${1%%.*}.exe" +// RUN: "$EXEC" "${1%%.cmm}.exe" | FileCheck "$1" -check-prefix=CHECK-RUN-A64 + +#include "Cmm.h" +#include "Types.h" + +main () { + R1 = 123454321; + F1 = 0.12345::F32; + D1 = 3.123456789123456789; + D2 = 1.234567891234567890; + + // Let's see that the c string ends up as a symbol + // CHECK-A64: c1_str: + // CHECK-A64-NEXT: .string "%d\n%1.10f\n%.20f\n%.20f\n" + + // CHECK-RUN-A64: 123454321 + // The following is WRONG. We need to promote Float to Double when + // calling a C function. However for now, and for compatibility, we + // don't promote. + + // CHECK-RUN-A64-NEXT: 0.0000000000 + // CHECK-RUN-A64-NEXT: 3.12345678912345681155 + // CHECK-RUN-A64-NEXT: 1.23456789123456789348 + + foreign "C" printf("%d\n%1.10f\n%.20f\n%.20f\n", R1, F1, D1, D2); + + foreign "C" exit(0::I64); +} \ No newline at end of file Index: ghc-9.8.2/tests/compiler/cmm/immideate.cmm =================================================================== --- /dev/null +++ ghc-9.8.2/tests/compiler/cmm/immideate.cmm @@ -0,0 +1,65 @@ +// RUN: "$HC" -cpp -dcmm-lint -keep-s-file -c "$1" && cat "${1%%.*}.s" | FileCheck "$1" -check-prefix=CHECK-A64 +// RUN: "$HC" -no-hs-main "${1%%.*}.o" -o "${1%%.*}.exe" +// RUN: "$EXEC" "${1%%.cmm}.exe" | FileCheck "$1" -check-prefix=CHECK-RUN-A64 + +#include "Cmm.h" + +// Too bad this won't work. We can't construct Cmm Immediates. +// Often ghc will just optimize it away. 
+ +// CHECK-A64: .globl main +// CHECK-A64: main: +main () { + + // CHECK-A64: mov w22, #1 + // CHECK-RUN-A64: 1 = 1 + R1 = 1 :: I64; foreign "C" printf("1 = %ld\n", R1); + + // CHECK-A64: mov w22, #-1 + // CHECK-RUN-A64: -1 = -1 + R1 = -1; foreign "C" printf("-1 = %ld\n", R1); + + // CHECK-A64: mov w22, #4096 + // CHECK-RUN-A64: 4096 = 4096 + R1 = 1 << 12 :: I64; foreign "C" printf("4096 = %ld\n", R1); + + // CHECK-A64: mov w22, #-4096 + // CHECK-RUN-A64: -4096 = -4096 + R1 = -1 << 12 :: I64; foreign "C" printf("-4096 = %ld\n", R1); + + // CHECK-A64: mov w22, #0 + // CHECK-A64-NEXT: movk w22, #1, lsl #16 + // CHECK-RUN-A64: 65536 = 65536 + R1 = 1 << 16 :: I64; foreign "C" printf("65536 = %ld\n", R1); + + // CHECK-A64: mov w22, #0 + // CHECK-A64-NEXT: movk w22, #65535, lsl #16 + // CHECK-RUN-A64: -65536 = -65536 + R1 = -1 << 16 :: I64; foreign "C" printf("-65536 = %ld\n", R1); + + // CHECK-A64: mov w22, #0 + // CHECK-A64-NEXT: movk w22, #256, lsl #16 + // CHECK-RUN-A64: 16777216 = 16777216 + R1 = 1 << 24 :: I64; foreign "C" printf("16777216 = %ld\n", R1); + + // CHECK-A64: mov w22, #0 + // CHECK-A64-NEXT: movk w22, #65280, lsl #16 + // CHECK-RUN-A64: -16777216 = -16777216 + R1 = -1 << 24 :: I64; foreign "C" printf("-16777216 = %ld\n", R1); + + // CHECK-A64: mov x22, #0 + // CHECK-A64-NEXT: movk x22, #0, lsl #16 + // CHECK-A64-NEXT: movk x22, #16, lsl #32 + // CHECK-A64-NEXT: movk x22, #0, lsl #48 + // CHECK-RUN-A64: 68719476736 = 68719476736 + R1 = 1 << 36 :: I64; foreign "C" printf("68719476736 = %ld\n", R1); + + // CHECK-A64: mov x22, #0 + // CHECK-A64-NEXT: movk x22, #0, lsl #16 + // CHECK-A64-NEXT: movk x22, #65520, lsl #32 + // CHECK-A64-NEXT: movk x22, #65535, lsl #48 + // CHECK-RUN-A64: -68719476736 = -68719476736 + R1 = -1 << 36 :: I64; foreign "C" printf("-68719476736 = %ld\n", R1); + + foreign "C" exit(0::I64); +} \ No newline at end of file Index: ghc-9.8.2/tests/compiler/cmm/load_large_immediate.cmm 
=================================================================== --- /dev/null +++ ghc-9.8.2/tests/compiler/cmm/load_large_immediate.cmm @@ -0,0 +1,28 @@ +// RUN: "$HC" -cpp -dcmm-lint -keep-s-file -c "$1" && cat "${1%%.*}.s" | FileCheck "$1" -check-prefix=CHECK-A64 +#include "Cmm.h" +#include "Types.h" + +large() { + // CHECK-A64: mov x23, #52428 + // CHECK-A64-NEXT: movk x23, #52428, lsl #16 + // CHECK-A64-NEXT: movk x23, #52428, lsl #32 + // CHECK-A64-NEXT: movk x23, #3276, lsl #48 + R2 = 922337203685477580; + + // CHECK-A64: mov x23, #52429 + R2 = 922337203685477581; + + // CHECK-A64: mov x18, #11544 + // CHECK-A64-NEXT: movk x18, #21572, lsl #16 + // CHECK-A64-NEXT: movk x18, #8699, lsl #32 + // CHECK-A64-NEXT: movk x18, #16393, lsl #48 + // CHECK-A64-NEXT: fmov d12, x18 + D1 = 3.141592653589793; + + // CHECK-A64: mov x18, #11544 + // CHECK-A64-NEXT: movk x18, #21572, lsl #16 + // CHECK-A64-NEXT: movk x18, #8699, lsl #32 + // CHECK-A64-NEXT: movk x18, #16377, lsl #48 + // CHECK-A64-NEXT: fmov d12, x18 + D1 = 1.5707963267948966; +} \ No newline at end of file Index: ghc-9.8.2/tests/compiler/cmm/load_store.cmm =================================================================== --- /dev/null +++ ghc-9.8.2/tests/compiler/cmm/load_store.cmm @@ -0,0 +1,9 @@ + +// XXX: todo +main () { + // test load_store + // write 64bit word, load lower 8bits, 16bits, 32bits, + // and ensure we get exactly what we asked for! + + // write byte, half, word, ... load 64bit, verify. 
+} \ No newline at end of file Index: ghc-9.8.2/tests/compiler/cmm/load_sym_with_large_offset.cmm =================================================================== --- /dev/null +++ ghc-9.8.2/tests/compiler/cmm/load_sym_with_large_offset.cmm @@ -0,0 +1,9 @@ +// RUN: "$HC" -cpp -dcmm-lint -keep-s-file -c "$1" && cat "${1%%.*}.s" | FileCheck "$1" -check-prefix=CHECK-A64 + +test () { + // CHECK-A64: mov w18, #4273 + // CHECK-A64: adrp x23, stg_INTLIKE_closure + // CHECK-A64: add x23, x23, :lo12:stg_INTLIKE_closure + // CHECK-A64: add x23, x23, x18 + R2 = stg_INTLIKE_closure+4273; +} \ No newline at end of file Index: ghc-9.8.2/tests/compiler/cmm/main.cmm =================================================================== --- /dev/null +++ ghc-9.8.2/tests/compiler/cmm/main.cmm @@ -0,0 +1,27 @@ +// RUN: "$HC" -cpp -dcmm-lint -keep-s-file -c "$1" && cat "${1%%.*}.s" | FileCheck "$1" -check-prefix=CHECK-A64 +// RUN: "$CC" "${1%%.*}.o" -o "${1%%.*}.exe" +// RUN: "$EXEC" "${1%%.cmm}.exe" +/* + * This is a very basic cmm example. For a more in-depth discussion see + * GHC/Cmm/Parser.y. This file emulates a main function in Cmm. Note + * that we can't use return. This will most of the time require a StackSlot, + * which isn't implemented in most code gen backends. + * + * To compile: + * + * $HC -cpp -dcmm-lint -keep-s-file -c main.cmm + * $CC main.o -o main + * + * Note: We use $CC to compile, the reson is that this allows us to depend on + * the default crt0, ..., files; and we get functions like exit. + * + * Note2: The Cmm.h include (which requires the -cpp flag to $HC), this + * allows us to use I64, I32, ... 
+ */ +#include "Cmm.h" + +// CHECK-A64: .type main, @function +// CHECK-A64-NEXT: main: +main ( I32 argc, I32 argv_ptr ) { + foreign "C" exit(0::I64); +} \ No newline at end of file Index: ghc-9.8.2/tests/compiler/cmm/mo_f_eq.cmm =================================================================== --- /dev/null +++ ghc-9.8.2/tests/compiler/cmm/mo_f_eq.cmm @@ -0,0 +1,10 @@ +// RUN: "$HC" -cpp -dcmm-lint -keep-s-file -c "$1" && cat "${1%%.*}.s" | FileCheck "$1" -check-prefix=CHECK-A64 +#include "Cmm.h" +#include "Types.h" + +w64 () { + // CHECK-A64: fmov d31, xzr + // CHECK-A64-NEXT: fcmp d12, d31 + // CHECK-A64-NEXT: cset x22, eq + R1 = %feq(D1, 0.0 :: W64); +} \ No newline at end of file Index: ghc-9.8.2/tests/compiler/cmm/mo_f_gt.cmm =================================================================== --- /dev/null +++ ghc-9.8.2/tests/compiler/cmm/mo_f_gt.cmm @@ -0,0 +1,15 @@ +// RUN: "$HC" -cpp -dcmm-lint -keep-s-file -c "$1" && cat "${1%%.*}.s" | FileCheck "$1" -check-prefix=CHECK-A64 + +#include "Cmm.h" +#include "Types.h" + +w64 () { + // CHECK-A64: fmov d31, xzr + R1 = %fgt(D1, 0.0 :: W64); + + if (%fgt(D1, 0.0 :: W64)) { + R2 = 1 :: W64; + } else { + R2 = 0 :: W64; + } +} \ No newline at end of file Index: ghc-9.8.2/tests/compiler/cmm/mo_f_neg.cmm =================================================================== --- /dev/null +++ ghc-9.8.2/tests/compiler/cmm/mo_f_neg.cmm @@ -0,0 +1,14 @@ +// RUN: "$HC" -cpp -dcmm-lint -keep-s-file -c "$1" && cat "${1%%.*}.s" | FileCheck "$1" -check-prefix=CHECK-A64 +#include "Cmm.h" +#include "Types.h" + +f_neg_same_reg () { + // CHECK-A64: neg x22, x22 + R1 = %neg(R1); + + // CHECK-A64: fneg s8, s8 + F1 = %fneg(F1); + + // CHECK-A64: fneg d12, d12 + D1 = %fneg(D1); +} \ No newline at end of file Index: ghc-9.8.2/tests/compiler/cmm/mo_ff_conv.cmm =================================================================== --- /dev/null +++ ghc-9.8.2/tests/compiler/cmm/mo_ff_conv.cmm @@ -0,0 +1,12 @@ +// RUN: "$HC" -cpp -dcmm-lint 
-keep-s-file -c "$1" && cat "${1%%.*}.s" | FileCheck "$1" -check-prefix=CHECK-A64 +#include "Cmm.h" +#include "Types.h" + +w64 () { + F64 x; + F32 y; + // CHECK-A64: fmov d31, d12 + x = D1; + // CHECK-A64: fcvt s31, d31 + y = %f2f32(x); +} \ No newline at end of file Index: ghc-9.8.2/tests/compiler/cmm/mo_uu_conv.cmm =================================================================== --- /dev/null +++ ghc-9.8.2/tests/compiler/cmm/mo_uu_conv.cmm @@ -0,0 +1,20 @@ +// RUN: "$HC" -cpp -dcmm-lint -keep-s-file -dppr-debug -c "$1" && cat "${1%%.*}.s" | FileCheck "$1" -check-prefix=CHECK-A64 +#include "Cmm.h" +#include "Types.h" + +w32_w64 () { + I64 m; + m = 1234 :: I64; + // lobits64 is MO_UU_Conv_W32_W64, when given a I32 + // zx32 is MO_UU_Conv_W64_W32 + // + // First we extract the lower 32 bits from x18 (32 bit value -> 64 bit value) + // CHECK-A64: ubfm x22, x18, #0, #31 + R1 = %lobits64(I32[m]); + + // then we extract the lower 32 bits from x22 (64 bit value -> 32 bit value) + // CHECK-A64-NEXT: ubfm x18, x22, #0, #31 + // and finally extend it again. 
+ // CHECK-A64-NEXT: ubfm x23, x18, #0, #31 + R2 = %lobits64(%zx32(R1)); +} Index: ghc-9.8.2/tests/compiler/cmm/nan.cmm =================================================================== --- /dev/null +++ ghc-9.8.2/tests/compiler/cmm/nan.cmm @@ -0,0 +1,51 @@ +// RUN: "$HC" -cpp -dcmm-lint -keep-s-file -c "$1" && cat "${1%%.*}.s" | FileCheck "$1" -check-prefix=CHECK-A64 +// RUN: "$HC" -no-hs-main "${1%%.*}.o" -o "${1%%.*}.exe" +// RUN: "$EXEC" "${1%%.cmm}.exe" | FileCheck "$1" -check-prefix=CHECK-RUN-A64 + +#include "Cmm.h" +#include "Types.h" + +// CHECK-A64: .globl main +// CHECK-A64: main: + +main () { + F64 y; + I64 z; + + D1 = 0.0 - 7.0; + + (y) = foreign "C" sqrt(D1); + + (z) = foreign "C" isDoubleNaN(y); + + // CHECK-RUN-A64: sqrt(-7.000000) = nan + // CHECK-RUN-A64: isDoubleNaN(nan) = 1 + foreign "C" printf("sqrt(%f) = %f\n", D1, y); + foreign "C" printf("isDoubleNaN(%f) = %d\n", y, z); + + // CHECK-RUN-A64: nan < 0 = 0 + foreign "C" printf("%f < 0 = %d\n", y, y `flt` 0.0); + // CHECK-RUN-A64: nan > 0 = 0 + foreign "C" printf("%f > 0 = %d\n", y, y `fgt` 0.0); + // CHECK-RUN-A64: nan <= 0 = 0 + foreign "C" printf("%f <= 0 = %d\n", y, y `fle` 0.0); + // CHECK-RUN-A64: nan >= 0 = 0 + foreign "C" printf("%f >= 0 = %d\n", y, y `fge` 0.0); + // CHECK-RUN-A64: nan == 0 = 0 + foreign "C" printf("%f == 0 = %d\n", y, y `feq` 0.0); + // CHECK-RUN-A64: nan /= 0 = 1 + foreign "C" printf("%f /= 0 = %d\n", y, y `fne` 0.0); + + F64 x; + x = 0.0; y = 0.0; + foreign "C" printf("%f / %f = %d\n", x, y, x `fquot` y); + + // CHECK-RUN-A64: nan >= 0 + if((x `fquot` y) < x) { + foreign "C" printf("%f < %f\n", x `fquot` y, x); + } else { + foreign "C" printf("%f >= %f\n", x `fquot` y, x); + } + + foreign "C" exit(0::I64); +} \ No newline at end of file Index: ghc-9.8.2/tests/compiler/cmm/primops_math.cmm =================================================================== --- /dev/null +++ ghc-9.8.2/tests/compiler/cmm/primops_math.cmm @@ -0,0 +1,111 @@ +// RUN: "$HC" -cpp -dcmm-lint 
-keep-s-file -c "$1" && cat "${1%%.*}.s" | FileCheck "$1" -check-prefix=CHECK-A64 +// RUN: "$HC" -no-hs-main "${1%%.*}.o" -o "${1%%.*}.exe" +// RUN: "$EXEC" "${1%%.cmm}.exe" | FileCheck "$1" -check-prefix=CHECK-RUN-A64 + +#include "Cmm.h" + +// TODO: most of the unsigned test don't really test unsigned. + +// CHECK-A64: .globl main +// CHECK-A64: main: +main () { + R1 = 7; R2 = 3; + R4 = -7; R5 = -3; + + // CHECK-RUN-A64: 7 + 3 => 10 + R3 = R1 `add` R2; + foreign "C" printf("%d + %d => %d\n", R1, R2, R3); + + // CHECK-RUN-A64: 7 - 3 => 4 + R3 = R1 `sub` R2; + foreign "C" printf("%d - %d => %d\n", R1, R2, R3); + + // CHECK-RUN-A64: 7 = 3 => 0 + R3 = R1 `eq` R2; + foreign "C" printf("%d = %d => %d\n", R1, R2, R3); + + // CHECK-RUN-A64: 7 * 3 => 21 + R3 = R1 `mul` R2; + foreign "C" printf("%d * %d => %d\n", R1, R2, R3); + + // CHECK-RUN-A64: -(7+3) => -10 + R3 = %neg(R1+R2); + foreign "C" printf("-(%d+%d) => %d\n", R1, R2, R3); + + // CHECK-RUN-A64: 7 / 3 => 2 + R3 = R1 `quot` R2; + foreign "C" printf("%d / %d => %d\n", R1, R2, R3); + + // CHECK-RUN-A64: 7 % 3 => 1 + R3 = R1 `rem` R2; + foreign "C" printf("%d %% %d => %d\n", R1, R2, R3); + + // CHECK-RUN-A64: 7 / 3 => 2 + R3 = R1 `divu` R2; + foreign "C" printf("(unsigned): %d / %d => %d\n", R1, R2, R3); + + // CHECK-RUN-A64: 7 % 3 => 1 + R3 = R1 `modu` R2; + foreign "C" printf("(unsigned): %d %% %d => %d\n", R1, R2, R3); + + // Signed >=, <=, >, < + // CHECK-RUN-A64: 7 >= 3 => 1 + R3 = R1 `ge` R2; + foreign "C" printf("%d >= %d => %d", R1, R2, R3); + + // CHECK-RUN-A64: 7 <= 3 => 0 + R3 = R1 `le` R2; + foreign "C" printf("%d <= %d => %d", R1, R2, R3); + + // CHECK-RUN-A64: 7 > 3 => 1 + R3 = R1 `gt` R2; + foreign "C" printf("%d > %d => %d", R1, R2, R3); + + // CHECK-RUN-A64: 7 < 3 => 0 + R3 = R1 `lt` R2; + foreign "C" printf("%d < %d => %d", R1, R2, R3); + + // Unsigned >=, <=, >, < + // CHECK-RUN-A64: 7 >= 3 => 1 + R3 = R1 `geu` R2; + foreign "C" printf("(unsigned): %d >= %d => %d", R1, R2, R3); + + // CHECK-RUN-A64: 
7 <= 3 => 0 + R3 = R1 `leu` R2; + foreign "C" printf("(unsigned): %d <= %d => %d", R1, R2, R3); + + // CHECK-RUN-A64: 7 > 3 => 1 + R3 = R1 `gtu` R2; + foreign "C" printf("(unsigned): %d > %d => %d", R1, R2, R3); + + // CHECK-RUN-A64: 7 < 3 => 0 + R3 = R1 `ltu` R2; + foreign "C" printf("(unsigned): %d < %d => %d", R1, R2, R3); + + // Logical ops: and, or, xor, com, shl, shrl, shra + R3 = R1 `and` R2; + foreign "C" printf("%d and %d => %d", R1, R2, R3); + + R3 = R1 `or` R2; + foreign "C" printf("%d or %d => %d", R1, R2, R3); + + R3 = R1 `xor` R2; + foreign "C" printf("%d xor %d => %d", R1, R2, R3); + + // R3 = R1 `com` R2; + // foreign "C" printf("%d com %d => %d", R1, R2, R3); + + R3 = R1 `shl` R2; + foreign "C" printf("%d shl %d => %d", R1, R2, R3); + + R3 = R1 `shrl` R2; + foreign "C" printf("%d shrl %d => %d", R1, R2, R3); + + R3 = R1 `shra` R2; + foreign "C" printf("%d shra %d => %d", R1, R2, R3); + + + + + foreign "C" exit(0::I64); +} \ No newline at end of file Index: ghc-9.8.2/tests/compiler/cmm/shift_left.cmm =================================================================== --- /dev/null +++ ghc-9.8.2/tests/compiler/cmm/shift_left.cmm @@ -0,0 +1,15 @@ +// RUN: "$HC" -cpp -dcmm-lint -keep-s-file -c "$1" && cat "${1%%.*}.s" | FileCheck "$1" -check-prefix=CHECK-A64 +#include "Cmm.h" +#include "Types.h" + +shift_left () { + I64 a, b, c, d, e, f; + a = 1; + b = 1; + c = 1; + d = 1; + // CHECK-A64: lsl x18, x18, x14 + // CHECK-A64: lsl x18, x18, x17 + // CHECK-A64: lsl x18, x18, x17 + R2 = (((((a << 1 | b) << 1) | c) << 1) | d); +} \ No newline at end of file Index: ghc-9.8.2/tests/compiler/cmm/shift_left_conditional.cmm =================================================================== --- /dev/null +++ ghc-9.8.2/tests/compiler/cmm/shift_left_conditional.cmm @@ -0,0 +1,13 @@ +// RUN: "$HC" -cpp -dcmm-lint -keep-s-file -c "$1" && cat "${1%%.*}.s" | FileCheck "$1" -check-prefix=CHECK-A64 +#include "Cmm.h" +#include "Types.h" + +// Original failing cmm: +// +// 
R1 = I64[(%MO_F_Gt_W32(F32[Sp + 8], F32[R1 + 7]) << 3) + ghczmprim_GHCziTypes_Bool_closure_tbl]; // CmmAssign +// +shift_left_conditional () { + // CHECK-A64: fcmp s31, s30 + // CHECK-A64-NEXT: cset w18, gt + R1 = %fgt(F32[Sp], F32[R1]) << 3; +} \ No newline at end of file Index: ghc-9.8.2/tests/compiler/cmm/shift_right.cmm =================================================================== --- /dev/null +++ ghc-9.8.2/tests/compiler/cmm/shift_right.cmm @@ -0,0 +1,24 @@ +// RUN: "$HC" -debug -dppr-debug -cpp -dcmm-lint -keep-s-file -O0 -c "$1" && cat "${1%%.*}.s" | FileCheck "$1" -check-prefix=CHECK-RV64 +// RUN: "$CC" "${1%%.*}.o" -o "${1%%.*}.exe" +// RUN: "$EXEC" "${1%%.cmm}.exe" + +#include "Cmm.h" +#include "Types.h" + +main() { + I64 buffer; + I32 a, b, c, d; + + I64 arr; + (arr) = foreign "C" malloc(1024); + bits64[arr] = 2; + + a = I32[arr]; + b = %mul(a, 32 :: I32); + c = %neg(b); + d = %shra(c, 4::I64); + + foreign "C" printf("a: %hd b: %hd c: %hd d: %hd", a, b, c, d); + + foreign "C" exit(d == -4 :: I32); +} Index: ghc-9.8.2/tests/compiler/cmm/store_neg_float.cmm =================================================================== --- /dev/null +++ ghc-9.8.2/tests/compiler/cmm/store_neg_float.cmm @@ -0,0 +1,13 @@ +// RUN: "$HC" -cpp -dcmm-lint -keep-s-file -c "$1" && cat "${1%%.*}.s" | FileCheck "$1" -check-prefix=CHECK-A64 +#include "Cmm.h" +#include "Types.h" + +store_neg_float_on_heap () { + // CHECK-A64: fneg s31, s8 + // CHECK-A64-NEXT: str s31, [ x21 ] + F32[Hp] = %fneg(F1); + + // CHECK-A64: fneg d31, d12 + // CHECK-A64-NEXT: str d31, [ x21 ] + F64[Hp] = %fneg(D1); +} \ No newline at end of file Index: ghc-9.8.2/tests/compiler/cmm/switch_expr.cmm =================================================================== --- /dev/null +++ ghc-9.8.2/tests/compiler/cmm/switch_expr.cmm @@ -0,0 +1,52 @@ +// RUN: "$HC" -cpp -dcmm-lint -keep-s-file -c "$1" && cat "${1%%.*}.s" | FileCheck "$1" -check-prefix=CHECK-A64 +// RUN: "$CC" "${1%%.*}.o" -o "${1%%.*}.exe" 
+// RUN: "$EXEC" "${1%%.cmm}.exe" + +#include "Cmm.h" +#include "Types.h" + + +main (I64 argc) { + _switch: // global + switch [0 .. 7] argc { + case 0 : goto _a0; + case 1 : goto _a1; + case 2 : goto _a2; + case 3 : goto _a3; + case 4 : goto _a4; + case 5 : goto _a5; + case 6 : goto _a6; + case 7 : goto _a7; + default: {goto _c;} + } // CmmSwitch + _a0: // global + foreign "C" printf("A0\n"); + goto _end; + _a1: // global + foreign "C" printf("A1\n"); + goto _end; + _a2: // global + foreign "C" printf("A2\n"); + goto _end; + _a3: // global + foreign "C" printf("A3\n"); + goto _end; + _a4: // global + foreign "C" printf("A4\n"); + goto _end; + _a5: // global + foreign "C" printf("A5\n"); + goto _end; + _a6: // global + foreign "C" printf("A6\n"); + goto _end; + _a7: // global + foreign "C" printf("A7\n"); + goto _end; + _c: // global + foreign "C" printf("C\n"); + goto _end; + + _end: + foreign "C" exit(0::I64); +} \ No newline at end of file Index: ghc-9.8.2/tests/compiler/cmm/unsafe_foreign_calls.cmm =================================================================== --- /dev/null +++ ghc-9.8.2/tests/compiler/cmm/unsafe_foreign_calls.cmm @@ -0,0 +1,34 @@ +// RUN: "$HC" -cpp -dcmm-lint -keep-s-file -c "$1" && cat "${1%%.*}.s" | FileCheck "$1" -check-prefix=CHECK-A64 +#include "Cmm.h" +#include "Types.h" + +// Original failing cmm: +// +// (_ckSs::F32) = call MO_F32_ExpM1(_shwV::F32); // CmmUnsafeForeignCall +// +// One issue with the cmm parser is, that we can't build the following expressions +// the parser will deconstruct them into arg0 <- arg0_expr; f(arg0); instead of f(arg0_expr) +// +// (_ckSz::F32) = call MO_F32_Log(%MO_F_Neg_W32(_ckSs::F32)); // CmmUnsafeForeignCall +// (_ckqE::F64) = call MO_F64_Log(2.0 :: W64); // CmmUnsafeForeignCall +// (_cjE1::F64) = call MO_F64_Atan(%MO_F_Quot_W64(_shFv::F64, _shFw::F64)); // CmmUnsafeForeignCall +// +// +foreign_calls () { + F32 x, y; + x = F1; + + // CHECK-A64: adrp x18, expm1f + // CHECK-A64: add x18, x18, 
:lo12:expm1f + // CHECK-A64: fmov s0, s30 + // CHECK-A64: blr x18 + (y) = prim %expM132f(F1); + // CHECK-A64: fmov s8, s30 + F1 = y; + F2 = y; + (y) = prim %log32f(%fneg(x)); + + prim %log64f(2.0); + + prim %atan64f(%fquot(D1,D2)); +} \ No newline at end of file Index: ghc-9.8.2/tests/compiler/cmm/unsafe_foreign_calls_excess_arguments.cmm =================================================================== --- /dev/null +++ ghc-9.8.2/tests/compiler/cmm/unsafe_foreign_calls_excess_arguments.cmm @@ -0,0 +1,36 @@ +// RUN: "$HC" -cpp -dcmm-lint -keep-s-file -c "$1" && cat "${1%%.*}.s" | FileCheck "$1" -check-prefix=CHECK-A64 +// RUN: "$LN" -sf "$1" "${1%%.*}.c" +// RUN: "$CC" -DC -c "${1%%.*}.c" -o "${1%%.*}_stub.o" +// RUN: "$CC" "${1%%.*}.o" "${1%%.*}_stub.o" -o "${1%%.*}.exe" +// RUN: "$EXEC" "${1%%.*}.exe" + +#if defined(C) // the C part +int mega_add(int a, int b, int c, int d, int e, int f, int g, int h, int i, int j, int k) { + return a + b + c + d + e + f + g + h + i + j + k; +} +#else // the CMM part +#include "Cmm.h" +#include "Types.h" + +// Let's ensure we can call functions with lots of arguments, and they are +// passed correctly. +main () { + + // create some fake stack space and assign to Sp. 
+ I64 sp; + (sp) = foreign "C" malloc(1024); + Sp = sp; + + I64 x, y; + I64 a, b, c, d, e, f, g, h, i, j, k; + a = 1; b = 2; c = 4; d = 8; e = 16; f = 32; g = 64; h = 128; i = 256; j = 512; k = 1024; + + // We have 3 excess arguments; thus we need 4 slots (32 bytes) + // CHECK-A64: sub sp, sp, #32 + // CHECK-A64: add sp, sp, #32 + (y) = foreign "C" mega_add(a,b,c,d,e,f,g,h,i,j,k); + x = a + b + c + d + e + f + g + h + i + j + k; + + foreign "C" exit(y != x); +} +#endif Index: ghc-9.8.2/tests/compiler/cmm/unsafe_foreign_calls_excess_float_arguments.cmm =================================================================== --- /dev/null +++ ghc-9.8.2/tests/compiler/cmm/unsafe_foreign_calls_excess_float_arguments.cmm @@ -0,0 +1,37 @@ +// RUN: "$HC" -cpp -dcmm-lint -keep-s-file -c "$1" && cat "${1%%.*}.s" | FileCheck "$1" -check-prefix=CHECK-A64 +// RUN: "$LN" -sf "$1" "${1%%.*}.c" +// RUN: "$CC" -DC -c "${1%%.*}.c" -o "${1%%.*}_stub.o" +// RUN: "$CC" "${1%%.*}.o" "${1%%.*}_stub.o" -o "${1%%.*}.exe" +// RUN: "$EXEC" "${1%%.*}.exe" + +#if defined(C) // the C part +double mega_fadd(double a, double b, double c, double d, double e, double f, double g, double h, double i, double j, double k) { + return a + b + c + d + e + f + g + h + i + j + k; +} + +#else // the CMM part +#include "Cmm.h" +#include "Types.h" + +// Let's ensure we can call functions with lots of arguments, and they are +// passed correctly. +main () { + + // create some fake stack space and assign to Sp. 
+ I64 sp; + (sp) = foreign "C" malloc(1024); + Sp = sp; + + F64 fx, fy; + F64 fa, fb, fc, fd, fe, ff, fg, fh, fi, fj, fk; + fa = 1.0; fb = 2.0; fc = 4.0; fd = 8.0; fe = 16.0; ff = 32.0; fg = 64.0; fh = 128.0; fi = 256.0; fj = 512.0; fk = 1024.0; + + // We have 3 excess arguments; thus we need 4 slots (32 bytes) + // CHECK-A64: sub sp, sp, #32 + // CHECK-A64: add sp, sp, #32 + (fy) = foreign "C" mega_fadd(fa,fb,fc,fd,fe,ff,fg,fh,fi,fj,fk); + fx = fa + fb + fc + fd + fe + ff + fg + fh + fi + fj + fk; + + foreign "C" exit(fy != fx); +} +#endif Index: ghc-9.8.2/tests/compiler/cmm/unsafe_foreign_calls_excess_mixed_arguments.cmm =================================================================== --- /dev/null +++ ghc-9.8.2/tests/compiler/cmm/unsafe_foreign_calls_excess_mixed_arguments.cmm @@ -0,0 +1,57 @@ +// RUN: "$HC" -cpp -dcmm-lint -keep-s-file -c "$1" && cat "${1%%.*}.s" | FileCheck "$1" -check-prefix=CHECK-A64 +// RUN: "$LN" -sf "$1" "${1%%.*}.c" +// RUN: "$CC" -DC -c "${1%%.*}.c" -o "${1%%.*}_stub.o" +// RUN: "$CC" "${1%%.*}.o" "${1%%.*}_stub.o" -o "${1%%.*}.exe" +// RUN: "$EXEC" "${1%%.*}.exe" + +#if defined(C) // the C part +#include <stdlib.h> +#include <stdio.h> + +void *mega_mixed_add(int a, double fa, int b, double fb, int c, double fc, + int d, double fd, int e, double fe, int f, double ff, + int g, double fg, int h, double fh, int i, double fi, + int j, double fj, int k, double fk) { + + void *r = (void *)calloc(1,16); + *(int*)r = a + b + c + d + e + f + g + h + i + j + k; + *(double*)(r+8) = fa + fb + fc + fd + fe + ff + fg + fh + fi + fj + fk; + return r; +} + +#else // the CMM part +#include "Cmm.h" +#include "Types.h" + +// Let's ensure we can call functions with lots of arguments, and they are +// passed correctly. +main () { + + // create some fake stack space and assign to Sp. 
+ I64 sp; + (sp) = foreign "C" malloc(1024); + Sp = sp; + + I64 x, y; + I64 a, b, c, d, e, f, g, h, i, j, k; + a = 1; b = 2; c = 4; d = 8; e = 16; f = 32; g = 64; h = 128; i = 256; j = 512; k = 1024; + + F64 fx, fy; + F64 fa, fb, fc, fd, fe, ff, fg, fh, fi, fj, fk; + fa = 1.0; fb = 2.0; fc = 4.0; fd = 8.0; fe = 16.0; ff = 32.0; fg = 64.0; fh = 128.0; fi = 256.0; fj = 512.0; fk = 1024.0; + + // We have 6 excess arguments; thus we need 6 slots (48 bytes) + // CHECK-A64: sub sp, sp, #48 + // CHECK-A64: add sp, sp, #48 + I64 rptr; + (rptr) = foreign "C" mega_mixed_add(a,fa,b,fb,c,fc,d,fd,e,fe,f,ff,g,fg,h,fh, i,fi,j,fj,k,fk); + + y = I64[rptr]; + fy = F64[rptr + 8]; + + x = a + b + c + d + e + f + g + h + i + j + k; + fx = fa + fb + fc + fd + fe + ff + fg + fh + fi + fj + fk; + + foreign "C" exit((fy != fx) == (f != y)); +} +#endif Index: ghc-9.8.2/tests/compiler/cmm/unsafe_foreign_calls_spill_regs.cmm =================================================================== --- /dev/null +++ ghc-9.8.2/tests/compiler/cmm/unsafe_foreign_calls_spill_regs.cmm @@ -0,0 +1,50 @@ +// RUN: "$HC" -cpp -dcmm-lint -keep-s-file -c "$1" && cat "${1%%.*}.s" | FileCheck "$1" -check-prefix=CHECK-A64 +#include "Cmm.h" +#include "Types.h" + +// Original failing cmm: +// +// (_ckSs::F32) = call MO_F32_ExpM1(_shwV::F32); // CmmUnsafeForeignCall +// +// One issue with the cmm parser is, that we can't build the following expressions +// the parser will deconstruct them into arg0 <- arg0_expr; f(arg0); instead of f(arg0_expr) +// +// (_ckSz::F32) = call MO_F32_Log(%MO_F_Neg_W32(_ckSs::F32)); // CmmUnsafeForeignCall +// (_ckqE::F64) = call MO_F64_Log(2.0 :: W64); // CmmUnsafeForeignCall +// (_cjE1::F64) = call MO_F64_Atan(%MO_F_Quot_W64(_shFv::F64, _shFw::F64)); // CmmUnsafeForeignCall +// +// +foreign_calls () { + F32 x, y; + x = F1; + + // Some variables to ensure we use up all the registers. 
+ F32 a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t; + a = F1; b = F1; c = F1; d = F1; e = F1; f = F1; g = F1; h = F1; i = F1; + j = F1; k = F1; l = F1; m = F1; n = F1; o = F1; p = F1; q = F1; r = F1; + s = F1; t = F1; + // Some integer register + I64 _a, _b, _c, _d, _e, _f, _g, _h, _i, _j, _k, _l, _m, _n, _o, _p, _q, _r, _s, _t, _u, _v, _w, _x, _y, _z; + _a = R1; _b = R1; _c = R1; _d = R1; _e = R1; _f = R1; _g = R1; _h = R1; + _i = R1; _j = R1; _k = R1; _l = R1; _m = R1; _n = R1; _o = R1; _p = R1; + _q = R1; _r = R1; _s = R1; _t = R1; _u = R1; _v = R1; _w = R1; _x = R1; _y = R1; _z = R1; + + // CHECK-A64: adrp x11, expm1f + // CHECK-A64: add x11, x11, :lo12:expm1f + // CHECK-A64: fmov s0, s2 + // CHECK-A64: blr x11 + (y) = prim %expM132f(F1); + // CHECK-A64: fmov s8, s30 + F1 = y; + F2 = y; + (y) = prim %log32f(%fneg(x)); + + prim %log64f(2.0); + + prim %atan64f(%fquot(D1,D2)); + + // make sure a..t stay alive! + F3 = a + b + c + d + e + f + g + h + i + j + k + l + m + n + o + p + q + r + s + t; + R2 = _a + _b + _c + _d + _e + _f + _g + _h + _i + _j + _k + _l + _m + _n + + _o + _p + _q + _r + _s + _t + _u + _v + _w + _x + _y + _z; +} \ No newline at end of file Index: ghc-9.8.2/tests/compiler/cmm/zero.cmm =================================================================== --- /dev/null +++ ghc-9.8.2/tests/compiler/cmm/zero.cmm @@ -0,0 +1,14 @@ +// RUN: "$HC" -cpp -dcmm-lint -keep-s-file -c "$1" && cat "${1%%.*}.s" | FileCheck "$1" -check-prefix=CHECK-RV64 +// RUN: "$CC" "${1%%.*}.o" -o "${1%%.*}.exe" +// RUN: "$EXEC" "${1%%.cmm}.exe" + +#include "Cmm.h" +#include "Types.h" + +main(){ + I64 zero; + // Should refer to the zero register + // CHECK-RV64: addi t0, zero, 0 + zero = 0; + foreign "C" exit(zero); +} Index: ghc-9.8.2/testsuite/tests/codeGen/should_run/CCallConv.hs =================================================================== --- /dev/null +++ ghc-9.8.2/testsuite/tests/codeGen/should_run/CCallConv.hs @@ -0,0 +1,123 @@ +{-# LANGUAGE 
ForeignFunctionInterface #-} +{-# LANGUAGE GHCForeignImportPrim #-} +{-# LANGUAGE MagicHash #-} +{-# LANGUAGE UnboxedTuples #-} +{-# LANGUAGE UnliftedFFITypes #-} + +-- | This test ensures that sub-word signed and unsigned parameters are correctly +-- handed over to C functions. I.e. it asserts the calling-convention. +-- +-- The number of parameters is currently shaped for the RISCV64 calling-convention. +-- You may need to add more parameters to the C functions in case there are more +-- registers reserved for parameters in your architecture. +module Main where + +import Data.Word +import GHC.Exts +import GHC.Int + +foreign import ccall "fun8" + fun8 :: + Int8# -> -- a0 + Word8# -> -- a1 + Int8# -> -- a2 + Int8# -> -- a3 + Int8# -> -- a4 + Int8# -> -- a5 + Int8# -> -- a6 + Int8# -> -- a7 + Word8# -> -- s0 + Int8# -> -- s1 + Int64# -- result + +foreign import ccall "fun16" + fun16 :: + Int16# -> -- a0 + Word16# -> -- a1 + Int16# -> -- a2 + Int16# -> -- a3 + Int16# -> -- a4 + Int16# -> -- a5 + Int16# -> -- a6 + Int16# -> -- a7 + Word16# -> -- s0 + Int16# -> -- s1 + Int64# -- result + +foreign import ccall "fun32" + fun32 :: + Int32# -> -- a0 + Word32# -> -- a1 + Int32# -> -- a2 + Int32# -> -- a3 + Int32# -> -- a4 + Int32# -> -- a5 + Int32# -> -- a6 + Int32# -> -- a7 + Word32# -> -- s0 + Int32# -> -- s1 + Int64# -- result + +foreign import ccall "funFloat" + funFloat :: + Float# -> -- a0 + Float# -> -- a1 + Float# -> -- a2 + Float# -> -- a3 + Float# -> -- a4 + Float# -> -- a5 + Float# -> -- a6 + Float# -> -- a7 + Float# -> -- s0 + Float# -> -- s1 + Float# -- result + +foreign import ccall "funDouble" + funDouble :: + Double# -> -- a0 + Double# -> -- a1 + Double# -> -- a2 + Double# -> -- a3 + Double# -> -- a4 + Double# -> -- a5 + Double# -> -- a6 + Double# -> -- a7 + Double# -> -- s0 + Double# -> -- s1 + Double# -- result + +main :: IO () +main = + -- N.B. 
the values here aren't choosen by accident: -1 means all bits one in + -- twos-complement, which is the same as the max word value. + let i8 :: Int8# = intToInt8# (-1#) + w8 :: Word8# = wordToWord8# (255##) + res8 :: Int64# = fun8 i8 w8 i8 i8 i8 i8 i8 i8 w8 i8 + expected_res8 :: Int64 = 2 * (fromInteger . fromIntegral) (maxBound :: Word8) + 8 * (-1) + i16 :: Int16# = intToInt16# (-1#) + w16 :: Word16# = wordToWord16# (65535##) + res16 :: Int64# = fun16 i16 w16 i16 i16 i16 i16 i16 i16 w16 i16 + expected_res16 :: Int64 = 2 * (fromInteger . fromIntegral) (maxBound :: Word16) + 8 * (-1) + i32 :: Int32# = intToInt32# (-1#) + w32 :: Word32# = wordToWord32# (4294967295##) + res32 :: Int64# = fun32 i32 w32 i32 i32 i32 i32 i32 i32 w32 i32 + expected_res32 :: Int64 = 2 * (fromInteger . fromIntegral) (maxBound :: Word32) + 8 * (-1) + resFloat :: Float = F# (funFloat 1.0# 1.1# 1.2# 1.3# 1.4# 1.5# 1.6# 1.7# 1.8# 1.9#) + resDouble :: Double = D# (funDouble 1.0## 1.1## 1.2## 1.3## 1.4## 1.5## 1.6## 1.7## 1.8## 1.9##) + in do + print $ "fun8 result:" ++ show (I64# res8) + assertEqual expected_res8 (I64# res8) + print $ "fun16 result:" ++ show (I64# res16) + assertEqual expected_res16 (I64# res16) + print $ "fun32 result:" ++ show (I64# res32) + assertEqual expected_res32 (I64# res32) + print $ "funFloat result:" ++ show resFloat + assertEqual (14.5 :: Float) resFloat + print $ "funDouble result:" ++ show resDouble + assertEqual (14.5 :: Double) resDouble + +assertEqual :: (Eq a, Show a) => a -> a -> IO () +assertEqual a b = + if a == b + then pure () + else error $ show a ++ " =/= " ++ show b Index: ghc-9.8.2/testsuite/tests/codeGen/should_run/CCallConv.stdout =================================================================== --- /dev/null +++ ghc-9.8.2/testsuite/tests/codeGen/should_run/CCallConv.stdout @@ -0,0 +1,60 @@ +"fun8 result:502" +"fun16 result:131062" +"fun32 result:8589934582" +"funFloat result:14.5" +"funDouble result:14.5" +fun32: +a0: 0xffffffff -1 +a1: 0xffffffff 
4294967295 +a2: 0xffffffff -1 +a3: 0xffffffff -1 +a4: 0xffffffff -1 +a5: 0xffffffff -1 +a6: 0xffffffff -1 +a7: 0xffffffff -1 +s0: 0xffffffff -1 +s1: 0xffffffff 4294967295 +fun16: +a0: 0xffffffff -1 +a1: 0xffff 65535 +a2: 0xffffffff -1 +a3: 0xffffffff -1 +a4: 0xffffffff -1 +a5: 0xffffffff -1 +a6: 0xffffffff -1 +a7: 0xffffffff -1 +s0: 0xffffffff -1 +s1: 0xffff 65535 +fun8: +a0: 0xffffffff -1 +a1: 0xff 255 +a2: 0xffffffff -1 +a3: 0xffffffff -1 +a4: 0xffffffff -1 +a5: 0xffffffff -1 +a6: 0xffffffff -1 +a7: 0xffffffff -1 +s0: 0xffffffff -1 +s1: 0xff 255 +funFloat: +a0: 1.000000 +a1: 1.100000 +a2: 1.200000 +a3: 1.300000 +a4: 1.400000 +a5: 1.500000 +a6: 1.600000 +a7: 1.700000 +s0: 1.800000 +s1: 1.900000 +funDouble: +a0: 1.000000 +a1: 1.100000 +a2: 1.200000 +a3: 1.300000 +a4: 1.400000 +a5: 1.500000 +a6: 1.600000 +a7: 1.700000 +s0: 1.800000 +s1: 1.900000 Index: ghc-9.8.2/testsuite/tests/codeGen/should_run/CCallConv_c.c =================================================================== --- /dev/null +++ ghc-9.8.2/testsuite/tests/codeGen/should_run/CCallConv_c.c @@ -0,0 +1,91 @@ +#include "stdint.h" +#include "stdio.h" + +int64_t fun8(int8_t a0, uint8_t a1, int8_t a2, int8_t a3, int8_t a4, int8_t a5, + int8_t a6, int8_t a7, int8_t s0, uint8_t s1) { + printf("fun8:\n"); + printf("a0: %#x %hhd\n", a0, a0); + printf("a1: %#x %hhu\n", a1, a1); + printf("a2: %#x %hhd\n", a2, a2); + printf("a3: %#x %hhd\n", a3, a3); + printf("a4: %#x %hhd\n", a4, a4); + printf("a5: %#x %hhd\n", a5, a5); + printf("a6: %#x %hhd\n", a6, a6); + printf("a7: %#x %hhd\n", a7, a7); + printf("s0: %#x %hhd\n", s0, s0); + printf("s1: %#x %hhu\n", s1, s1); + + return a0 + a1 + a2 + a3 + a4 + a5 + a6 + a7 + s0 + s1; +} + +int64_t fun16(int16_t a0, uint16_t a1, int16_t a2, int16_t a3, int16_t a4, + int16_t a5, int16_t a6, int16_t a7, int16_t s0, uint16_t s1) { + printf("fun16:\n"); + printf("a0: %#x %hd\n", a0, a0); + printf("a1: %#x %hu\n", a1, a1); + printf("a2: %#x %hd\n", a2, a2); + printf("a3: %#x %hd\n", 
a3, a3); + printf("a4: %#x %hd\n", a4, a4); + printf("a5: %#x %hd\n", a5, a5); + printf("a6: %#x %hd\n", a6, a6); + printf("a7: %#x %hd\n", a7, a7); + printf("s0: %#x %hd\n", s0, s0); + printf("s1: %#x %hu\n", s1, s1); + + return a0 + a1 + a2 + a3 + a4 + a5 + a6 + a7 + s0 + s1; +} + +int64_t fun32(int32_t a0, uint32_t a1, int32_t a2, int32_t a3, int32_t a4, + int32_t a5, int32_t a6, int32_t a7, int32_t s0, uint32_t s1) { + printf("fun32:\n"); + printf("a0: %#x %d\n", a0, a0); + printf("a1: %#x %u\n", a1, a1); + printf("a2: %#x %d\n", a2, a2); + printf("a3: %#x %d\n", a3, a3); + printf("a4: %#x %d\n", a4, a4); + printf("a5: %#x %d\n", a5, a5); + printf("a6: %#x %d\n", a6, a6); + printf("a7: %#x %d\n", a7, a7); + printf("s0: %#x %d\n", s0, s0); + printf("s1: %#x %u\n", s1, s1); + + // Ensure the addition happens in long int (not just int) precission. + // Otherwise, the result is truncated during the operation. + int64_t force_int64_precission = 0; + return force_int64_precission + a0 + a1 + a2 + a3 + a4 + a5 + a6 + a7 + s0 + + s1; +} + +float funFloat(float a0, float a1, float a2, float a3, float a4, float a5, + float a6, float a7, float s0, float s1) { + printf("funFloat:\n"); + printf("a0: %f\n", a0); + printf("a1: %f\n", a1); + printf("a2: %f\n", a2); + printf("a3: %f\n", a3); + printf("a4: %f\n", a4); + printf("a5: %f\n", a5); + printf("a6: %f\n", a6); + printf("a7: %f\n", a7); + printf("s0: %f\n", s0); + printf("s1: %f\n", s1); + + return a0 + a1 + a2 + a3 + a4 + a5 + a6 + a7 + s0 + s1; +} + +double funDouble(double a0, double a1, double a2, double a3, double a4, double a5, + double a6, double a7, double s0, double s1) { + printf("funDouble:\n"); + printf("a0: %f\n", a0); + printf("a1: %f\n", a1); + printf("a2: %f\n", a2); + printf("a3: %f\n", a3); + printf("a4: %f\n", a4); + printf("a5: %f\n", a5); + printf("a6: %f\n", a6); + printf("a7: %f\n", a7); + printf("s0: %f\n", s0); + printf("s1: %f\n", s1); + + return a0 + a1 + a2 + a3 + a4 + a5 + a6 + a7 + s0 + s1; 
+} Index: ghc-9.8.2/testsuite/tests/codeGen/should_run/MulMayOflo_cmm.cmm =================================================================== --- /dev/null +++ ghc-9.8.2/testsuite/tests/codeGen/should_run/MulMayOflo_cmm.cmm @@ -0,0 +1,89 @@ +// Suppress empty ASSERT() optimization +#define USE_ASSERTS_ALL_WAYS 1 + +#include "Cmm.h" + +runCmmzh() { +// BEWARE: Cmm isn't really type checked. I.e. you may construct +// 256::I8, which is obviously wrong and let's to strange behaviour. + +// N.B. the contract of '%mulmayoflo' is a bit weak: +// "Return non-zero if there is any possibility that the signed multiply +// of a and b might overflow. Return zero only if you are absolutely sure +// that it won't overflow. If in doubt, return non-zero." (Stg.h) +// So, this test might be a bit too strict for some architectures as it +// expects a perfect implementation. + + // --- I8 + ASSERT(%mulmayoflo(1::I8, 1::I8) == 0::I8); + ASSERT(%mulmayoflo(0::I8, 0::I8) == 0::I8); + ASSERT(%mulmayoflo(-1::I8, 1::I8) == 0::I8); + ASSERT(%mulmayoflo(1::I8, -1::I8) == 0::I8); + ASSERT(%mulmayoflo(127::I8, 1::I8) == 0::I8); + ASSERT(%mulmayoflo(1::I8, 127::I8) == 0::I8); + ASSERT(%mulmayoflo(63::I8, 2::I8) == 0::I8); + ASSERT(%mulmayoflo(2::I8, 63::I8) == 0::I8); + ASSERT(%mulmayoflo(64::I8, 2::I8) > 0::I8); + ASSERT(%mulmayoflo(2::I8, 64::I8) > 0::I8); + ASSERT(%mulmayoflo(127::I8, -1::I8) == 0::I8); + ASSERT(%mulmayoflo(-1::I8, 127::I8) == 0::I8); + ASSERT(%mulmayoflo(-128::I8, 1::I8) == 0::I8); + ASSERT(%mulmayoflo(-64::I8, 2::I8) == 0::I8); + ASSERT(%mulmayoflo(2::I8, -64::I8) == 0::I8); + ASSERT(%mulmayoflo(-65::I8, 2::I8) > 0::I8); + ASSERT(%mulmayoflo(2::I8, -65::I8) > 0::I8); + + // --- I16 + ASSERT(%mulmayoflo(1::I16, 1::I16) == 0::I16); + ASSERT(%mulmayoflo(0::I16, 0::I16) == 0::I16); + ASSERT(%mulmayoflo(-1::I16, 1::I16) == 0::I16); + ASSERT(%mulmayoflo(1::I16, -1::I16) == 0::I16); + ASSERT(%mulmayoflo(32767::I16, 1::I16) == 0::I16); + ASSERT(%mulmayoflo(1::I16,32767 ::I16) == 
0::I16); + ASSERT(%mulmayoflo(16383::I16, 2::I16) == 0::I16); + ASSERT(%mulmayoflo(2::I16, 16383::I16) == 0::I16); + ASSERT(%mulmayoflo(16384::I16, 2::I16) > 0::I16); + ASSERT(%mulmayoflo(2::I16, 16384::I16) > 0::I16); + ASSERT(%mulmayoflo(-16384::I16, 2::I16) == 0::I16); + ASSERT(%mulmayoflo(2::I16, -16384::I16) == 0::I16); + ASSERT(%mulmayoflo(-32768::I16, 1::I16) == 0::I16); + ASSERT(%mulmayoflo(1::I16, -32768::I16) == 0::I16); + + // -- I32 + ASSERT(%mulmayoflo(1::I32, 1::I32) == 0::I32); + ASSERT(%mulmayoflo(0::I32, 0::I32) == 0::I32); + ASSERT(%mulmayoflo(-1::I32, 1::I32) == 0::I32); + ASSERT(%mulmayoflo(1::I32, -1::I32) == 0::I32); + ASSERT(%mulmayoflo(2147483647::I32, 1::I32) == 0::I32); + ASSERT(%mulmayoflo(1::I32, 2147483647::I32) == 0::I32); + ASSERT(%mulmayoflo(-2147483648::I32, 1::I32) == 0::I32); + ASSERT(%mulmayoflo(1::I32, -2147483648::I32) == 0::I32); + ASSERT(%mulmayoflo(2::I32, 1073741823::I32) == 0::I32); + ASSERT(%mulmayoflo(1073741823::I32, 2::I32) == 0::I32); + ASSERT(%mulmayoflo(2::I32, 1073741824::I32) > 0::I32); + ASSERT(%mulmayoflo(1073741824::I32, 2::I32) > 0::I32); + ASSERT(%mulmayoflo(2::I32, -1073741824::I32) == 0::I32); + ASSERT(%mulmayoflo(-1073741824::I32, 2::I32) == 0::I32); + ASSERT(%mulmayoflo(2::I32, -1073741825::I32) > 0::I32); + ASSERT(%mulmayoflo(-1073741825::I32, 2::I32) > 0::I32); + + // -- I64 + ASSERT(%mulmayoflo(1::I64, 1::I64) == 0::I64); + ASSERT(%mulmayoflo(0::I64, 0::I64) == 0::I64); + ASSERT(%mulmayoflo(-1::I64, 1::I64) == 0::I64); + ASSERT(%mulmayoflo(1::I64, -1::I64) == 0::I64); + ASSERT(%mulmayoflo(9223372036854775807::I64, 1::I64) == 0::I64); + ASSERT(%mulmayoflo(1::I64, 9223372036854775807::I64) == 0::I64); + ASSERT(%mulmayoflo(-9223372036854775808::I64, 1::I64) == 0::I64); + ASSERT(%mulmayoflo(1::I64, -9223372036854775808::I64) == 0::I64); + ASSERT(%mulmayoflo(4611686018427387903::I64, 2::I64) == 0::I64); + ASSERT(%mulmayoflo(2::I64, 4611686018427387903::I64) == 0::I64); + 
ASSERT(%mulmayoflo(-4611686018427387904::I64, 2::I64) == 0::I64); + ASSERT(%mulmayoflo(2::I64, -4611686018427387904::I64) == 0::I64); + ASSERT(%mulmayoflo(-4611686018427387905::I64, 2::I64) > 0::I64); + ASSERT(%mulmayoflo(2::I64, -4611686018427387905::I64) > 0::I64); + ASSERT(%mulmayoflo(4611686018427387904::I64, 2::I64) > 0::I64); + ASSERT(%mulmayoflo(2::I64, 4611686018427387904::I64) > 0::I64); + + return(0); +}
Locations
Projects
Search
Status Monitor
Help
OpenBuildService.org
Documentation
API Documentation
Code of Conduct
Contact
Support
@OBShq
Terms
openSUSE Build Service is sponsored by
The Open Build Service is an
openSUSE project
.
Sign Up
Log In
Places
Places
All Projects
Status Monitor