Peeter Joot's Blog

The quest for DWARF instrumentation in a toy MLIR compiler

May 25, 2025 C/C++ development and debugging. debug, dwarf, flang, fused-location, gdb, LLVM IR, lowering, MLIR

I’ve gotten my toy compiler project to the stage where I can generate a program, compile and run it. Here’s a little example:

Screenshot

This is the MLIR that is generated for this little program:

"builtin.module"() ({
  "toy.program"() ({
    "toy.declare"() <{name = "x", type = f64}> : () -> () loc(#loc)
    %0 = "arith.constant"() <{value = 5 : i64}> : () -> i64 loc(#loc1)
    %1 = "arith.constant"() <{value = 3.140000e+00 : f64}> : () -> f64 loc(#loc1)
    %2 = "toy.add"(%0, %1) : (i64, f64) -> f64 loc(#loc1)
    "toy.assign"(%2) <{name = "x"}> : (f64) -> () loc(#loc1)
    %3 = "toy.load"() <{name = "x"}> : () -> f64 loc(#loc2)
    "toy.print"(%3) : (f64) -> () loc(#loc2)
    "toy.declare"() <{name = "y", type = f64}> : () -> () loc(#loc3)
    %4 = "toy.load"() <{name = "x"}> : () -> f64 loc(#loc4)
    %5 = "arith.constant"() <{value = 2 : i64}> : () -> i64 loc(#loc4)
    %6 = "toy.mul"(%4, %5) : (f64, i64) -> f64 loc(#loc4)
    "toy.assign"(%6) <{name = "y"}> : (f64) -> () loc(#loc4)
    %7 = "toy.load"() <{name = "y"}> : () -> f64 loc(#loc5)
    "toy.print"(%7) : (f64) -> () loc(#loc5)
    "toy.exit"() : () -> () loc(#loc)
  }) : () -> () loc(#loc)
}) : () -> () loc(#loc)
#loc = loc("test.toy":1:1)
#loc1 = loc("test.toy":2:5)
#loc2 = loc("test.toy":3:1)
#loc3 = loc("test.toy":4:1)
#loc4 = loc("test.toy":5:5)
#loc5 = loc("test.toy":6:1)

Notice how the location information is baked into the cake. After lowering of all the MLIR operations to LLVM-IR (except for the top most MLIR module), we have:

"builtin.module"() ({
  "llvm.func"() <{CConv = #llvm.cconv, function_type = !llvm.func, linkage = #llvm.linkage, sym_name = "__toy_print_f64", visibility_ = 0 : i64}> ({
  }) : () -> () loc(#loc)
  "llvm.func"() <{CConv = #llvm.cconv, function_type = !llvm.func, linkage = #llvm.linkage, sym_name = "__toy_print_i64", visibility_ = 0 : i64}> ({
  }) : () -> () loc(#loc)
  "llvm.func"() <{CConv = #llvm.cconv, function_type = !llvm.func, linkage = #llvm.linkage, sym_name = "main", visibility_ = 0 : i64}> ({
    %0 = "llvm.mlir.constant"() <{value = 1 : i64}> : () -> i64 loc(#loc)
    %1 = "llvm.alloca"(%0) <{alignment = 8 : i64, elem_type = f64}> : (i64) -> !llvm.ptr loc(#loc)
    %2 = "llvm.mlir.constant"() <{value = 5 : i64}> : () -> i64 loc(#loc1)
    %3 = "llvm.mlir.constant"() <{value = 3.140000e+00 : f64}> : () -> f64 loc(#loc1)
    %4 = "llvm.sitofp"(%2) : (i64) -> f64 loc(#loc1)
    %5 = "llvm.fadd"(%4, %3) <{fastmathFlags = #llvm.fastmath}> : (f64, f64) -> f64 loc(#loc1)
    "llvm.store"(%5, %1) <{ordering = 0 : i64}> : (f64, !llvm.ptr) -> () loc(#loc1)
    %6 = "llvm.load"(%1) <{ordering = 0 : i64}> : (!llvm.ptr) -> f64 loc(#loc2)
    "llvm.call"(%6) <{CConv = #llvm.cconv, TailCallKind = #llvm.tailcallkind, callee = @__toy_print_f64, fastmathFlags = #llvm.fastmath, op_bundle_sizes = array, operandSegmentSizes = array}> : (f64) -> () loc(#loc2)
    %7 = "llvm.alloca"(%0) <{alignment = 8 : i64, elem_type = f64}> : (i64) -> !llvm.ptr loc(#loc3)
    %8 = "llvm.load"(%1) <{ordering = 0 : i64}> : (!llvm.ptr) -> f64 loc(#loc4)
    %9 = "llvm.mlir.constant"() <{value = 2 : i64}> : () -> i64 loc(#loc4)
    %10 = "llvm.sitofp"(%9) : (i64) -> f64 loc(#loc4)
    %11 = "llvm.fmul"(%8, %10) <{fastmathFlags = #llvm.fastmath}> : (f64, f64) -> f64 loc(#loc4)
    "llvm.store"(%11, %7) <{ordering = 0 : i64}> : (f64, !llvm.ptr) -> () loc(#loc4)
    %12 = "llvm.load"(%7) <{ordering = 0 : i64}> : (!llvm.ptr) -> f64 loc(#loc5)
    "llvm.call"(%12) <{CConv = #llvm.cconv, TailCallKind = #llvm.tailcallkind, callee = @__toy_print_f64, fastmathFlags = #llvm.fastmath, op_bundle_sizes = array, operandSegmentSizes = array}> : (f64) -> () loc(#loc5)
    %13 = "llvm.mlir.constant"() <{value = 0 : i32}> : () -> i32 loc(#loc)
    "llvm.return"(%13) : (i32) -> () loc(#loc)
  }) : () -> () loc(#loc)
}) : () -> () loc(#loc)
#loc = loc("../samples/test.toy":1:1)
#loc1 = loc("../samples/test.toy":2:5)
#loc2 = loc("../samples/test.toy":3:1)
#loc3 = loc("../samples/test.toy":4:1)
#loc4 = loc("../samples/test.toy":5:5)
#loc5 = loc("../samples/test.toy":6:1)

All is still good. See how all the location information has been retained through the lowering to LLVM-IR transformations. However, as soon as we run mlir::translateModuleToLLVMIR, our final LLVM-IR ends up stripped of all the location information:

; ModuleID = '../samples/test.toy'
source_filename = "../samples/test.toy"

declare void @__toy_print_f64(double)

declare void @__toy_print_i64(i64)

define i32 @main() {
  %1 = alloca double, i64 1, align 8
  store double 8.140000e+00, ptr %1, align 8
  %2 = load double, ptr %1, align 8
  call void @__toy_print_f64(double %2)
  %3 = alloca double, i64 1, align 8
  %4 = load double, ptr %1, align 8
  %5 = fmul double %4, 2.000000e+00
  store double %5, ptr %3, align 8
  %6 = load double, ptr %3, align 8
  call void @__toy_print_f64(double %6)
  ret i32 0
}

!llvm.module.flags = !{!0}

!0 = !{i32 2, !"Debug Info Version", i32 3}

If we build a simple standalone C program with clang, it’s a much different story:

int main() {
    int x = 42;
    return 0;
}

fedora:/home/pjoot/toycalculator/prototypes> clang -g -S -emit-llvm test.c
fedora:/home/pjoot/toycalculator/prototypes> cat test.s
; ModuleID = 'test.c'
source_filename = "test.c"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-redhat-linux-gnu"

; Function Attrs: noinline nounwind optnone uwtable
define dso_local i32 @main() #0 !dbg !8 {
  %1 = alloca i32, align 4
  %2 = alloca i32, align 4
  store i32 0, ptr %1, align 4
    #dbg_declare(ptr %2, !13, !DIExpression(), !14)
  store i32 42, ptr %2, align 4, !dbg !14
  ret i32 0, !dbg !15
}

attributes #0 = { noinline nounwind optnone uwtable "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }

!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!2, !3, !4, !5, !6}
!llvm.ident = !{!7}

!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang version 20.1.3 (Fedora 20.1.3-1.fc42)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None)
!1 = !DIFile(filename: "test.c", directory: "/home/pjoot/toycalculator/prototypes", checksumkind: CSK_MD5, checksum: "8d269ab8efc2854a823fb05a50ccf4b0")
!2 = !{i32 7, !"Dwarf Version", i32 5}
!3 = !{i32 2, !"Debug Info Version", i32 3}
!4 = !{i32 1, !"wchar_size", i32 4}
!5 = !{i32 7, !"uwtable", i32 2}
!6 = !{i32 7, !"frame-pointer", i32 2}
!7 = !{!"clang version 20.1.3 (Fedora 20.1.3-1.fc42)"}
!8 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 1, type: !9, scopeLine: 1, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !12)
!9 = !DISubroutineType(types: !10)
!10 = !{!11}
!11 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
!12 = !{}
!13 = !DILocalVariable(name: "x", scope: !8, file: !1, line: 2, type: !11)
!14 = !DILocation(line: 2, column: 9, scope: !8)
!15 = !DILocation(line: 3, column: 5, scope: !8)

This has a number of features that our automatically-lowered LLVM-IR does not have including:

!DICompileUnit
!DIFile
Dwarf Version
!DISubprogram
!DILocation
!llvm.dbg.cu
source_filename
target datalayout
target triple

It doesn’t surprise me that we don’t have some of these (like target triple, …) without doing extra work, but I’d have expected the location info to be converted into !DILocation.

It’s possible to emit DWARF info from an LLVM builder, which is how a non-MLIR project would do it. I have brutal hack of an example of that in my project — that program is a little standalone beastie that constructs equivalent LLVM-IR for a simulated C program. After MLIR lowering to LLVM-IR, I’m able to inject all the DI instrumentation (relying on the fact that I know the operations in this program, and the line numbers for those operations). It would be very difficult to do this in my toy-language compiler driver, since that has to work for all programs. It is totally and completely obvious that this is the WRONG way to do it. However, the little program is at least debuggable, and perhaps worth something for that reason.

We see the same sort of DI artifacts if we build a Fortran program with flang (not classic-flang, but MLIR flang):

program main
  implicit none
  integer :: x
  x = 42
  print *, x
end program main

The flang LL has all the expected DI elements:

; ModuleID = 'FIRModule'
source_filename = "FIRModule"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

$_QQclXc451cd8b1f5eae110b31be7df6c0fcf9 = comdat any

@_QQclXc451cd8b1f5eae110b31be7df6c0fcf9 = linkonce constant [43 x i8] c"/home/pjoot/toycalculator/fortran/test.f90\00", comdat

define void @_QQmain() #0 !dbg !6 {
  %1 = alloca i32, i64 1, align 4, !dbg !9
    #dbg_declare(ptr %1, !10, !DIExpression(), !9)
  store i32 42, ptr %1, align 4, !dbg !12
  %2 = call ptr @_FortranAioBeginExternalListOutput(i32 6, ptr @_QQclXc451cd8b1f5eae110b31be7df6c0fcf9, i32 5), !dbg !13
  %3 = load i32, ptr %1, align 4, !dbg !14
  %4 = call i1 @_FortranAioOutputInteger32(ptr %2, i32 %3), !dbg !14
  %5 = call i32 @_FortranAioEndIoStatement(ptr %2), !dbg !13
  ret void, !dbg !15
}

declare !dbg !16 ptr @_FortranAioBeginExternalListOutput(i32, ptr, i32)

declare !dbg !21 zeroext i1 @_FortranAioOutputInteger32(ptr, i32)

declare !dbg !25 i32 @_FortranAioEndIoStatement(ptr)

declare !dbg !28 void @_FortranAProgramStart(i32, ptr, ptr, ptr)

declare !dbg !31 void @_FortranAProgramEndStatement()

define i32 @main(i32 %0, ptr %1, ptr %2) #0 !dbg !33 {
  call void @_FortranAProgramStart(i32 %0, ptr %1, ptr %2, ptr null), !dbg !36
  call void @_QQmain(), !dbg !36
  call void @_FortranAProgramEndStatement(), !dbg !36
  ret i32 0, !dbg !36
}

attributes #0 = { "frame-pointer"="all" "target-cpu"="x86-64" }

!llvm.module.flags = !{!0, !1, !2}
!llvm.dbg.cu = !{!3}
!llvm.ident = !{!5}

!0 = !{i32 2, !"Debug Info Version", i32 3}
!1 = !{i32 8, !"PIC Level", i32 2}
!2 = !{i32 7, !"PIE Level", i32 2}
!3 = distinct !DICompileUnit(language: DW_LANG_Fortran95, file: !4, producer: "flang version 20.1.5 (https://github.com/llvm/llvm-project.git 7b09d7b446383b71b63d429b21ee45ba389c5134)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug)
!4 = !DIFile(filename: "test.f90", directory: "/home/pjoot/toycalculator/fortran")
!5 = !{!"flang version 20.1.5 (https://github.com/llvm/llvm-project.git 7b09d7b446383b71b63d429b21ee45ba389c5134)"}
!6 = distinct !DISubprogram(name: "main", linkageName: "_QQmain", scope: !4, file: !4, line: 1, type: !7, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagMainSubprogram, unit: !3)
!7 = !DISubroutineType(cc: DW_CC_program, types: !8)
!8 = !{null}
!9 = !DILocation(line: 3, column: 14, scope: !6)
!10 = !DILocalVariable(name: "x", scope: !6, file: !4, line: 3, type: !11)
!11 = !DIBasicType(name: "integer", size: 32, encoding: DW_ATE_signed)
!12 = !DILocation(line: 4, column: 3, scope: !6)
!13 = !DILocation(line: 5, column: 3, scope: !6)
!14 = !DILocation(line: 5, column: 12, scope: !6)
!15 = !DILocation(line: 6, column: 1, scope: !6)
!16 = !DISubprogram(name: "_FortranAioBeginExternalListOutput", linkageName: "_FortranAioBeginExternalListOutput", scope: !4, file: !4, line: 5, type: !17, scopeLine: 5, spFlags: 0)
!17 = !DISubroutineType(cc: DW_CC_normal, types: !18)
!18 = !{!19, !11, !20, !11}
!19 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !20, size: 64)
!20 = !DIBasicType(name: "integer", size: 8, encoding: DW_ATE_signed)
!21 = !DISubprogram(name: "_FortranAioOutputInteger32", linkageName: "_FortranAioOutputInteger32", scope: !4, file: !4, line: 5, type: !22, scopeLine: 5, spFlags: 0)
!22 = !DISubroutineType(cc: DW_CC_normal, types: !23)
!23 = !{!24, !20, !11}
!24 = !DIBasicType(name: "integer", size: 1, encoding: DW_ATE_signed)
!25 = !DISubprogram(name: "_FortranAioEndIoStatement", linkageName: "_FortranAioEndIoStatement", scope: !4, file: !4, line: 5, type: !26, scopeLine: 5, spFlags: 0)
!26 = !DISubroutineType(cc: DW_CC_normal, types: !27)
!27 = !{!11, !20}
!28 = !DISubprogram(name: "_FortranAProgramStart", linkageName: "_FortranAProgramStart", scope: !4, file: !4, line: 6, type: !29, scopeLine: 6, spFlags: 0)
!29 = !DISubroutineType(cc: DW_CC_normal, types: !30)
!30 = !{null, !11, !11, !11, !11}
!31 = !DISubprogram(name: "_FortranAProgramEndStatement", linkageName: "_FortranAProgramEndStatement", scope: !4, file: !4, line: 6, type: !32, scopeLine: 6, spFlags: 0)
!32 = !DISubroutineType(cc: DW_CC_normal, types: !8)
!33 = distinct !DISubprogram(name: "main", linkageName: "main", scope: !4, file: !4, line: 6, type: !34, scopeLine: 6, spFlags: DISPFlagDefinition, unit: !3)
!34 = !DISubroutineType(cc: DW_CC_normal, types: !35)
!35 = !{!11, !11, !11, !11}
!36 = !DILocation(line: 6, column: 1, scope: !33)

Ignoring all the language bootstrap gorp, this has all the expected DI elements, so we know that there’s a mechanism for flang to emit the DWARF instrumentation.

Looking through the flang source, the only place I found any sort of DI instrumentation was flang/lib/Optimizer/Transforms/AddDebugInfo.cpp, which, in about 600 lines, appears to implement all the DI instrumentation needed for variable and function declarations for the Fortran language (as a MLIR pass.) There’s nothing in there for location translation that I can see, except for a couple of little sneaky bits

bool debugInfoIsAlreadySet(mlir::Location loc) {
  if (mlir::isa<mlir::FusedLoc>(loc)) {
    if (loc->findInstanceOf<mlir::FusedLocWith<fir::LocationKindAttr>>())
      return false;
    return true;
  }
  return false;
}

...
declOp->setLoc(builder.getFusedLoc({declOp->getLoc()}, localVarAttr));
...
globalOp->setLoc(builder.getFusedLoc({globalOp.getLoc()}, arrayAttr));
...

void AddDebugInfoPass::handleFuncOp(mlir::func::FuncOp funcOp,
                                    mlir::LLVM::DIFileAttr fileAttr,
                                    mlir::LLVM::DICompileUnitAttr cuAttr,
                                    fir::DebugTypeGenerator &typeGen,
                                    mlir::SymbolTable *symbolTable) {
  mlir::Location l = funcOp->getLoc();
  // If fused location has already been created then nothing to do
  // Otherwise, create a fused location.
  if (debugInfoIsAlreadySet(l))
    return;
...
// We have the imported entities now. Generate the final DISubprogramAttr.
  spAttr = mlir::LLVM::DISubprogramAttr::get(
      context, recId, /*isRecSelf=*/false, id2, compilationUnit, Scope,
      funcName, fullName, funcFileAttr, line, line, subprogramFlags,
      subTypeAttr, entities, /*annotations=*/{});
  funcOp->setLoc(builder.getFusedLoc({l}, spAttr));
...

It looks as if a fused location is used to replace any MLIR supplied location for each function, but all the rest of the location info is left as is. My presumption is that translateModuleToLLVMIR() handles the plain old loc() references to !dbg. To get an idea what happens in AddDebugInfoPass, let’s look at the MLIR dump at the beginning, and compare to afterwards.

To see this, I used the following gdb script:

set follow-fork-mode child
set detach-on-fork off
set breakpoint pending on
break AddDebugInfoPass::runOnOperation
run

Here’s the debug session to see the module dumps before and after AddDebugInfoPass

fedora:/home/pjoot/toycalculator/fortran> gdb -q -x b --args /usr/local/llvm-20.1.5/bin/flang-new -g -S -emit-llvm test.f90 -o test.ll
...
[Switching to Thread 0x7fffda437640 (LWP 119612)]

Thread 2.1 "flang" hit Breakpoint 1.2, (anonymous namespace)::AddDebugInfoPass::runOnOperation (this=0x6368e0)
    at /home/pjoot/llvm-project/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp:520
520	  mlir::ModuleOp module = getOperation();
(gdb) n
521	  mlir::MLIRContext *context = &getContext();
(gdb) p module->dump()
module attributes {dlti.dl_spec = #dlti.dl_spec : vector<2xi64>, !llvm.ptr<270> = dense<32> : vector<4xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i8 = dense<8> : vector<2xi64>, i1 = dense<8> : vector<2xi64>, !llvm.ptr = dense<64> : vector<4xi64>, f80 = dense<128> : vector<2xi64>, i128 = dense<128> : vector<2xi64>, i64 = dense<64> : vector<2xi64>, !llvm.ptr<271> = dense<32> : vector<4xi64>, !llvm.ptr<272> = dense<64> : vector<4xi64>, "dlti.endianness" = "little", "dlti.stack_alignment" = 128 : i64>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", fir.target_cpu = "x86-64", llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.1.5 (https://github.com/llvm/llvm-project.git 7b09d7b446383b71b63d429b21ee45ba389c5134)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
  func.func @_QQmain() attributes {fir.bindc_name = "main"} {
    %c5_i32 = arith.constant 5 : i32
    %c6_i32 = arith.constant 6 : i32
    %c42_i32 = arith.constant 42 : i32
    %0 = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFEx"}
    %1 = fircg.ext_declare %0 {uniq_name = "_QFEx"} : (!fir.ref) -> !fir.ref
    fir.store %c42_i32 to %1 : !fir.ref
    %2 = fir.address_of(@_QQclXc451cd8b1f5eae110b31be7df6c0fcf9) : !fir.ref<!fir.char<1,43>>
    %3 = fir.convert %2 : (!fir.ref<!fir.char<1,43>>) -> !fir.ref
    %4 = fir.call @_FortranAioBeginExternalListOutput(%c6_i32, %3, %c5_i32) fastmath : (i32, !fir.ref, i32) -> !fir.ref
    %5 = fir.load %1 : !fir.ref
    %6 = fir.call @_FortranAioOutputInteger32(%4, %5) fastmath : (!fir.ref, i32) -> i1
    %7 = fir.call @_FortranAioEndIoStatement(%4) fastmath : (!fir.ref) -> i32
    return
  }
  func.func private @_FortranAioBeginExternalListOutput(i32, !fir.ref, i32) -> !fir.ref attributes {fir.io, fir.runtime}
  fir.global linkonce @_QQclXc451cd8b1f5eae110b31be7df6c0fcf9 constant : !fir.char<1,43> {
    %0 = fir.string_lit "/home/pjoot/toycalculator/fortran/test.f90\00"(43) : !fir.char<1,43>
    fir.has_value %0 : !fir.char<1,43>
  }
  func.func private @_FortranAioOutputInteger32(!fir.ref, i32) -> i1 attributes {fir.io, fir.runtime}
  func.func private @_FortranAioEndIoStatement(!fir.ref) -> i32 attributes {fir.io, fir.runtime}
  func.func private @_FortranAProgramStart(i32, !llvm.ptr, !llvm.ptr, !llvm.ptr)
  func.func private @_FortranAProgramEndStatement()
  func.func @main(%arg0: i32, %arg1: !llvm.ptr, %arg2: !llvm.ptr) -> i32 {
    %c0_i32 = arith.constant 0 : i32
    %0 = fir.zero_bits !fir.ref<tuple<i32, !fir.ref<!fir.array<0xtuple<!fir.ref, !fir.ref>>>>>
    fir.call @_FortranAProgramStart(%arg0, %arg1, %arg2, %0) fastmath : (i32, !llvm.ptr, !llvm.ptr, !fir.ref<tuple<i32, !fir.ref<!fir.array<0xtuple<!fir.ref, !fir.ref>>>>>) -> ()
    fir.call @_QQmain() fastmath : () -> ()
    fir.call @_FortranAProgramEndStatement() fastmath : () -> ()
    return %c0_i32 : i32
  }
}
(gdb) c
Continuing.

Thread 2.1 "flang" hit Breakpoint 3.2, Fortran::frontend::CodeGenAction::generateLLVMIR (this=0x485530) at /home/pjoot/llvm-project/flang/lib/Frontend/FrontendActions.cpp:889
889	  timingScopeMLIRPasses.stop();

(gdb) mlirModule.op.dump()
module attributes {dlti.dl_spec = #dlti.dl_spec : vector<2xi64>, !llvm.ptr<270> = dense<32> : vector<4xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i8 = dense<8> : vector<2xi64>, i1 = dense<8> : vector<2xi64>, !llvm.ptr = dense<64> : vector<4xi64>, f80 = dense<128> : vector<2xi64>, i128 = dense<128> : vector<2xi64>, i64 = dense<64> : vector<2xi64>, !llvm.ptr<271> = dense<32> : vector<4xi64>, !llvm.ptr<272> = dense<64> : vector<4xi64>, "dlti.endianness" = "little", "dlti.stack_alignment" = 128 : i64>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", fir.target_cpu = "x86-64", llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.1.5 (https://github.com/llvm/llvm-project.git 7b09d7b446383b71b63d429b21ee45ba389c5134)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
  llvm.func @_QQmain() attributes {fir.bindc_name = "main", frame_pointer = #llvm.framePointerKind, target_cpu = "x86-64"} {
    %0 = llvm.mlir.constant(1 : i64) : i64
    %1 = llvm.alloca %0 x i32 {bindc_name = "x"} : (i64) -> !llvm.ptr
    %2 = llvm.mlir.constant(5 : i32) : i32
    %3 = llvm.mlir.constant(6 : i32) : i32
    %4 = llvm.mlir.constant(42 : i32) : i32
    %5 = llvm.mlir.constant(1 : i64) : i64
    llvm.intr.dbg.declare #llvm.di_local_variable, id = distinct[1]<>, compileUnit = , sourceLanguage = DW_LANG_Fortran95, file = <"test.f90" in "/home/pjoot/toycalculator/fortran">, producer = "flang version 20.1.5 (https://github.com/llvm/llvm-project.git 7b09d7b446383b71b63d429b21ee45ba389c5134)", isOptimized = false, emissionKind = Full>, scope = #llvm.di_file<"test.f90" in "/home/pjoot/toycalculator/fortran">, name = "main", linkageName = "_QQmain", file = <"test.f90" in "/home/pjoot/toycalculator/fortran">, line = 1, scopeLine = 1, subprogramFlags = "Definition|MainSubprogram", type = >, name = "x", file = <"test.f90" in "/home/pjoot/toycalculator/fortran">, line = 3, type = #llvm.di_basic_type> = %1 : !llvm.ptr
    llvm.store %4, %1 : i32, !llvm.ptr
    %6 = llvm.mlir.addressof @_QQclXc451cd8b1f5eae110b31be7df6c0fcf9 : !llvm.ptr
    %7 = llvm.call @_FortranAioBeginExternalListOutput(%3, %6, %2) {fastmathFlags = #llvm.fastmath} : (i32, !llvm.ptr, i32) -> !llvm.ptr
    %8 = llvm.load %1 : !llvm.ptr -> i32
    %9 = llvm.call @_FortranAioOutputInteger32(%7, %8) {fastmathFlags = #llvm.fastmath} : (!llvm.ptr, i32) -> i1
    %10 = llvm.call @_FortranAioEndIoStatement(%7) {fastmathFlags = #llvm.fastmath} : (!llvm.ptr) -> i32
    llvm.return
  }
  llvm.func @_FortranAioBeginExternalListOutput(i32, !llvm.ptr, i32) -> !llvm.ptr attributes {fir.io, fir.runtime, frame_pointer = #llvm.framePointerKind, sym_visibility = "private", target_cpu = "x86-64"}
  llvm.mlir.global linkonce constant @_QQclXc451cd8b1f5eae110b31be7df6c0fcf9() comdat(@__llvm_comdat::@_QQclXc451cd8b1f5eae110b31be7df6c0fcf9) {addr_space = 0 : i32} : !llvm.array<43 x i8> {
    %0 = llvm.mlir.constant("/home/pjoot/toycalculator/fortran/test.f90\00") : !llvm.array<43 x i8>
    llvm.return %0 : !llvm.array<43 x i8>
  }
  llvm.comdat @__llvm_comdat {
    llvm.comdat_selector @_QQclXc451cd8b1f5eae110b31be7df6c0fcf9 any
  }
  llvm.func @_FortranAioOutputInteger32(!llvm.ptr, i32) -> (i1 {llvm.zeroext}) attributes {fir.io, fir.runtime, frame_pointer = #llvm.framePointerKind, sym_visibility = "private", target_cpu = "x86-64"}
  llvm.func @_FortranAioEndIoStatement(!llvm.ptr) -> i32 attributes {fir.io, fir.runtime, frame_pointer = #llvm.framePointerKind, sym_visibility = "private", target_cpu = "x86-64"}
  llvm.func @_FortranAProgramStart(i32, !llvm.ptr, !llvm.ptr, !llvm.ptr) attributes {frame_pointer = #llvm.framePointerKind, sym_visibility = "private", target_cpu = "x86-64"}
  llvm.func @_FortranAProgramEndStatement() attributes {frame_pointer = #llvm.framePointerKind, sym_visibility = "private", target_cpu = "x86-64"}
  llvm.func @main(%arg0: i32, %arg1: !llvm.ptr, %arg2: !llvm.ptr) -> i32 attributes {frame_pointer = #llvm.framePointerKind, target_cpu = "x86-64"} {
    %0 = llvm.mlir.constant(0 : i32) : i32
    %1 = llvm.mlir.zero : !llvm.ptr
    llvm.call @_FortranAProgramStart(%arg0, %arg1, %arg2, %1) {fastmathFlags = #llvm.fastmath} : (i32, !llvm.ptr, !llvm.ptr, !llvm.ptr) -> ()
    llvm.call @_QQmain() {fastmathFlags = #llvm.fastmath} : () -> ()
    llvm.call @_FortranAProgramEndStatement() {fastmathFlags = #llvm.fastmath} : () -> ()
    llvm.return %0 : i32
  }
}

Even before the AddDebugInfoPass we have target_cpu, data_layout, ident, and target_triple set, so AddDebugInfoPass doesn’t add those. We do have all of these, either added by AddDebugInfoPass, or something after it:

 llvm.intr.dbg.declare #llvm.di_local_variable, id = distinct[1]<>, compileUnit = , sourceLanguage = DW_LANG_Fortran95, file = <"test.f90" in "/home/pjoot/toycalculator/fortran">, producer = "flang version 20.1.5 (https://github.com/llvm/llvm-project.git 7b09d7b446383b71b63d429b21ee45ba389c5134)", isOptimized = false, emissionKind = Full>, scope = #llvm.di_file<"test.f90" in "/home/pjoot/toycalculator/fortran">, name = "main", linkageName = "_QQmain", file = <"test.f90" in "/home/pjoot/toycalculator/fortran">, line = 1, scopeLine = 1, subprogramFlags = "Definition|MainSubprogram", type = >, name = "x", file = <"test.f90" in "/home/pjoot/toycalculator/fortran">, line = 3, type = #llvm.di_basic_type> = %1 : !llvm.ptr

So, it looks like we need to add a compileUnit, subprogram, and if we want debug variables, instrumentation for those too.

In some standalone code (prototypes/simplest.cpp) adding those wasn’t enough to ensure that translateModuleToLLVMIR() didn’t strip out the location info.

What did make the difference was adding this bit:

func->setLoc( builder.getFusedLoc( { loc }, subprogramAttr ) );

with that, the generated LLVM-IR, not only doesn’t have the loc() stripped, but everything is properly converted to !dbg:

; ModuleID = 'test'
source_filename = "test"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

define i32 @main() !dbg !4 {
  %1 = alloca i64, i64 1, align 4, !dbg !8
  store i32 42, ptr %1, align 4, !dbg !8
    #dbg_declare(ptr %1, !9, !DIExpression(), !8)
  ret i32 0, !dbg !10
}

; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare void @llvm.dbg.declare(metadata, metadata, metadata) #0

attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }

!llvm.module.flags = !{!0}
!llvm.dbg.cu = !{!1}
!llvm.ident = !{!3}

!0 = !{i32 2, !"Debug Info Version", i32 3}
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "testcompiler", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug)
!2 = !DIFile(filename: "test.c", directory: ".")
!3 = !{!"toycompiler 0.0"}
!4 = distinct !DISubprogram(name: "main", linkageName: "main", scope: !2, file: !2, line: 1, type: !5, scopeLine: 1, spFlags: DISPFlagDefinition, unit: !1)
!5 = !DISubroutineType(types: !6)
!6 = !{!7}
!7 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
!8 = !DILocation(line: 2, column: 3, scope: !4)
!9 = !DILocalVariable(name: "x", scope: !4, file: !2, line: 2, type: !7, align: 32)
!10 = !DILocation(line: 3, column: 3, scope: !4)

I’m also able to debug a faked source file corresponding to the LLVM-IR generated by this MWE program:

fedora:/home/pjoot/toycalculator/prototypes> ../build/simplest > output.ll
fedora:/home/pjoot/toycalculator/prototypes> clang -g -o output output.ll -Wno-override-module
fedora:/home/pjoot/toycalculator/prototypes> gdb -q ./output
Reading symbols from ./output...
(gdb) b main
Breakpoint 1 at 0x400450: file test.c, line 2.
(gdb) run
Starting program: /home/pjoot/toycalculator/prototypes/output 
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".

Breakpoint 1, main () at test.c:2
2           int x = 42;
(gdb) n
3           return 0;
(gdb) p x
$1 = 42
(gdb) rerun
Undefined command: "rerun".  Try "help".
(gdb) run
The program being debugged has been started already.
Start it from the beginning? (y or n) y
Starting program: /home/pjoot/toycalculator/prototypes/output 
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".

Breakpoint 1, main () at test.c:2
2           int x = 42;
(gdb) b 3
Breakpoint 2 at 0x400458: file test.c, line 3.
(gdb) c
Continuing.

Breakpoint 2, main () at test.c:3
3           return 0;
(gdb) q
A debugging session is active.

        Inferior 1 [process 119835] will be killed.

Quit anyway? (y or n) y

Now that I have this working in a standalone test case, it’s time to do it in the toy compiler itself, but that’s a task for another day.

A peek into C++ std::atomic. Extracting the meaning of std::memory_order_seq_cst.

May 19, 2025 C/C++ development and debugging. ARM, fetch_add, Intel, ISYNC, LDADDA, LDADDAL, LDADDL, LDAR, LDAXR, LDXR, load, LOCK XADD, LOCK XCHG, LWARX, LWSYNC, powerpc, std::atomic, std::memory_order_acquire, std::memory_order_release, std::memory_order_seq_cst, STLR, STLXR, store, STWCX., STXR

When I worked on DB2-LUW, we had to write our own atomic implementation. Ours worked on many different types of hardware, operating systems, and compilers, so was quite representative, and matches the capabilities of the now “new” C++ interfaces. Some of that correspondence is not a surprise, since we provided our implementation to the xlC compiler team, who was advocating against sequentially-consistent being the only option. That was a choice that would have penalized PowerPC significantly. The compiler guys were on the standards sub-committee for std::atomic, and were able to reference our cross platform implementation to advocate for the ordering variations that they also wanted.

It was not terribly easy to implement the DB2 atomics. We used compiler builtins when available, inline assembly in some other cases, and plain old assembly in other cases. Even in cases where the hardware was the same, we were often using different compilers, but even if the compilers were the same (i.e.: intel compiler on both Linux and Windows), there would be differences that had to be accounted for. On top of that, we also supported both 32-bit and 64-bit targets in those days (thankfully having ditched our 16-bit client support by that point in time.) Having 32-bit targets meant that we have to use a mutex based implementation for 64-bit atomics on some platforms. You’d have instructions like cmpxchg8b on intel, and the option of using 64 bit instructions for sparcv9 32-bit targets, and csg instructions for 32-bit zLinux, but even if you could use them, additional care was required because 8 byte alignment was needed for mutexes and 32-bit allocators didn’t always provide that.

I’d assume that DB2 now only supports 64-bit targets, which cuts half of the platform variants out of the picture.

With the atomic support built into C++ now, the complexity of building a cross platform solution is now reduced considerably. One still has to be aware of memory ordering, and all the complexities required to build lock free algorithms, but at least the basic infrastructure is now made trivial.

Since my history with atomic types predates std::atomic, I thought it would be interesting to look at acq/rel/cst variations of some basic std::atomic operations and see what the generated assembly looks like. I only have ARM and X64 machines available to me at the moment, which meant that I can’t see what a PowerPC, IA64, sparc, or SGI compiler generates.

My test code included a couple load variations:

int loadv( int & i32 )
{
    return i32;
}

int load( std::atomic & i32 )
{
    return i32.load();
}

int load_acq( std::atomic & i32 )
{
    return i32.load( std::memory_order_acquire );
}

int load_cst( std::atomic & i32 )
{
    return i32.load( std::memory_order_seq_cst );
}

Initially I had made the atomic object global, but the generated assembly on ARM for a global variable was ugly and distracting, so switching to a plain old parameter made things easier to read.

On intel, this is what we get (trimming out post ‘ret’ garbage from the objdump of each function):

0000000000000000 <loadv(int&)>:
   0:   mov    (%rdi),%eax
   2:   ret

0000000000000010 <load(std::atomic&)>:
  10:   mov    (%rdi),%eax
  12:   ret

0000000000000020 <load_acq(std::atomic&)>:
  20:   mov    (%rdi),%eax
  22:   ret

0000000000000030 <load_cst(std::atomic&)>:
  30:   mov    (%rdi),%eax
  32:   ret

We already have .acq semantics for intel loads, as it is mostly ordered already (my recollection is that only a store-load pair can be reordered on intel.) It is interesting that std::memory_order_seq_cst doesn’t require any MFENCE or LFENCE instructions. Effectively, this lack of additional fencing actually gives us an implicit definition of std::memory_order_seq_cst, but let’s comment on that afterwards.

On ARM we have:

0000000000000000 <loadv(int&)>:
   0:   ldr     w0, [x0]
   4:   ret

0000000000000010 <load(std::atomic&)>:
  10:   ldar    w0, [x0]
  14:   ret

0000000000000020 <load_acq(std::atomic&)>:
  20:   ldar    w0, [x0]
  24:   ret

0000000000000030 <load_cst(std::atomic&)>:
  30:   ldar    w0, [x0]
  34:   ret

We have plain old load instruction for a non-atomic load, so it seems reasonable to guess that ldar means a load with acquire semantics. Before looking that up, let’s look at the assembly for some store variations, using the following code:

void storev( int & i32, int v )
{
    i32 = v;
}

void store( std::atomic & i32, int v )
{
    i32.store( v );
}

void store_rel( std::atomic & i32, int v )
{
    i32.store( v, std::memory_order_release );
}

void store_cst( std::atomic & i32, int v )
{
    i32.store( v, std::memory_order_seq_cst );
}

On intel, we get:

0000000000000040 <storev(int&, int)>:
  40:   mov    %esi,(%rdi)
  42:   ret

0000000000000050 <store(std::atomic&, int)>:
  50:   xchg   %esi,(%rdi)
  52:   ret

0000000000000060 <store_rel(std::atomic&, int)>:
  60:   mov    %esi,(%rdi)
  62:   ret

0000000000000070 <store_cst(std::atomic&, int)>:
  70:   xchg   %esi,(%rdi)
  72:   ret

So, a plain old MOV instruction is equivalent to a store with .rel semantics, whereas a std::memory_order_seq_cst (the default atomic semantics for store) requires an implicit LOCK prefix. How about ARM:

0000000000000040 <storev(int&, int)>:
  40:   str     w1, [x0]
  44:   ret

0000000000000050 <store(std::atomic&, int)>:
  50:   stlr    w1, [x0]
  54:   ret

0000000000000060 <store_rel(std::atomic&, int)>:
  60:   stlr    w1, [x0]
  64:   ret

0000000000000070 <store_cst(std::atomic&, int)>:
  70:   stlr    w1, [x0]
  74:   ret

We see that the ARM load w/ acquire and store w/ release instructions use a special instruction, also providing cst semantics for those operations.

The ARM reference manual describes these memory ordered load/store instructions and their semantics:

LDAR Load-acquire register
STLR Store-release register
Load-Acquire and Store-Release instructions

This matches what we’d expect from a modern instruction set, very much like the ld.acq and st.rel instructions that we had on IA64.

Now let’s look at one more atomic operation, a fetch-and-add:

int fetch_add( std::atomic & i32, int v )
{
    return i32.fetch_add( v );
}

int fetch_add_rel( std::atomic & i32, int v )
{
    return i32.fetch_add( v, std::memory_order_release );
}

int fetch_add_acq( std::atomic & i32, int v )
{
    return i32.fetch_add( v, std::memory_order_acquire );
}

int fetch_add_cst( std::atomic & i32, int v )
{
    return i32.fetch_add( v, std::memory_order_seq_cst );
}

On intel we have:

0000000000000080 <fetch_add(std::atomic&, int)>:
  80:   mov    %esi,%eax
  82:   lock xadd %eax,(%rdi)
  86:   ret

0000000000000090 <fetch_add_rel(std::atomic&, int)>:
  90:   mov    %esi,%eax
  92:   lock xadd %eax,(%rdi)
  96:   ret

00000000000000a0 <fetch_add_acq(std::atomic&, int)>:
  a0:   mov    %esi,%eax
  a2:   lock xadd %eax,(%rdi)
  a6:   ret

00000000000000b0 <fetch_add_cst(std::atomic&, int)>:
  b0:   mov    %esi,%eax
  b2:   lock xadd %eax,(%rdi)
  b6:   ret

There’s no implicit LOCK prefix for xadd, so we see it used explicitly now. On ARM, unfortunately, looking at assembly listings is no longer sufficient:

0000000000000080 <fetch_add(std::atomic&, int)>:
  80:   mov     x2, x0
  84:   stp     x29, x30, [sp, #-16]!
  88:   mov     w0, w1
  8c:   mov     x29, sp
  90:   mov     x1, x2
  94:   bl      0 <__aarch64_ldadd4_acq_rel>
  98:   ldp     x29, x30, [sp], #16
  9c:   ret

00000000000000a0 <fetch_add_rel(std::atomic&, int)>:
  a0:   mov     x2, x0
  a4:   stp     x29, x30, [sp, #-16]!
  a8:   mov     w0, w1
  ac:   mov     x29, sp
  b0:   mov     x1, x2
  b4:   bl      0 <__aarch64_ldadd4_rel>
  b8:   ldp     x29, x30, [sp], #16
  bc:   ret

00000000000000c0 <fetch_add_acq(std::atomic&, int)>:
  c0:   mov     x2, x0
  c4:   stp     x29, x30, [sp, #-16]!
  c8:   mov     w0, w1
  cc:   mov     x29, sp
  d0:   mov     x1, x2
  d4:   bl      0 <__aarch64_ldadd4_acq>
  d8:   ldp     x29, x30, [sp], #16
  dc:   ret

00000000000000e0 <fetch_add_cst(std::atomic&, int)>:
  e0:   mov     x2, x0
  e4:   stp     x29, x30, [sp, #-16]!
  e8:   mov     w0, w1
  ec:   mov     x29, sp
  f0:   mov     x1, x2
  f4:   bl      0 <__aarch64_ldadd4_acq_rel>
  f8:   ldp     x29, x30, [sp], #16
  fc:   ret

We do see that the std::atomic default fetch_add behaviour matches std::memory_order_seq_cst (in this case, an operation with both acquire and release sematics, as was the case on intel due to the LOCK prefix.) We can’t see exactly what happens under the covers for those operations, as there’s a branch-and-link instruction to some system provided interfaces.

Here’s what gdb says these functions do:

(gdb) disassemble __aarch64_ldadd4_acq_rel
Dump of assembler code for function __aarch64_ldadd4_acq_rel:
   0x00000000004008f0 <+0>:	bti	c
=> 0x00000000004008f4 <+4>:	adrp	x16, 0x420000 <__libc_start_main@got.plt>
   0x00000000004008f8 <+8>:	ldrb	w16, [x16, #45]
   0x00000000004008fc <+12>:	cbz	w16, 0x400908 <__aarch64_ldadd4_acq_rel+24>
   0x0000000000400900 <+16>:	ldaddal	w0, w0, [x1]
   0x0000000000400904 <+20>:	ret
   0x0000000000400908 <+24>:	mov	w16, w0
   0x000000000040090c <+28>:	ldaxr	w0, [x1]
   0x0000000000400910 <+32>:	add	w17, w0, w16
   0x0000000000400914 <+36>:	stlxr	w15, w17, [x1]
   0x0000000000400918 <+40>:	cbnz	w15, 0x40090c <__aarch64_ldadd4_acq_rel+28>
   0x000000000040091c <+44>:	ret
End of assembler dump.
(gdb) disassemble __aarch64_ldadd4_rel
Dump of assembler code for function __aarch64_ldadd4_rel:
   0x00000000004008c0 <+0>:	bti	c
   0x00000000004008c4 <+4>:	adrp	x16, 0x420000 <__libc_start_main@got.plt>
   0x00000000004008c8 <+8>:	ldrb	w16, [x16, #45]
   0x00000000004008cc <+12>:	cbz	w16, 0x4008d8 <__aarch64_ldadd4_rel+24>
   0x00000000004008d0 <+16>:	ldaddl	w0, w0, [x1]
   0x00000000004008d4 <+20>:	ret
   0x00000000004008d8 <+24>:	mov	w16, w0
   0x00000000004008dc <+28>:	ldxr	w0, [x1]
   0x00000000004008e0 <+32>:	add	w17, w0, w16
   0x00000000004008e4 <+36>:	stlxr	w15, w17, [x1]
   0x00000000004008e8 <+40>:	cbnz	w15, 0x4008dc <__aarch64_ldadd4_rel+28>
   0x00000000004008ec <+44>:	ret
End of assembler dump.
(gdb) disassemble __aarch64_ldadd4_acq
Dump of assembler code for function __aarch64_ldadd4_acq:
   0x0000000000400890 <+0>:	bti	c
   0x0000000000400894 <+4>:	adrp	x16, 0x420000 <__libc_start_main@got.plt>
   0x0000000000400898 <+8>:	ldrb	w16, [x16, #45]
   0x000000000040089c <+12>:	cbz	w16, 0x4008a8 <__aarch64_ldadd4_acq+24>
   0x00000000004008a0 <+16>:	ldadda	w0, w0, [x1]
   0x00000000004008a4 <+20>:	ret
   0x00000000004008a8 <+24>:	mov	w16, w0
   0x00000000004008ac <+28>:	ldaxr	w0, [x1]
   0x00000000004008b0 <+32>:	add	w17, w0, w16
   0x00000000004008b4 <+36>:	stxr	w15, w17, [x1]
   0x00000000004008b8 <+40>:	cbnz	w15, 0x4008ac <__aarch64_ldadd4_acq+28>
   0x00000000004008bc <+44>:	ret
End of assembler dump.

There’s some sort of runtime determined switching, based on program global variable, probably initialized during pre-main setup. If that variable is not set, then we use one of the following Atomic add on word or doubleword in memory instructions

LDADDA and LDADDAL load from memory with acquire semantics.
LDADDL and LDADDAL store to memory with release semantics.
(LDADD has neither acquire nor release semantics.)

It looks like these hybrid fetch-and-add instructions are optional, also only provided on ARMv8.1-A or later. The fallback looks very familiar to a PowerPC programmer, and are load-with-reservation/store-conditional like instructions (i.e.: lwarx/stwcx.), but also include built in memory ordering semantics, which is nice (so we don’t need any supplemental isync/lwsync’s to get the desired memory ordering.)

Some final thoughts.

We saw that store with release semantics was a plain old intel store instruction, whereas store with cst included a full memory barrier (the lock xchg). My recollection is that store/load was the only reorderable pair of memory operations on intel, which was why we were able to construct a spinlock using xchg (lock implied when the destination was a memory operand), but could use a plain old store for the spinlock release.

We can visualize the spinlock acquire/release like a cage that is porous only from the outside:

loads/stores (dumb tourists)

ATOMIC W/ ACQUIRE (cage wall)

Loads and stores that can’t get out of the cage (lions and tigers)

RELEASE (cage wall)
loads/stores (dumb tourists)

You want to ensure that the loads and stores (the lions and tigers) that are in the cage can’t get out, but if somebody is foolish enough to go into the cage from the outside, you are willing to let them get eaten. On intel the release-store is enough to ensure that preceding loads and stores are complete before anybody can see the side effect of the lock release operation. On PPC we needed isync and lwsync to help guard the cage walls. It looks like we need explicit acquire release semantics on ARM to guard those walls too, but have nice instruction variants to do that.

We’ve seen enough to give meaning to std::memory_order_seq_cst. It would have made more sense to try to look it up first, but where is the fun in that. Instead, we’ve seen that:

loads with std::memory_order_seq_cst have acquire semantics,
stores with std::memory_order_seq_cst have both release semantics and acquire semantics.
Operations that have both load and store aspects of their operations (like fetch-and-add), because they are stores, also have both release and acquire semantics for std::memory_order_seq_cst.

On Intel we saw std::memory_order_seq_cst for hybrid (load/store) operations was achieved with an explicit LOCK prefix (bidirectional fence), and on ARM with instructions like LDADDAL that have both release and acquire semantics by construction.

Adding fixed size types to my toy MLIR compiler

May 17, 2025 C/C++ development and debugging. LLVM IR, lowering, MLIR

In toycalculator, an MLIR/LLVM compiler experiment, I described a rudimentary MLIR based compiler.

I’ve now added fixed size integer types, a boolean type and boolean constants (but not boolean operators), and two floating point types. For the time being, the untyped ‘DCL’ declaration type (FLOAT64) is still in the grammar and the parser/MLIR-builder.

There’s lots still in the TODO list, including control flow, functions and dwarf support. I was quite surprised that given the location requirements for all MLIR statements, we don’t get DWARF instrumentation for free when doing LLVM lowering, and I wonder if I am missing some sort of setup for that.

New Language/Compiler enhancements

Here are the new elements added to the language:

Declare variables with BOOL, INT8, INT16, INT32, INT64, FLOAT32, FLOAT64 types:
```
BOOL b;
INT16 i;
FLOAT32 f;
```
TRUE, FALSE, and floating point constants:
```
b = TRUE;
f = 5 + 3.14E0;
```
An EXIT builtin to return a Unix command line value (must be the last statement in the program):
```
EXIT 1;
EXIT x;
```
Expression type conversions:
```
INT16 x;
FLOAT32 y;
y = 3.14E0;
x = 1 + y;
```
The type conversion rules in the language are not like C.
Instead, all expression elements are converted to the type of the destination before the operation, and integers are truncated.
Example:
```
INT32 x;
x = 1.78 + 3.86E0;
FLOAT64 f;
f = x;
PRINT f;
f = 1.78 + 3.86E0;
PRINT f;
```
The expected output for this program is:
```
4.000000
5.640000
```

MLIR

Example 1

The MLIR for the language now matches the statements of the language much more closely. Consider test.toy for example:

DCL x;
x = 5 + 3.14E0;
PRINT x;
DCL y;
y = x * 2;
PRINT y;

for which the MLIR is now free of memref dialect:

"builtin.module"() ({
  "toy.program"() ({
    "toy.declare"() <{name = "x", type = f64}> : () -> () loc(#loc)
    %0 = "arith.constant"() <{value = 5 : i64}> : () -> i64 loc(#loc1)
    %1 = "arith.constant"() <{value = 3.140000e+00 : f64}> : () -> f64 loc(#loc1)
    %2 = "toy.add"(%0, %1) : (i64, f64) -> f64 loc(#loc1)
    "toy.assign"(%2) <{name = "x"}> : (f64) -> () loc(#loc1)
    %3 = "toy.load"() <{name = "x"}> : () -> f64 loc(#loc2)
    "toy.print"(%3) : (f64) -> () loc(#loc2)
    "toy.declare"() <{name = "y", type = f64}> : () -> () loc(#loc3)
    %4 = "toy.load"() <{name = "x"}> : () -> f64 loc(#loc4)
    %5 = "arith.constant"() <{value = 2 : i64}> : () -> i64 loc(#loc4)
    %6 = "toy.mul"(%4, %5) : (f64, i64) -> f64 loc(#loc4)
    "toy.assign"(%6) <{name = "y"}> : (f64) -> () loc(#loc4)
    %7 = "toy.load"() <{name = "y"}> : () -> f64 loc(#loc5)
    "toy.print"(%7) : (f64) -> () loc(#loc5)
    "toy.exit"() : () -> () loc(#loc)
  }) : () -> () loc(#loc)
}) : () -> () loc(#loc)
#loc = loc("test.toy":1:1)
#loc1 = loc("test.toy":2:5)
#loc2 = loc("test.toy":3:1)
#loc3 = loc("test.toy":4:1)
#loc4 = loc("test.toy":5:5)
#loc5 = loc("test.toy":6:1)

Variable references still use llvm.alloca, but are symbol based in the builder, and llvm.alloca now doesn’t show up until lowering to LLVM-IR:

; ModuleID = 'test.toy'
source_filename = "test.toy"

declare void @__toy_print(double)

define i32 @main() {
  %1 = alloca double, i64 1, align 8
  store double 8.140000e+00, ptr %1, align 8
  %2 = load double, ptr %1, align 8
  call void @__toy_print(double %2)
  %3 = alloca double, i64 1, align 8
  %4 = load double, ptr %1, align 8
  %5 = fmul double %4, 2.000000e+00
  store double %5, ptr %3, align 8
  %6 = load double, ptr %3, align 8
  call void @__toy_print(double %6)
  ret i32 0
}

!llvm.module.flags = !{!0}

!0 = !{i32 2, !"Debug Info Version", i32 3}

Here is an example generated assembly, for the program above:

0000000000000000 <main>:
   0:   push   %rax
   1:   movsd  0x0(%rip),%xmm0        
   9:   call   e <main+0xe>
   e:   movsd  0x0(%rip),%xmm0       
  16:   call   1b <main+0x1b>
  1b:   xor    %eax,%eax
  1d:   pop    %rcx
  1e:   ret

Example 2

Here’s an example that highlights the new type support a bit better:

DESKTOP-16N83AG:/home/pjoot/toycalculator/samples> cat addi.toy 
INT32 x;
x = 5 + 3.14E0;
FLOAT64 f;
f = x;
PRINT f;

The MLIR for this is:

"builtin.module"() ({
  "toy.program"() ({
    "toy.declare"() <{name = "x", type = i32}> : () -> () loc(#loc)
    %0 = "arith.constant"() <{value = 5 : i64}> : () -> i64 loc(#loc1)
    %1 = "arith.constant"() <{value = 3.140000e+00 : f64}> : () -> f64 loc(#loc1)
    %2 = "toy.add"(%0, %1) : (i64, f64) -> i32 loc(#loc1)
    "toy.assign"(%2) <{name = "x"}> : (i32) -> () loc(#loc1)
    "toy.declare"() <{name = "f", type = f64}> : () -> () loc(#loc2)
    %3 = "toy.load"() <{name = "x"}> : () -> i32 loc(#loc3)
    "toy.assign"(%3) <{name = "f"}> : (i32) -> () loc(#loc3)
    %4 = "toy.load"() <{name = "f"}> : () -> f64 loc(#loc4)
    "toy.print"(%4) : (f64) -> () loc(#loc4)
    "toy.exit"() : () -> () loc(#loc)
  }) : () -> () loc(#loc)
}) : () -> () loc(#loc)
#loc = loc("addi.toy":1:1)
#loc1 = loc("addi.toy":2:5)
#loc2 = loc("addi.toy":3:1)
#loc3 = loc("addi.toy":4:5)
#loc4 = loc("addi.toy":5:1)

for which the LLVM IR lowering result is:

; ModuleID = 'addi.toy'
source_filename = "addi.toy"

declare void @__toy_print(double)

define i32 @main() {
  %1 = alloca i32, i64 1, align 4
  store i32 8, ptr %1, align 4
  %2 = alloca double, i64 1, align 8
  %3 = load i32, ptr %1, align 4
  %4 = sitofp i32 %3 to double
  store double %4, ptr %2, align 8
  %5 = load double, ptr %2, align 8
  call void @__toy_print(double %5)
  ret i32 0
}

!llvm.module.flags = !{!0}

!0 = !{i32 2, !"Debug Info Version", i32 3}

Because of lack of side effects, most of that code is obliterated in the assembly printing stage, leaving just:

0000000000000000 <main>:
   0:   push   %rax
   1:   movsd  0x0(%rip),%xmm0
   9:   call   e <main+0xe>
   e:   xor    %eax,%eax
  10:   pop    %rcx
  11:   ret

toycalculator, an MLIR/LLVM compiler experiment.

April 27, 2025 C/C++ development and debugging. antlr4, ASM printer, compiler, LLVM IR, MLIR, tablegen

Over the last 8 years, I’ve been intimately involved in building a pair of LLVM based compilers for the COBOL and PL/I languages. However, a lot of my work was on the runtime side of the story. This was non-trivial work, with lots of complex interactions to figure out, but it also meant that I didn’t get to play with the fun (codegen) part of the compiler.

Over the last month or so, I’ve been incrementally building an MLIR based compiler for a toy language. I thought this would be a great way to get my hands dirty, and learn as I went. These were the elements required:

A lexer/parser grammar for my language, and parse tree walker. I used an altlr4 grammar for this part, emitting C++ parse listener classes.
An MLIR dialect (tablegen definitions) for the language.
Parse tree listener callbacks to do the MLIR builder calls.
Tablegen definitions for the LLVM IR lowering pass.
Lowering classes for all the required operations.
A compiler driver to orchestrate all the interconnected pieces, and drive the assembly printer.

As it turns out, MLIR tablegen is pretty finicky when you have no clue what you are doing. Once you get the basic infrastructure in place it makes a lot more sense, and you can look at the generated C++ classes associated with your tablegen, to get a good idea what is happening under the covers.

Here’s a couple examples that illustrate the toy language:

// empty.toy
// This should be allowed by the grammar.



// dcl.toy
DCL x; // the next simplest non-empty program (i.e.: also has a comment)



// foo.toy
DCL x;
x = 3;
// This indenting is to test location generation, and to verify that the resulting columnar position is right.
     PRINT x;



// unary.toy
DCL x;
x = 3;
x = +x;
x = -x;
PRINT x;



// test.toy
DCL x;
DCL y;
x = 5 + 3;
y = x * 2;
PRINT x;

There is also a RETURN statement, not in any of those examples explicitly. I added that language element to simplify the LLVM lowering process. Here’s a motivating example:

> ../build/toycalculator empty.toy  --location
"builtin.module"() ({
  "toy.program"() ({
  ^bb0:
  }) : () -> () loc(#loc1)
}) : () -> () loc(#loc)
#loc = loc(unknown) 
#loc1 = loc("empty.toy":2:1)

Notice the wierd looking ‘^bb0:’ in the MLIR dump. This is a representation of an empty basic block, and was a bit of a pain to figure out how to lower properly. What I ended up doing is inserting a RETURN operation into the program if there was no other statements. I wanted to support such a dumb trivial program with no actual statements as a first test for the lowering, and see that things worked end to end before getting some of the trickier lowering working. With a return statement, empty.toy’s MLIR now looks like:

"builtin.module"() ({ 
  "toy.program"() ({
    "toy.return"() : () -> () loc(#loc1)
  }) : () -> () loc(#loc1)
}) : () -> () loc(#loc)
#loc = loc(unknown)
#loc1 = loc("../samples/empty.toy":2:1)

The idea behind the lowering operations is that each MLIR operation can be matched with a rewriter operation, and the program can be iterated over, gradually replacing all the MLIR operators with LLVM operations.

For example, an element like:

"toy.return"() : () -> ()

Can be replaced by

  %0 = "llvm.mlir.constant"() <{value = 0 : i32}> : () -> i32
  "llvm.return"(%0) : (i32) -> ()

Once that replacement is made, we can delete the toy.return element:

    // Lower toy.return to nothing (erase).
    class ReturnOpLowering : public ConversionPattern
    {
       public:
        ReturnOpLowering( MLIRContext* context )
            : ConversionPattern( toy::ReturnOp::getOperationName(), 1, context )
        {
        }           

        LogicalResult matchAndRewrite(
            Operation* op, ArrayRef operands,
            ConversionPatternRewriter& rewriter ) const override
        {   
            LLVM_DEBUG( llvm::dbgs()
                        << "Lowering toy.return: " << *op << '\n' );
            // ...
            rewriter.eraseOp( op );
            return success();
        }
    };

In the sample above the toy.program elememt also needs to be deleted. It gets replaced by a llvm basic block, moving all the MLIR BB elements into it. The last step is the removal of the outer most MLIR module, but there’s an existing Dialect for that. When all is said and done, we are left with the following LLVM IR:

define i32 @main() {
  ret i32 0
}

Here’s the MLIR for the foo.toy, which is slightly more interesting

"builtin.module"() ({
  "toy.program"() ({
    %0 = "memref.alloca"() <{operandSegmentSizes = array}> : () -> memref
    "toy.declare"() <{name = "x"}> : () -> ()
    %1 = "arith.constant"() <{value = 3 : i64}> : () -> i64
    %2 = "toy.unary"(%1) <{op = "+"}> : (i64) -> f64
    "memref.store"(%2, %0) : (f64, memref) -> ()
    "toy.assign"(%2) <{name = "x"}> : (f64) -> ()
    "toy.print"(%0) : (memref) -> ()
    "toy.return"() : () -> ()
  }) : () -> ()
}) : () -> ()

As we go through the lowering replacements, more an more of the MLIR operators get replaced with LLVM equivalents. Here’s an example part way through:

"llvm.func"() <{CConv = #llvm.cconv, function_type = !llvm.func, linkage = #llvm.linkage, sym_name = "main", visibility_ = 0 : i64}> ({
  %0 = "llvm.mlir.constant"() <{value = 1 : i64}> : () -> i64
  %1 = "llvm.alloca"(%0) <{alignment = 8 : i64, elem_type = f64}> : (i64) -> !llvm.ptr
  %2 = "memref.alloca"() <{operandSegmentSizes = array}> : () -> memref
  "toy.declare"() <{name = "x"}> : () -> ()
  %3 = "llvm.mlir.constant"() <{value = 3 : i64}> : () -> i64
  %4 = "arith.constant"() <{value = 3 : i64}> : () -> i64
  %5 = "toy.unary"(%4) <{op = "+"}> : (i64) -> f64
  "memref.store"(%5, %2) : (f64, memref) -> ()
  "toy.assign"(%5) <{name = "x"}> : (f64) -> ()
  "toy.print"(%2) : (memref) -> ()
  "toy.return"() : () -> ()
}) : () -> ()

and after a few more:

"llvm.func"() <{CConv = #llvm.cconv, function_type = !llvm.func, linkage = #llvm.linkage, sym_na
me = "main", visibility_ = 0 : i64}> ({
  %0 = "llvm.mlir.constant"() <{value = 1 : i64}> : () -> i64
  %1 = "llvm.alloca"(%0) <{alignment = 8 : i64, elem_type = f64}> : (i64) -> !llvm.ptr
  %2 = "memref.alloca"() <{operandSegmentSizes = array}> : () -> memref
  "toy.declare"() <{name = "x"}> : () -> ()
  %3 = "llvm.mlir.constant"() <{value = 3 : i64}> : () -> i64
  %4 = "arith.constant"() <{value = 3 : i64}> : () -> i64
  %5 = "llvm.sitofp"(%3) : (i64) -> f64
  %6 = "toy.unary"(%4) <{op = "+"}> : (i64) -> f64
  "llvm.store"(%5, %1) <{ordering = 0 : i64}> : (f64, !llvm.ptr) -> ()
  "memref.store"(%6, %2) : (f64, memref) -> ()
  "toy.assign"(%6) <{name = "x"}> : (f64) -> ()
  %7 = "llvm.load"(%1) <{ordering = 0 : i64}> : (!llvm.ptr) -> f64
  "llvm.call"(%7) <{CConv = #llvm.cconv, TailCallKind = #llvm.tailcallkind, callee = @__toy_print, fastmathFlags = #llvm.fastmath, op_bundle_sizes = array, operandSegmentSizes = array}> : (f64) -> ()
  "toy.print"(%2) : (memref) -> ()
  %8 = "llvm.mlir.constant"() <{value = 0 : i32}> : () -> i32
  "llvm.return"(%8) : (i32) -> ()
  "toy.return"() : () -> ()
}) : () -> ()

Eventually, after various LLVM IR blocks get merged (almost magically by one of the passes), we end up with:

declare void @__toy_print(double)

define i32 @main() {
  %1 = alloca double, i64 1, align 8
  store double 3.000000e+00, ptr %1, align 8
  %2 = load double, ptr %1, align 8
  call void @__toy_print(double %2)
  ret i32 0
}

Enabling an assembly printer pass, we get an object file

fedora:/home/pjoot/toycalculator/samples> objdump -dr foo.o

foo.o:     file format elf64-x86-64


Disassembly of section .text:

0000000000000000

: 0: 50 push %rax 1: 48 b8 00 00 00 00 00 movabs $0x4008000000000000,%rax 8: 00 08 40 b: 48 89 04 24 mov %rax,(%rsp) f: f2 0f 10 05 00 00 00 movsd 0x0(%rip),%xmm0 # 17 <main+0x17> 16: 00 13: R_X86_64_PC32 .LCPI0_0-0x4 17: e8 00 00 00 00 call 1c <main+0x1c> 18: R_X86_64_PLT32 __toy_print-0x4 1c: 31 c0 xor %eax,%eax 1e: 59 pop %rcx 1f: c3 ret

Here’s an end to end example of a full compile, link and build of this little module:

fedora:/home/pjoot/toycalculator/samples> ../build/toycalculator foo.toy 
Generated object file: foo.o
fedora:/home/pjoot/toycalculator/samples> clang -o foo foo.o -L ../build -l toy_runtime -Wl,-rpath,`pwd`/../build
fedora:/home/pjoot/toycalculator/samples> ./foo
3.000000

A month of work and 1800 lines of code, and now I can print a single constant number!

A little float explorer program

April 23, 2025 C/C++ development and debugging. bias, double, exponent, float, mantissa

Karl’s 1st year C programming course included IEEE floating point representation. It’s good to understand that representation, but pretty silly to try to do those conversions by hand.

I remember covering this in our computer organization course, but by that time I’d already figured it out for my own purposes. I wasn’t smart enough to look it up, and figured it out the hard way by reverse engineering the layout with printfs or the debugger or something like that.

I couldn’t help myself and wrote a little program to unpack a (32-bit) float into it’s representative parts. The idea is that we have a number in the form:

\begin{equation}\label{eqn:float32:1}
\pm 1.bbbbbbbb x 2^n
\end{equation}

The sizes of those components in a 32-bit C float type are:

1 sign bit,
8 exponent bits,
1 implied 1 mantissa bit,
23 mantissa bits,

where the exponent has a bias (127 = (1<<7)-1) so that the hardware doesn’t have to do any twos complement manipulation.

I’ve put my little floating point explorer program on github. Here is some sample output:

value:    0
hex:      00000000
bits:     00000000000000000000000000000000
sign:     0
exponent:  00000000                        (0+0)
mantissa:          00000000000000000000000
number:          0.00000000000000000000000 x 2^(0)

value:    inf
hex:      7F800000
bits:     01111111100000000000000000000000
sign:     0
exponent:  11111111
mantissa:          00000000000000000000000
number:   +inf

value:    -inf
hex:      FF800000
bits:     11111111100000000000000000000000
sign:     1
exponent:  11111111
mantissa:          00000000000000000000000
number:   -inf

value:    nan
hex:      7FC00000
bits:     01111111110000000000000000000000
sign:     0
exponent:  11111111
mantissa:          10000000000000000000000
number:   NaN

value:    1.1754944e-38
hex:      00800000
bits:     00000000100000000000000000000000
sign:     0
exponent:  00000001                        (127 -126)
mantissa:          00000000000000000000000
number:          1.00000000000000000000000 x 2^(-126)

value:    3.4028235e+38
hex:      7F7FFFFF
bits:     01111111011111111111111111111111
sign:     0
exponent:  11111110                        (127 +127)
mantissa:          11111111111111111111111
number:          1.11111111111111111111111 x 2^(127)
Smallest denormal:

value:    1e-45
hex:      00000001
bits:     00000000000000000000000000000001
sign:     0
exponent:  00000000                        (0-126)
mantissa:          00000000000000000000001
number:          0.00000000000000000000001 x 2^(-126)
Largest denormal:

value:    1.1754942e-38
hex:      007FFFFF
bits:     00000000011111111111111111111111
sign:     0
exponent:  00000000                        (0-126)
mantissa:          11111111111111111111111
number:          0.11111111111111111111111 x 2^(-126)

value:    1
hex:      3F800000
bits:     00111111100000000000000000000000
sign:     0
exponent:  01111111                        (127 +0)
mantissa:          00000000000000000000000
number:          1.00000000000000000000000 x 2^(0)

value:    -2
hex:      C0000000
bits:     11000000000000000000000000000000
sign:     1
exponent:  10000000                        (127 +1)
mantissa:          00000000000000000000000
number:         -1.00000000000000000000000 x 2^(1)

value:    6
hex:      40C00000
bits:     01000000110000000000000000000000
sign:     0
exponent:  10000001                        (127 +2)
mantissa:          10000000000000000000000
number:          1.10000000000000000000000 x 2^(2)

value:    1.5
hex:      3FC00000
bits:     00111111110000000000000000000000
sign:     0
exponent:  01111111                        (127 +0)
mantissa:          10000000000000000000000
number:          1.10000000000000000000000 x 2^(0)

value:    0.125
hex:      3E000000
bits:     00111110000000000000000000000000
sign:     0
exponent:  01111100                        (127 -3)
mantissa:          00000000000000000000000
number:          1.00000000000000000000000 x 2^(-3)

Shoutout to Grok for the code review and the code fragments required to show the representation of NaN, $ \pm \infty $, and denormalized numbers. Grok offered to help extend this to double and long double representations, but where is the fun in letting it do that — that’s an exercise for another day.

EDIT: I updated the code with double decoding support too (and added command line options):

./floatexplorer  --float --double 1
value:    1
hex:      3F800000
bits:     00111111100000000000000000000000
sign:     0
exponent:  01111111                        (127 +0)
mantissa:          00000000000000000000000
number:          1.00000000000000000000000 x 2^(0)

value:    1
hex:      3FF0000000000000
bits:     0011111111110000000000000000000000000000000000000000000000000000
sign:     0
exponent:  01111111111                                                     (1023 +0)
mantissa:             0000000000000000000000000000000000000000000000000000
number:             1.0000000000000000000000000000000000000000000000000000 x 2^(0)

The quest for DWARF instrumentation in a toy MLIR compiler

Like this:

A peek into C++ std::atomic. Extracting the meaning of std::memory_order_seq_cst.

Some final thoughts.

Like this:

Adding fixed size types to my toy MLIR compiler

New Language/Compiler enhancements

MLIR

Example 1

Example 2

Like this:

toycalculator, an MLIR/LLVM compiler experiment.

Like this:

A little float explorer program

Like this:

Pages

Categories

Recent Posts

Archives

Meta

Categories

Recent Posts

Archives

The quest for DWARF instrumentation in a toy MLIR compiler

Share this:

Like this:

A peek into C++ std::atomic. Extracting the meaning of std::memory_order_seq_cst.

Some final thoughts.

Share this:

Like this:

Adding fixed size types to my toy MLIR compiler

New Language/Compiler enhancements

MLIR

Example 1

Example 2

Share this:

Like this:

toycalculator, an MLIR/LLVM compiler experiment.

Share this:

Like this:

A little float explorer program

Share this:

Like this:

Pages

Categories

Recent Posts

Tags

Archives

Meta

Categories

Recent Posts

Tags

Archives