riscv_emulator/disasm.rs
1//! RV32IMA disassembler — objdump-compatible output.
2//!
3//! Converts a 32-bit instruction word into a human-readable string using
4//! ABI register names and common pseudo-instructions. When a [`SymbolTable`]
5//! is provided, branch and jump targets are annotated with symbol names.
6//!
7//! ## Output format
8//!
9//! The format matches GNU objdump (`-M no-aliases` disabled, i.e. pseudos on):
10//!
11//! ```text
12//! 80000014 510010ef jal ra, <main>
13//! 80001524 ff010113 addi sp, sp, -16
14//! 80001528 00112623 sw ra, 12(sp)
15//! ```
16//!
17//! ## Pseudo-instructions
18//!
19//! The following common pseudos are emitted when the encoding matches:
20//!
21//! | Pseudo | Canonical form |
22//! |--------|---------------|
23//! | `nop` | `addi zero, zero, 0` |
24//! | `mv rd, rs` | `addi rd, rs, 0` |
25//! | `li rd, imm` | `addi rd, zero, imm` |
26//! | `not rd, rs` | `xori rd, rs, -1` |
27//! | `neg rd, rs` | `sub rd, zero, rs` |
28//! | `snez rd, rs` | `sltu rd, zero, rs` |
29//! | `ret` | `jalr zero, 0(ra)` |
30//! | `jr rs` | `jalr zero, 0(rs)` |
31//! | `j offset` | `jal zero, offset` |
//! | `beqz/bnez/bltz/bgez/blez/bgtz` | branch with zero as one operand |
33//!
34//! ## Static disassembly
35//!
36//! [`disasm_elf`] disassembles an entire ELF image segment by segment, with
37//! symbol labels printed before each function — equivalent to `objdump -d`.
38
39use crate::elf::{ElfImage, SymbolTable};
40
41// ── ABI register names ───────────────────────────────────────────────────────
42
/// ABI names of the 32 integer registers, indexed by register number.
const REG_NAMES: [&str; 32] = [
    // x0 – x7
    "zero", "ra", "sp", "gp", "tp", "t0", "t1", "t2",
    // x8 – x15
    "s0", "s1", "a0", "a1", "a2", "a3", "a4", "a5",
    // x16 – x23
    "a6", "a7", "s2", "s3", "s4", "s5", "s6", "s7",
    // x24 – x31
    "s8", "s9", "s10", "s11", "t3", "t4", "t5", "t6",
];

/// Look up the ABI name for register number `r`.
///
/// Only the low five bits of `r` are used, so every `u32` is a valid
/// argument and the table lookup cannot go out of bounds.
#[inline]
fn reg(r: u32) -> &'static str {
    let index = (r % 32) as usize;
    REG_NAMES[index]
}
54
55// ── Immediate decoders (pure functions, same logic as emulator.rs) ────────────
56
/// Decode the sign-extended I-type immediate (instruction bits 31:20).
#[inline]
fn imm_i(ir: u32) -> i32 {
    // Shifting the word as a signed value replicates bit 31 into the
    // vacated upper bits, which is exactly the 12-bit sign extension.
    (ir as i32) >> 20
}
/// Decode the sign-extended S-type immediate (bits 31:25 and 11:7).
#[inline]
fn imm_s(ir: u32) -> i32 {
    let low = (ir >> 7) & 0x1f; // imm[4:0]
    let high = (ir >> 25) << 5; // imm[11:5]
    let raw = high | low;
    // Move bit 11 up to bit 31, then shift back arithmetically to sign-extend.
    ((raw << 20) as i32) >> 20
}
/// Decode the sign-extended B-type (branch) offset.
///
/// The result is always even: bit 0 of the offset is not encoded.
#[inline]
fn imm_b(ir: u32) -> i32 {
    let bits_4_1 = ((ir >> 8) & 0xf) << 1;
    let bits_10_5 = ((ir >> 25) & 0x3f) << 5;
    let bit_11 = ((ir >> 7) & 0x1) << 11;
    let bit_12 = (ir >> 31) << 12;
    let raw = bit_12 | bit_11 | bits_10_5 | bits_4_1;
    // Move bit 12 up to bit 31, then shift back arithmetically to sign-extend.
    ((raw << 19) as i32) >> 19
}
/// Decode the sign-extended J-type (`jal`) offset.
///
/// The result is always even: bit 0 of the offset is not encoded.
#[inline]
fn imm_j(ir: u32) -> i32 {
    let bits_10_1 = ((ir >> 21) & 0x3ff) << 1;
    let bit_11 = ((ir >> 20) & 0x1) << 11;
    let bits_19_12 = ((ir >> 12) & 0xff) << 12;
    let bit_20 = (ir >> 31) << 20;
    let raw = bit_20 | bits_19_12 | bit_11 | bits_10_1;
    // Move bit 20 up to bit 31, then shift back arithmetically to sign-extend.
    ((raw << 11) as i32) >> 11
}
/// Decode the U-type immediate: instruction bits 31:12 kept in place, with
/// the low 12 bits cleared.
#[inline]
fn imm_u(ir: u32) -> i32 {
    let upper = ir >> 12;
    (upper << 12) as i32
}
89
90// ── Formatting helpers ───────────────────────────────────────────────────────
91
92/// Format a branch/jump target address, annotating it with a symbol name if available.
93///
94/// - Exact match (`addr == sym.addr`): `<name>`
95/// - Inside a symbol's range (`size > 0`): `<name+0xoffset>`
96/// - No match or `size == 0` without exact match: `0x80001234`
97fn fmt_target(addr: u32, syms: Option<&SymbolTable>) -> String {
98 if let Some(sym) = syms.and_then(|t| t.lookup_addr(addr)) {
99 if sym.addr == addr {
100 // Exact match — always show the symbol name.
101 format!("<{}>", sym.name)
102 } else if sym.size > 0 {
103 // Inside a known range — offset is reliable.
104 format!("<{}+0x{:x}>", sym.name, addr - sym.addr)
105 } else {
106 // size == 0: we do not know where the symbol ends;
107 // a numeric address is more honest than a made-up offset.
108 format!("0x{:08x}", addr)
109 }
110 } else {
111 format!("0x{:08x}", addr)
112 }
113}
114
115/// Format a memory operand as `imm(rs)` for load/store instructions.
116#[inline]
117fn mem_operand(rs: u32, imm: i32) -> String {
118 format!("{}({})", imm, reg(rs))
119}
120
121// ── Main disassembler ────────────────────────────────────────────────────────
122
123/// Disassemble one RV32IMA instruction into an objdump-compatible string.
124///
125/// - `ir` — the 32-bit instruction word.
126/// - `pc` — virtual address of the instruction, used to compute branch and
127/// jump targets relative to PC.
128/// - `syms` — optional symbol table for resolving target addresses to names.
129///
130/// Returns a string containing the mnemonic and operands, with no leading
131/// address and no trailing newline. Example: `"jal ra, <vTaskStartScheduler>"`.
132pub fn disassemble(ir: u32, pc: u32, syms: Option<&SymbolTable>) -> String {
133 let rd = (ir >> 7) & 0x1f;
134 let rs1 = (ir >> 15) & 0x1f;
135 let rs2 = (ir >> 20) & 0x1f;
136 let funct3 = (ir >> 12) & 0x7;
137 let funct7 = (ir >> 25) & 0x7f;
138
139 match ir & 0x7f {
140 // ── LUI ──────────────────────────────────────────────────────────────
141 0x37 => {
142 format!("lui {}, 0x{:x}", reg(rd), (imm_u(ir) as u32) >> 12)
143 }
144
145 // ── AUIPC ────────────────────────────────────────────────────────────
146 0x17 => {
147 format!("auipc {}, 0x{:x}", reg(rd), (imm_u(ir) as u32) >> 12)
148 }
149
150 // ── JAL ──────────────────────────────────────────────────────────────
151 0x6f => {
152 let target = pc.wrapping_add(imm_j(ir) as u32);
153 let sym = fmt_target(target, syms);
154 if rd == 0 {
155 format!("j {}", sym) // pseudo: j offset
156 } else if rd == 1 {
157 format!("jal ra, {}", sym) // pseudo: call / jal ra
158 } else {
159 format!("jal {}, {}", reg(rd), sym)
160 }
161 }
162
163 // ── JALR ─────────────────────────────────────────────────────────────
164 0x67 => {
165 let imm = imm_i(ir);
166 // ret: jalr zero, 0(ra)
167 if rd == 0 && rs1 == 1 && imm == 0 {
168 return "ret".to_string();
169 }
170 // jr rs: jalr zero, 0(rs)
171 if rd == 0 && imm == 0 {
172 return format!("jr {}", reg(rs1));
173 }
174 format!("jalr {}, {}({})", reg(rd), imm, reg(rs1))
175 }
176
177 // ── Branches ─────────────────────────────────────────────────────────
178 0x63 => {
179 let target = pc.wrapping_add(imm_b(ir) as u32);
180 let sym = fmt_target(target, syms);
181 let mnem = match funct3 {
182 0 => "beq",
183 1 => "bne",
184 4 => "blt",
185 5 => "bge",
186 6 => "bltu",
187 7 => "bgeu",
188 _ => return format!("unknown.branch funct3={}", funct3),
189 };
190 // Pseudos: beqz / bnez / bltz / bgez / blez / bgtz
191 if rs2 == 0 {
192 let pseudo = match funct3 {
193 0 => Some("beqz"),
194 1 => Some("bnez"),
195 4 => Some("bltz"),
196 5 => Some("bgez"),
197 _ => None,
198 };
199 if let Some(p) = pseudo {
200 return format!("{:<8}{}, {}", p, reg(rs1), sym);
201 }
202 }
203 if rs1 == 0 {
204 let pseudo = match funct3 {
205 4 => Some("bgtz"), // blt zero, rs2 → bgtz rs2
206 5 => Some("blez"), // bge zero, rs2 → blez rs2
207 _ => None,
208 };
209 if let Some(p) = pseudo {
210 return format!("{:<8}{}, {}", p, reg(rs2), sym);
211 }
212 }
213 format!("{:<8}{}, {}, {}", mnem, reg(rs1), reg(rs2), sym)
214 }
215
216 // ── Loads ────────────────────────────────────────────────────────────
217 0x03 => {
218 let imm = imm_i(ir);
219 let mnem = match funct3 {
220 0 => "lb",
221 1 => "lh",
222 2 => "lw",
223 4 => "lbu",
224 5 => "lhu",
225 _ => return format!("unknown.load funct3={}", funct3),
226 };
227 format!("{:<8}{}, {}", mnem, reg(rd), mem_operand(rs1, imm))
228 }
229
230 // ── Stores ───────────────────────────────────────────────────────────
231 0x23 => {
232 let imm = imm_s(ir);
233 let mnem = match funct3 {
234 0 => "sb",
235 1 => "sh",
236 2 => "sw",
237 _ => return format!("unknown.store funct3={}", funct3),
238 };
239 format!("{:<8}{}, {}", mnem, reg(rs2), mem_operand(rs1, imm))
240 }
241
242 // ── OP-IMM (0x13) ────────────────────────────────────────────────────
243 0x13 => {
244 let imm = imm_i(ir);
245 let shamt = (ir >> 20) & 0x1f;
246 match funct3 {
247 0 => {
248 // nop: addi zero, zero, 0
249 if rd == 0 && rs1 == 0 && imm == 0 {
250 return "nop".to_string();
251 }
252 // mv rd, rs: addi rd, rs, 0
253 if imm == 0 {
254 return format!("mv {}, {}", reg(rd), reg(rs1));
255 }
256 // li rd, imm: addi rd, zero, imm
257 if rs1 == 0 {
258 return format!("li {}, {}", reg(rd), imm);
259 }
260 format!("addi {}, {}, {}", reg(rd), reg(rs1), imm)
261 }
262 1 => format!("slli {}, {}, {}", reg(rd), reg(rs1), shamt),
263 2 => format!("slti {}, {}, {}", reg(rd), reg(rs1), imm),
264 3 => format!("sltiu {}, {}, {}", reg(rd), reg(rs1), imm as u32),
265 4 => {
266 // not rd, rs: xori rd, rs, -1
267 if imm == -1 {
268 return format!("not {}, {}", reg(rd), reg(rs1));
269 }
270 format!("xori {}, {}, {}", reg(rd), reg(rs1), imm)
271 }
272 5 => {
273 if funct7 == 0x20 {
274 format!("srai {}, {}, {}", reg(rd), reg(rs1), shamt)
275 } else {
276 format!("srli {}, {}, {}", reg(rd), reg(rs1), shamt)
277 }
278 }
279 6 => format!("ori {}, {}, {}", reg(rd), reg(rs1), imm),
280 7 => format!("andi {}, {}, {}", reg(rd), reg(rs1), imm),
281 _ => format!("unknown.op-imm funct3={}", funct3),
282 }
283 }
284
285 // ── OP (0x33) — RV32I + RV32M ────────────────────────────────────────
286 0x33 => {
287 if funct7 == 0x01 {
288 // RV32M
289 let mnem = match funct3 {
290 0 => "mul",
291 1 => "mulh",
292 2 => "mulhsu",
293 3 => "mulhu",
294 4 => "div",
295 5 => "divu",
296 6 => "rem",
297 7 => "remu",
298 _ => return format!("unknown.m funct3={}", funct3),
299 };
300 return format!("{:<8}{}, {}, {}", mnem, reg(rd), reg(rs1), reg(rs2));
301 }
302 match funct3 {
303 0 => {
304 if funct7 == 0x20 {
305 // neg rd, rs: sub rd, zero, rs
306 if rs1 == 0 {
307 return format!("neg {}, {}", reg(rd), reg(rs2));
308 }
309 format!("sub {}, {}, {}", reg(rd), reg(rs1), reg(rs2))
310 } else {
311 format!("add {}, {}, {}", reg(rd), reg(rs1), reg(rs2))
312 }
313 }
314 1 => format!("sll {}, {}, {}", reg(rd), reg(rs1), reg(rs2)),
315 2 => {
316 // seqz is not a standard pseudo here; emit canonical form.
317 format!("slt {}, {}, {}", reg(rd), reg(rs1), reg(rs2))
318 }
319 3 => {
320 // snez rd, rs: sltu rd, zero, rs
321 if rs1 == 0 {
322 return format!("snez {}, {}", reg(rd), reg(rs2));
323 }
324 format!("sltu {}, {}, {}", reg(rd), reg(rs1), reg(rs2))
325 }
326 4 => format!("xor {}, {}, {}", reg(rd), reg(rs1), reg(rs2)),
327 5 => {
328 if funct7 == 0x20 {
329 format!("sra {}, {}, {}", reg(rd), reg(rs1), reg(rs2))
330 } else {
331 format!("srl {}, {}, {}", reg(rd), reg(rs1), reg(rs2))
332 }
333 }
334 6 => format!("or {}, {}, {}", reg(rd), reg(rs1), reg(rs2)),
335 7 => format!("and {}, {}, {}", reg(rd), reg(rs1), reg(rs2)),
336 _ => format!("unknown.op funct3={}", funct3),
337 }
338 }
339
340 // ── FENCE ────────────────────────────────────────────────────────────
341 0x0f => "fence".to_string(),
342
343 // ── SYSTEM ───────────────────────────────────────────────────────────
344 0x73 => {
345 let csrno = ir >> 20;
346 let microop = funct3;
347 if microop == 0 {
348 return match csrno {
349 0x000 => "ecall".to_string(),
350 0x001 => "ebreak".to_string(),
351 0x302 => "mret".to_string(),
352 0x105 => "wfi".to_string(),
353 _ => format!("system 0x{:03x}", csrno),
354 };
355 }
356 // Zicsr
357 let csr_name = csr_name(csrno);
358 let zimm = rs1; // rs1 field used as imm on I variants
359 match microop {
360 1 => {
361 // csrw csr, rs: csrrw zero, csr, rs (rd == zero, discards reading)
362 if rd == 0 {
363 return format!("csrw {}, {}", csr_name, reg(rs1));
364 }
365 format!("csrrw {}, {}, {}", reg(rd), csr_name, reg(rs1))
366 }
367 2 => {
368 // csrs csr, rs: csrrs zero, csr, rs
369 if rd == 0 {
370 return format!("csrs {}, {}", csr_name, reg(rs1));
371 }
372 // csrr rd, csr: csrrs rd, csr, zero (rs1 == zero)
373 if rs1 == 0 {
374 return format!("csrr {}, {}", reg(rd), csr_name);
375 }
376 format!("csrrs {}, {}, {}", reg(rd), csr_name, reg(rs1))
377 }
378 3 => {
379 if rd == 0 {
380 return format!("csrc {}, {}", csr_name, reg(rs1));
381 }
382 format!("csrrc {}, {}, {}", reg(rd), csr_name, reg(rs1))
383 }
384 5 => {
385 if rd == 0 {
386 return format!("csrwi {}, {}", csr_name, zimm);
387 }
388 format!("csrrwi {}, {}, {}", reg(rd), csr_name, zimm)
389 }
390 6 => {
391 if rd == 0 {
392 return format!("csrsi {}, {}", csr_name, zimm);
393 }
394 format!("csrrsi {}, {}, {}", reg(rd), csr_name, zimm)
395 }
396 7 => {
397 if rd == 0 {
398 return format!("csrci {}, {}", csr_name, zimm);
399 }
400 format!("csrrci {}, {}, {}", reg(rd), csr_name, zimm)
401 }
402 _ => format!("unknown.csr microop={}", microop),
403 }
404 }
405
406 // ── RV32A ────────────────────────────────────────────────────────────
407 0x2f => {
408 let funct5 = (ir >> 27) & 0x1f;
409 let aq = (ir >> 26) & 1;
410 let rl = (ir >> 25) & 1;
411 let order = match (aq, rl) {
412 (1, 1) => ".aqrl",
413 (1, 0) => ".aq",
414 (0, 1) => ".rl",
415 _ => "",
416 };
417 match funct5 {
418 0b00010 => format!("lr.w{} {}, ({})", order, reg(rd), reg(rs1)),
419 0b00011 => format!("sc.w{} {}, {}, ({})", order, reg(rd), reg(rs2), reg(rs1)),
420 0b00001 => format!(
421 "amoswap.w{} {}, {}, ({})",
422 order,
423 reg(rd),
424 reg(rs2),
425 reg(rs1)
426 ),
427 0b00000 => format!(
428 "amoadd.w{} {}, {}, ({})",
429 order,
430 reg(rd),
431 reg(rs2),
432 reg(rs1)
433 ),
434 0b00100 => format!(
435 "amoxor.w{} {}, {}, ({})",
436 order,
437 reg(rd),
438 reg(rs2),
439 reg(rs1)
440 ),
441 0b01100 => format!(
442 "amoand.w{} {}, {}, ({})",
443 order,
444 reg(rd),
445 reg(rs2),
446 reg(rs1)
447 ),
448 0b01000 => format!(
449 "amoor.w{} {}, {}, ({})",
450 order,
451 reg(rd),
452 reg(rs2),
453 reg(rs1)
454 ),
455 0b10000 => format!(
456 "amomin.w{} {}, {}, ({})",
457 order,
458 reg(rd),
459 reg(rs2),
460 reg(rs1)
461 ),
462 0b10100 => format!(
463 "amomax.w{} {}, {}, ({})",
464 order,
465 reg(rd),
466 reg(rs2),
467 reg(rs1)
468 ),
469 0b11000 => format!(
470 "amominu.w{} {}, {}, ({})",
471 order,
472 reg(rd),
473 reg(rs2),
474 reg(rs1)
475 ),
476 0b11100 => format!(
477 "amomaxu.w{} {}, {}, ({})",
478 order,
479 reg(rd),
480 reg(rs2),
481 reg(rs1)
482 ),
483 _ => format!("unknown.amo funct5=0x{:02x}", funct5),
484 }
485 }
486
487 _ => format!("unknown 0x{:08x}", ir),
488 }
489}
490
491// ── CSR names ────────────────────────────────────────────────────────────────
492
/// Map a CSR address to its conventional name.
///
/// Addresses that are not in the table are rendered as `0xNNN`: lower-case
/// hex, zero-padded to at least three digits.
fn csr_name(csr: u32) -> String {
    const KNOWN: [(u32, &str); 16] = [
        (0x300, "mstatus"),
        (0x301, "misa"),
        (0x304, "mie"),
        (0x305, "mtvec"),
        (0x340, "mscratch"),
        (0x341, "mepc"),
        (0x342, "mcause"),
        (0x343, "mtval"),
        (0x344, "mip"),
        (0xc00, "cycle"),
        (0xc01, "time"),
        (0xc02, "instret"),
        (0xf11, "mvendorid"),
        (0xf12, "marchid"),
        (0xf13, "mimpid"),
        (0xf14, "mhartid"),
    ];

    KNOWN
        .iter()
        .find(|&&(address, _)| address == csr)
        .map_or_else(|| format!("0x{:03x}", csr), |&(_, name)| name.to_owned())
}
515
516// ── Static disassembly ───────────────────────────────────────────────────────
517
518/// Disassemble an entire ELF image, segment by segment, like `objdump -d`.
519///
520/// Prints to stdout in the following format:
521///
522/// ```text
523/// Disassembly of section .text:
524///
525/// 80000000 <_start>:
526/// 80000000 02000117 auipc sp, 0x2000
527/// ```
528///
529/// Segments shorter than 4 bytes are skipped.
530pub fn disasm_elf(_data: &[u8], image: &ElfImage, syms: Option<&SymbolTable>) {
531 // Use PT_LOAD segments as a proxy for code sections.
532 // A true per-section disassembly would require parsing section headers,
533 // but segments are sufficient for the typical use case.
534 for (seg_idx, seg) in image.segments.iter().enumerate() {
535 let base = seg.vaddr;
536 let data_slice = &seg.data;
537
538 // Skip segments too short to contain even one instruction.
539 if data_slice.len() < 4 {
540 continue;
541 }
542
543 // Section label — use a generic name; we parse segments, not sections.
544 let section_label = if seg_idx == 0 {
545 ".text"
546 } else {
547 &format!(".text.{}", seg_idx)
548 };
549 println!("Disassembly of section {}:\n", section_label);
550
551 let mut i = 0usize;
552 while i + 3 < data_slice.len() {
553 let pc = base + i as u32;
554
555 // Print a symbol label if this address is the start of a known symbol.
556 if let Some(sym) = syms.and_then(|t| t.lookup_addr(pc)) {
557 if sym.addr == pc {
558 println!("{:08x} <{}>:", pc, sym.name);
559 }
560 }
561
562 let ir = u32::from_le_bytes([
563 data_slice[i],
564 data_slice[i + 1],
565 data_slice[i + 2],
566 data_slice[i + 3],
567 ]);
568
569 let mnem = disassemble(ir, pc, syms);
570 println!(" {:08x} {:08x} {}", pc, ir, mnem);
571
572 i += 4;
573 }
574 println!();
575 }
576}