Add rv64-matching-tool to tests

ethereum-optimism · Jan 13, 2025 · eaa84dc · eaa84dc
1 parent bf4e465
commit eaa84dc
Show file tree

Hide file tree

Showing 5 changed files with 591 additions and 0 deletions.
diff --git a/tests/rv64-matching-tool/README.md b/tests/rv64-matching-tool/README.md
@@ -0,0 +1,78 @@
+# RISCV64 matching tool
+
+This project aims to analyze a RISCV64 ELF binary to ensure that all its RISCV64 executable instructions
+are supported by a virtual machine implementation.
+
+The tool takes as input:
+- a RISCV64 binary: the ELF file to analyze
+- a JSON file: the definition of the opcodes in the virtual machine implementation
+
+The tool parses the opcode of an instruction and then some of the subfields (such as `funct3`) based on the given JSON file.
+
+When an instruction is found in the binary but is not in the JSON file, the number of `UNKNOWN` instructions is incremented
+and the instruction is collected.
+
+Moreover, the tool now supports detecting Linux syscalls.
+
+Finally, the tool prints out the number of `UNKNOWN` instructions and the number of occurrences for each of them.
+
+
+## Limits
+
+### Instructions parsing
+
+This tool is an instruction parser. As RISCV64 instructions can be very similar to one another,
+the tool may not be able to detect small discrepancies.
+
+For example, based on the `supported_targets/asterisc-v1.1.2.json`, the tool is not yet able to differentiate
+`FLW` and `FLD` instructions.
+
+### Machine definition
+
+The machine definition is defined in the JSON file. It details all the instructions that the machine is able to support.
+
+The unsupported instructions must not be included in the machine definition.
+
+Note that this definition requires detailing instruction fields such as opcode, funct3, ...
+
+### ELF section
+
+The tool only parses the `.text` section of the ELF RISCV64 binary. Potential executable instructions in other sections are not parsed.
+
+## Syscalls detection
+
+The tool supports detecting some syscalls. When an `ECALL` instruction is detected, the tool navigates through the **5 previous instructions** to find if the value of the `a7` register was set to an immediate.
+The `a7` register holds the syscall identifier.
+
+Why **5 previous instructions**? This is an arbitrary value, determined by reverse-engineering a RISCV-64 binary.
+
+If no value for `a7` register is found or if the `a7` value is not recognized as supported, the tool will trigger an alert.
+
+## Install
+
+Clone the repository:
+
+```bash
+git clone https://github.com/zigtur/rv64-matching-tool
+cd rv64-matching-tool
+```
+
+Install the local environment and its dependencies:
+
+```bash
+python3 -m venv localenv
+source localenv/bin/activate
+pip3 install -r requirements.txt
+```
+
+## Usage
+
+Execute the Python script to analyze the RV64 binary:
+
+```bash
+python3 matching_tool.py ./path_to_binary ./supported_targets/your_VM_target.json
+```
+
+
+
+
diff --git a/tests/rv64-matching-tool/matching_tool.py b/tests/rv64-matching-tool/matching_tool.py
@@ -0,0 +1,213 @@
+import sys
+from elftools.elf.elffile import ELFFile
+import re
+import json
+
+# Lookback 5 instructions before the ECALL
+SYSCALL_INSTRUCTIONS_LOOKBACK = 5
+
+def extract_text_section_instructions(elf_path):
+    """
+    Extract and print executable instructions from the .text section of a RISC-V ELF binary.
+    
+    Args:
+    - elf_path (str): Path to the ELF binary file.
+    
+    Returns:
+    - List of hexadecimal instructions from the .text section.
+    """
+    try:
+        with open(elf_path, 'rb') as f:
+            elffile = ELFFile(f)
+
+            # Check if the ELF is for RISC-V architecture (EM_RISCV = 243)
+            if elffile['e_machine'] != 'EM_RISCV':
+                print(f"Error: ELF is not for RISC-V (detected: {elffile['e_machine']})")
+                exit(1)
+
+            # Get the .text section
+            text_section = elffile.get_section_by_name('.text')
+            if text_section is None:
+                print(f"Error: Could not find the .text section in {elf_path}")
+                exit(1)
+
+            # Extract the raw bytes from the .text section
+            text_data = text_section.data()
+
+            # Divide the text section data into 32-bit (4-byte) RISC-V instructions
+            instructions = []
+            for i in range(0, len(text_data), 4):
+                instruction_bytes = text_data[i:i + 4]
+                if len(instruction_bytes) < 4:
+                    break  # If the remaining bytes are less than 4, stop
+                instruction = int.from_bytes(instruction_bytes, byteorder='little')
+                instructions.append(instruction)
+
+            return instructions
+
+    except FileNotFoundError:
+        print(f"Error: File '{elf_path}' not found.")
+        exit(1)
+    except Exception as e:
+        print(f"Error: Unable to read the ELF file. Reason: {e}")
+        exit(1)
+
+def parse_rd(instr):
+    return (instr >> 7) & 0x1F
+
+def parse_imm_i(instr):
+    return (instr >> 20) & 0xFFF
+
+def parse_imm_u(instr):
+    return instr & 0xFFFFF000
+
+def parse_rs1(instr):
+    return (instr >> 15) & 0x1F
+
+def parse_funct3(instr):
+    return (instr >> 12) & 0x7
+
+def parse_funct7(instr):
+    return (instr >> 25)
+
+def parse_funct12(instr):
+    return (instr >> 20) & 0xFFF
+
+def parse_opcode(instr):
+    return instr & 0x7F
+
+def instruction_name(instruction, supported):
+    opcode = parse_opcode(instruction)
+    funct3 = parse_funct3(instruction)
+    funct7 = parse_funct7(instruction)
+    funct12 = parse_funct12(instruction)
+
+    opcode_hex = f"{opcode:02X}"
+    funct3_hex = f"{funct3:02X}"
+    funct7_hex = f"{funct7:02X}"
+    funct12_hex = f"{funct12:04X}"
+
+    for opcode_entry in supported['opcodes']:
+        if opcode_hex in opcode_entry:
+            opcode_data = opcode_entry[opcode_hex]
+
+            # Check if it's a direct instruction like LUI, JAL, etc.
+            if isinstance(opcode_data, str):
+                return opcode_data
+
+            # Check for funct3-based instructions
+            if 'funct3' in opcode_data:
+                for funct3_entry in opcode_data['funct3']:
+                    if funct3_hex in funct3_entry:
+                        funct3_data = funct3_entry[funct3_hex]
+
+                        # Check for funct12 (for ECALL, EBREAK, etc.)
+                        if 'funct12' in funct3_data:
+                            for funct12_entry in funct3_data['funct12']:
+                                if funct12_hex in funct12_entry:
+                                    funct12_data = funct12_entry[funct12_hex]
+                                    return funct12_data
+
+                        return funct3_data
+
+            # Check for funct7-based instructions
+            if 'funct7' in opcode_data:
+                for funct7_entry in opcode_data['funct7']:
+                    if funct7_hex in funct7_entry:
+                        funct7_data = funct7_entry[funct7_hex]
+                        if 'funct3' in funct7_data:
+                            for funct3_entry in funct7_data['funct3']:
+                                if funct3_hex in funct3_entry:
+                                    return funct3_entry[funct3_hex]
+                    elif 'default' in funct7_entry:
+                        funct7_data = funct7_entry['default']
+                        if 'funct3' in funct7_data:
+                            for funct3_entry in funct7_data['funct3']:
+                                if funct3_hex in funct3_entry:
+                                    return funct3_entry[funct3_hex]
+
+    return "UNKNOWN"
+
+def parse_instructions(instructions, json_path):
+    last_bytes = {}
+    unknown_syscalls = {}
+    unknown_instructions = {}
+    supported, syscall_map = dict_from_json(json_path)
+
+    u32max = (2**32)-1
+    for index, instruction in enumerate(instructions):
+        if instruction < u32max:
+            ins_name = instruction_name(instruction, supported)
+            if ins_name == "ECALL":
+                ins_name = parse_syscall(instructions, index, syscall_map)
+                if "UNKNOWN" in ins_name:
+                    unknown_syscalls[ins_name] = unknown_syscalls.get(ins_name,  0) +1
+            if ins_name == "UNKNOWN":
+                unknown_instructions[instruction] = unknown_instructions.get(instruction, 0) + 1
+            last_bytes[ins_name] = last_bytes.get(ins_name, 0) + 1
+        else:
+            print(f"Error: Unexpected instruction: {instruction}.")
+            exit(1)
+    return last_bytes, unknown_instructions, unknown_syscalls
+
+def find_a7_value(instructions, index):
+    # parse the 5 previous instructions, looking for A7 value
+    for i in range(max(0,index-SYSCALL_INSTRUCTIONS_LOOKBACK), index):
+        instr = instructions[i]
+        rd = parse_rd(instr)
+        if rd == 17:  # a7 = x17
+            opcode = parse_opcode(instr)
+            if opcode == 0x13:  # ADDI
+                imm = parse_imm_i(instr)
+                return imm
+            elif opcode == 0x37:  # LUI
+                imm = parse_imm_u(instr) >> 12
+                return imm
+            elif opcode == 0x13 and parse_rs1(instr) == 0:  # LI (ADDI x17, x0, imm)
+                imm = parse_imm_i(instr)
+                return imm
+    return None
+
+def parse_syscall(instructions, index, syscall_map):
+    a7 = find_a7_value(instructions, index)
+    if a7 == None:
+        return "UNKNOWN_SYSCALL (a7 = UNKNOWN)"
+    syscall_name = syscall_map.get(f"{a7:02X}")
+    if syscall_map.get(f"{a7:02X}") is None:
+        return f"UNKNOWN_SYSCALL (a7 = 0x{a7:X})"
+    return f"ECALL.{syscall_name}"
+
+
+def dict_from_json(json_path):
+    try:
+        with open(json_path, 'r') as f:
+            data = json.load(f)
+            syscalls = {list(s.keys())[0]: list(s.values())[0] for s in data.get('syscalls', [])}
+            return data, syscalls
+    except Exception as e:
+        print(f"Error: Unable to read the JSON file. Reason: {e}")
+        exit(1)
+
+if __name__ == "__main__":
+    if len(sys.argv) != 3:
+        print("Usage: python3 parse_riscv_elf.py <path_to_elf_file> <path_to_json_file>")
+        sys.exit(1)
+
+    elf_path = sys.argv[1]
+    json_path = sys.argv[2]
+    instructions = extract_text_section_instructions(elf_path)
+
+    instruction_counts, unknown_instr, unknown_syscalls = parse_instructions(instructions, json_path)
+
+    # SYSCALL results
+    for key in unknown_syscalls.keys():
+        print(f"There were {unknown_syscalls[key]} {key}.")
+
+    if instruction_counts.get("UNKNOWN", 0) != 0:
+        nb_unknown = instruction_counts["UNKNOWN"]
+        print(f"There were {nb_unknown} unknown instructions.\n")
+        for instru, count in sorted(unknown_instr.items()):
+            print(f"Unknown instruction: {instru:08X}: {count} times")
+        exit(1)
+    else:
+        print("All instructions known.")
diff --git a/tests/rv64-matching-tool/requirements.txt b/tests/rv64-matching-tool/requirements.txt
@@ -0,0 +1 @@
+pyelftools
diff --git a/tests/rv64-matching-tool/supported_targets/README.md b/tests/rv64-matching-tool/supported_targets/README.md
@@ -0,0 +1,17 @@
+# Targets
+
+
+
+## Asterisc
+
+### v1.1.2
+
+Despite having a handler for the following instructions, the Asterisc v1.1.2 target implements them as NO-OP. They can't be considered as supported.
+- "07": "FLW/FLD"
+- "27": "FSW/FSD"
+- "53": "FADD"
+
+
+Multiple syscalls are implemented. The Asterisc JSON file defines only 3 of them: read, write and exit.
+
+