From 57923b33819a1ed4156f4b34211cbbf26d687e05 Mon Sep 17 00:00:00 2001 From: Henny Sipma Date: Fri, 26 Dec 2025 15:28:38 -0800 Subject: [PATCH 1/7] ARM: update instruction data --- chb/arm/opcodes/ARMPreloadData.py | 34 ++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/chb/arm/opcodes/ARMPreloadData.py b/chb/arm/opcodes/ARMPreloadData.py index 740564db..df7ba0ff 100644 --- a/chb/arm/opcodes/ARMPreloadData.py +++ b/chb/arm/opcodes/ARMPreloadData.py @@ -4,7 +4,7 @@ # ------------------------------------------------------------------------------ # The MIT License (MIT) # -# Copyright (c) 2021 Aarno Labs LLC +# Copyright (c) 2021-2025 Aarno Labs LLC # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -30,9 +30,11 @@ from chb.app.InstrXData import InstrXData from chb.arm.ARMDictionaryRecord import armregistry -from chb.arm.ARMOpcode import ARMOpcode, simplify_result +from chb.arm.ARMOpcode import ARMOpcode, ARMOpcodeXData, simplify_result from chb.arm.ARMOperand import ARMOperand +from chb.invariants.XXpr import XXpr + import chb.util.fileutil as UF from chb.util.IndexedTable import IndexedTableValue @@ -41,6 +43,29 @@ import chb.arm.ARMDictionary +class ARMPreloadDataXData(ARMOpcodeXData): + """Data format: + - expressions: + 0: xbase + 1: xmem + """ + + def __init__(self, xdata: InstrXData) -> None: + ARMOpcodeXData.__init__(self, xdata) + + @property + def xbase(self) -> "XXpr": + return self.xpr(0, "xbase") + + @property + def xmem(self) -> "XXpr": + return self.xpr(1, "xmem") + + @property + def annotation(self) -> str: + return "Preload-data(" + str(self.xmem) + + @armregistry.register_tag("PLDW", ARMOpcode) @armregistry.register_tag("PLD", ARMOpcode) class ARMPreloadData(ARMOpcode): @@ -72,6 +97,5 @@ def annotation(self, xdata: InstrXData) -> str: xprs[0]: value of base register xprs[1]: value of memory 
location """ - - rhs = str(xdata.xprs[1]) - return "Preload-data(" + rhs + ")" + xd = ARMPreloadDataXData(xdata) + return xd.annotation From 4ebd38d6bd6678985111cb0371466000742b4ad6 Mon Sep 17 00:00:00 2001 From: Henny Sipma Date: Fri, 26 Dec 2025 15:29:58 -0800 Subject: [PATCH 2/7] AST: handle stack array indices --- chb/astinterface/ASTInterface.py | 15 +++++++++------ chb/invariants/XXpr.py | 2 +- chb/invariants/XXprUtil.py | 6 ++++++ 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/chb/astinterface/ASTInterface.py b/chb/astinterface/ASTInterface.py index 4b504572..4c3cdb93 100644 --- a/chb/astinterface/ASTInterface.py +++ b/chb/astinterface/ASTInterface.py @@ -1016,12 +1016,7 @@ def introduce_stack_variables( stackvartypes: Dict[int, "BCTyp"]) -> None: """Creates stack variables/buffers for all stack offsets with types.""" - # local variable stack offsets from the type inference are positive, - # so they must be negated here. For the same reason, to capture the - # largest extent of every varinfo, offsets must be traversed in reverse - # order. 
- for (offset, bctype) in sorted(stackvartypes.items(), reverse=True): - offset = -offset + for (offset, bctype) in sorted(stackvartypes.items()): vtype = bctype.convert(self.typconverter) self.mk_stack_variable_lval(offset, vtype=vtype) @@ -1115,6 +1110,7 @@ def mk_stack_variable_lval( if varinfo.vtype is None: return lval + # create stack variables for all fields and array elements if varinfo.vtype.is_compound: structtyp = cast(AST.ASTTypComp, varinfo.vtype) ckey = structtyp.compkey @@ -1159,6 +1155,13 @@ def mk_stack_variable_lval( self._stack_variables[elementoffset + cfoff] = fieldlval elementoffset += elsize + else: + elementoffset = offset + for i in range(arraysize): + indexoffset = self.mk_scalar_index_offset(i) + elemlval = self.astree.mk_vinfo_lval(varinfo, offset=indexoffset) + self._stack_variables[elementoffset] = elemlval + elementoffset += elsize return lval diff --git a/chb/invariants/XXpr.py b/chb/invariants/XXpr.py index 511ef2e5..b1836e3b 100644 --- a/chb/invariants/XXpr.py +++ b/chb/invariants/XXpr.py @@ -806,7 +806,7 @@ def stack_address_offset(self) -> int: elif self.is_stack_address and self.is_addressof_var: xvar = self.get_addressof_var if xvar is not None: - return xvar.denotation.offset.offsetvalue() + return xvar.denotation.offset.offsetconstant raise UF.CHBError( "Expression is not a stack address: " + str(self)) diff --git a/chb/invariants/XXprUtil.py b/chb/invariants/XXprUtil.py index 093f0af8..a204b473 100644 --- a/chb/invariants/XXprUtil.py +++ b/chb/invariants/XXprUtil.py @@ -1615,6 +1615,12 @@ def stack_variable_to_ast_lval( fldoffset, xdata, iaddr, astree, anonymous=anonymous) return astree.mk_vinfo_lval(vinfo, offset=astoffset, anonymous=anonymous) + if offset.offset.is_array_index_offset: + idxoffset = cast("VMemoryOffsetArrayIndexOffset", offset.offset) + astoffset = array_offset_to_ast_offset( + idxoffset, xdata, iaddr, astree, anonymous=anonymous) + return astree.mk_vinfo_lval(vinfo, offset=astoffset, anonymous=anonymous) + 
if not anonymous: chklogger.logger.warning( "Stack variable with offset %s not yet supported at address %s", From 4b3e6ba931dd083326113d89e2ccbc52edad50cc Mon Sep 17 00:00:00 2001 From: Henny Sipma Date: Mon, 5 Jan 2026 22:54:23 -0800 Subject: [PATCH 3/7] ASTI: allow for discrepancy in variable name --- chb/astinterface/ASTIProvenance.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/chb/astinterface/ASTIProvenance.py b/chb/astinterface/ASTIProvenance.py index 511049f7..a092cd5c 100644 --- a/chb/astinterface/ASTIProvenance.py +++ b/chb/astinterface/ASTIProvenance.py @@ -446,10 +446,17 @@ def resolve_reaching_defs(self) -> None: # Allow for change of name of return value if str(instr.lhs) == v or v == "R0" or v == "S0": self.add_reaching_definition(xid, instrid) + elif instr.lhs is None: + chklogger.logger.info( + "Lhs variable %s is suppressed in call to " + "%s for reaching def address %s", + v, str(instr.tgt), addr) + self.add_reaching_definition(xid, instrid) else: chklogger.logger.warning( - "Variable names don't match: %s vs %s", - str(instr.lhs), v) + "Lhs variable names don't match: %s vs %s" + + " to %s for reaching def address %s", + str(instr.lhs), v, str(instr.tgt), addr) else: chklogger.logger.warning( "Expression is defined by unknown instruction: " From 7d7e2a6844b3139a45262725355bfb237e792201 Mon Sep 17 00:00:00 2001 From: Henny Sipma Date: Mon, 5 Jan 2026 22:55:12 -0800 Subject: [PATCH 4/7] DOC: start userguide --- chb/app/CHVersion.py | 2 +- doc/user-guide/userdata.md | 77 +++++++ doc/user-guide/userdata/arm-thumb.md | 35 ++++ doc/user-guide/userdata/call-back-tables.md | 71 +++++++ doc/user-guide/userdata/call-targets.md | 65 ++++++ doc/user-guide/userdata/data-blocks.md | 34 ++++ .../userdata/function-annotations.md | 190 ++++++++++++++++++ .../userdata/function-entry-points.md | 106 ++++++++++ 8 files changed, 579 insertions(+), 1 deletion(-) create mode 100644 doc/user-guide/userdata.md create mode 100644 
doc/user-guide/userdata/arm-thumb.md create mode 100644 doc/user-guide/userdata/call-back-tables.md create mode 100644 doc/user-guide/userdata/call-targets.md create mode 100644 doc/user-guide/userdata/data-blocks.md create mode 100644 doc/user-guide/userdata/function-annotations.md create mode 100644 doc/user-guide/userdata/function-entry-points.md diff --git a/chb/app/CHVersion.py b/chb/app/CHVersion.py index 62bec65c..34978db2 100644 --- a/chb/app/CHVersion.py +++ b/chb/app/CHVersion.py @@ -1 +1 @@ -chbversion: str = "0.3.0-20251022" +chbversion: str = "0.3.0-20260105" diff --git a/doc/user-guide/userdata.md b/doc/user-guide/userdata.md new file mode 100644 index 00000000..ccd322ac --- /dev/null +++ b/doc/user-guide/userdata.md @@ -0,0 +1,77 @@ +# Userdata + +User data can improve analysis and decompilation. Userdata can be provided in two +ways: json files and C header files. This section describes the json files; C +header files are described here. + + +## Add userdata + +Userdata files are passed to the analyzer via the command-line with the +command-line option --hints. Multiple userdata files can be +passed with this option. If data in multiple files conflict the data from +the last file passed is taken; previous versions of the same data are +overwritten. + +Some commands that provide the --hints option include +``` +> chkx analyze ... --hints ... +> chkx results ast ... --hints ... +> chkx relational prepare ... --hints ... +... +``` + +## Userdata file layout + +Userdata format is json. The general layout of the json file is +``` +{ + "userdata": { + "": { ... }, + "": { ... }, + "": { ... }, + .... + "": { ... } + } +} +``` +where section-i is the name of a particular kind of userdata that is +supported. Each kind of userdata has its own format and meaning, as explained +below. 
It is recommended to add some additional top-level properties to the file, +such as a hash (e.g., md5 or sha256) to identify the binary to which the userdata +applies, or the name and release date of the binary. These additional properties, +however, are not enforced or used otherwise. + +**Caution** The section names must be exact. Sections with misspelled names are +silently ignored. To check if a section was read correctly, inspect the file +.ch/u/_system_u.xml after initiating the analysis, to verify +the corresponding xml section that is passed to the back-end ocaml analyzer. + + +## Kinds of userdata + +The kinds of userdata that can be passed to the analysis are varied and tend to +grow/change over time. Below is a list of the kinds of userdata currently +supported. + +- **ARM-Thumb switch points** ([arm-thumb](userdata/arm-thumb.md)): + A list of addresses where an ARM binary + switches from ARM representation to Thumb-2 and v.v. + +- **Call-back Tables** ([call-back-tables](userdata/call-back-tables.md)): + A table of addresses + mapped to the declared name of a call-back table in memory. + +- **Call Targets for Indirect Calls** ([call-targets](userdata/call-targets.md)): + A list of targets for indirect function calls. + +- **Data Regions within Code** ([data-blocks](userdata/data-blocks.md)): + A list of start and end addresses + of regions within the code section that contain data. + +- **Function Annotations** ([function-annotations](userdata/function-annotations.md)): + Annotations with the aim to improve the quality of a decompilation to C, including + names/types for register and stack variables. + +- **Function Entry Points** ([function-entry-points](userdata/function-entry-points.md)): + A list of addresses that are the start of a function. 
\ No newline at end of file diff --git a/doc/user-guide/userdata/arm-thumb.md b/doc/user-guide/userdata/arm-thumb.md new file mode 100644 index 00000000..e1516122 --- /dev/null +++ b/doc/user-guide/userdata/arm-thumb.md @@ -0,0 +1,35 @@ +### Arm-Thumb switch points + +**Description** + +ARM binaries may mix the ARM and Thumb-2 representation for code. The analyzer +supports both representations. In many binaries these switch points are indicated +in the binary itself by the compiler (this is always the case for binaries +compiled with debug, and often in other binaries as well). However, if the +switch points are not explicitly present in the binary, the current version of +the disassembler cannot automatically +determine them. For these binaries the user has the option to manually indicate +the switch points in the userdata. + +**Format** + +A list of addresses followed by a colon and the letter 'T' or 'A' +that indicate starting addresses of Thumb-2 and ARM code representation regions. + + +**Example** + +``` +{ + "userdata": { + .... + "arm-thumb": [ + "0x18638:A", + "0x18908:T", + "0x18950:A", + "0x18974:T", + "0x21210:A" + ] + } +} +``` diff --git a/doc/user-guide/userdata/call-back-tables.md b/doc/user-guide/userdata/call-back-tables.md new file mode 100644 index 00000000..ca4ce0c5 --- /dev/null +++ b/doc/user-guide/userdata/call-back-tables.md @@ -0,0 +1,71 @@ +### Call-back Tables + +**Description** + +Call-back tables are arrays of structs in global memory that contain related +function pointers, usually associated with some other identifying data. +Common examples of call-back tables are in binaries that serve requests based +on a particular keyword. In such systems the response to the request is often +invoked by matching the key to the identifying key in the table and executing +the associated function pointer. + +The userdata representation for such call-back tables consists of three elements: +1. 
The definition of the table in C (in the C header file) +2. The start address of the table in memory (in userdata) +3. The addresses of the indirect calls into the table (in userdata) + +This section only shows the format for (2). The addresses of the indirect +calls are specified in a separate section, described in +[call-targets](call-targets.md). + + +**Format** + +A table of virtual addresses in memory mapped to names of defined tables. + + +**Example** + +``` +{ + "userdata": { + .... + "call-back-tables": { + "0x4a5910": "request_table", + "0x4a5c30": "cgi_setobject_table" + } + } +} +``` + +This section must be accompanied by a definition of the corresponding table +in a header file that is passed to the analyzer at the same time. The +corresponding header definition in this case could be something like: + +``` +struct _cbt_http_request { + char *formname; + char *filetype; + char *cachecontrol; + int (*cpb_request_12)(void *state, void *stream, int len); + int (*cbp_request_16)(char *filename, void *stream); + int (*cbp_request_20)(char *level); +} cbt_http_request; + + +struct _cbt_http_request *request_table; + + +struct _cbt_cgi_setobject { + char *tag; + int num; + int (*cbp_cgi_setobject)(struct keyvaluepair_t *kvp, int len); +} cbt_cgi_setobject; + + +struct _cbt_cgi_setobject *cgi_setobject_table; +``` + + + + \ No newline at end of file diff --git a/doc/user-guide/userdata/call-targets.md b/doc/user-guide/userdata/call-targets.md new file mode 100644 index 00000000..60aa7e53 --- /dev/null +++ b/doc/user-guide/userdata/call-targets.md @@ -0,0 +1,65 @@ +### Call targets + +**Description** + +In many cases the analyzer is able to resolve indirect function calls. For +those cases where automatic resolution of targets fails the user can supply +a list of targets explicitly in the userdata. 
+ +A call target may be specified in a number of ways depending on the kind of +target: +- *application function:* app:\ +- *shared-object function:* so:\ +- *java native interface:* jni:\ +- *call-back table function:* cba:\:\ + +**Format** + +A list of records of the following structure: +``` + {"fa":, + "ia":, + "tgts": [ + | {"app":
} + | {"so":} + | {"jni": } + | {"cba":
:} + ] + } +``` + +**Example** + +``` +{ + "userdata": { + ... + "call-targets": [ + {"ia": "0x40d5dc", + "fa": "0x40d510", + "tgts": [{"cba": "0x4a5c30:8"}] + }, + {"ia": "0x40a6a4", + "fa": "0x409dd0", + "tgts": [{"cba": "0x4a5910:12"}] + }, + {"ia": "0x40aba8", + "fa": "0x409dd0", + "tgts": [{"cba": "0x4a5910:16"}] + }, + {"ia": "0x40afd8", + "fa": "0x409dd0", + "tgts": [{"cba": "0x4a5910:20"}] + }, + {"ia": "0x40b304", + "fa": "0x40b288", + "tgts": [{"app": "0x401018"}, {"app": "0x403200"}] + }, + {"ia": "0x40c800", + "fa": "0x40c780", + "tgts": [{"so": "memcpy"}] + } + ] + } +} +``` \ No newline at end of file diff --git a/doc/user-guide/userdata/data-blocks.md b/doc/user-guide/userdata/data-blocks.md new file mode 100644 index 00000000..31d2f9fc --- /dev/null +++ b/doc/user-guide/userdata/data-blocks.md @@ -0,0 +1,34 @@ +### Data blocks + +**Description** + +Code sections may interleave code with data regions. This is particularly common +in ARM binaries. Most of these data regions are detected automatically by the +disassembler. For the cases where this fails the user can point out these data +regions in the userdata with the data-blocks section. + +**Format** + +A list of records that specify the start (inclusive) and end (exclusive) address +of a data region, where the record has the format: +``` + {"r": [, ]} +``` + + +**Example** + +``` +{ + "userdata": { + .... + "data-blocks": [ + {"r": ["0xa02425fc", "0xa0242674"]}, + {"r": ["0xa0255e68", "0xa0255e94"]}, + {"r": ["0xa03005d4", "0xa03005f8"]}, + {"r": ["0xa0300a9e", "0xa0300ab0"]}, + ... + ] + } +} +``` \ No newline at end of file diff --git a/doc/user-guide/userdata/function-annotations.md b/doc/user-guide/userdata/function-annotations.md new file mode 100644 index 00000000..e6e60d4e --- /dev/null +++ b/doc/user-guide/userdata/function-annotations.md @@ -0,0 +1,190 @@ +### Function Annotations + +Function annotations can be used to improve the quality of a decompilation of +a function to C code. 
A function annotation ranges from names and types for +register and stack +variables to corrections to reaching definitions and typing inference rules. + +**Format** + +The top-level format of function annotations is a list of individual function +annotations: +``` +{ + "userdata": { + ... + { + "function-annotations": [ + { + "faddr": , + "register-variable-introductions": [ + ... + ], + "stack-variable-introductions": [ + ... + ], + "typing-rules": [ + ... + ], + "remove-reaching-definitions": [ + ... + ] + }, + ... + } + } +} +``` +where all properties are optional except for the function address. + +**Format: register-variable-introductions**: + +The format for **register-variable introductions** is a list of individual +register annotations +``` + [ + { + "iaddr": , + "name": , + "typename": , + "mods": [] + }, + { + ... + + ] +``` +The instruction address is the address of the instruction where the +register to be renamed gets assigned, that is, the register is the +left-hand side in an instruction (assignment or call). If a register +gets assigned in multiple paths in parallel, the instruction address +should be the lowest address. These introductions can be considered +as ssa (static single assignment) locations. + +The chosen name is the name to be given to the register. The name will +be used in the lifting as long as the register has the current definition. +It is the user's responsibility to ensure that there are no name clashes +with other variables. + +The type name is the name of the type of the register for that particular +assignment (a register can have many types during its lifetime within a +function). The type name is either a primitive C type (like int or +unsigned short, etc.) or the name of a type for which a typedef is given +in the header file. The reason for restricting the type name to simple +names is that full-featured C parsing would otherwise need to be applied when reading +in these files. 
For convenience, some modifications can be added to the +mods property to modify the typename: +- ptrto: indicating that the register type is a pointer to + the type indicated by the type name +- cast: indicating that the type given should override the + type that may have been inferred by type inference. Adding cast + furthermore ensures that the assigning instruction will be exposed in + the lifting. + +*Note:* The name of the register itself does not have to be included in +the record, as it is automatically inferred from the instruction address. +At present the annotation is limited to instructions with a single LHS +register. That is, instructions that assign to multiple registers such +as the ARM instructions LDM or ARM call instructions that +assign to both R0 and R1 are currently not +handled. + +*Note:* The typename is optional. The analyzer performs its own type inference +based on function signatures and other type information. Unless types are +introduced that are not present in any function signatures or other type +information it is often better to omit the typename initially and only add +a typename if a typename is not inferred automatically. + +**Example: register-variable-introductions:** + +``` + "register-variable-introductions": [ + { + "iaddr": "0xe2b34", + "name": "t", + "typename": "EVP_PKEY_ASN1_METHOD", + "mods": ["ptrto", "cast"] + }, + { + "iaddr": "0xe2b40", + "name": "flags", + "typename": "unsigned long" + }, + { + "iaddr": "0xe2b88", + "name": "obj" + }, + ... +``` + +**Format: stack-variable-introductions:** + +The format for **stack-variable-introductions** is a list of individual +(local) stack variable annotations: +``` + [ + { + "offset": , + "name": + "typename": , + "mods": [] + }, + { + ... + ] +``` +The offset is the offset *in bytes* where the stack variable is located, defined +as +``` +
- +``` +Note that this number must be positive as the stack grows down, and thus any +local stack variable is located at an address that is less in value than the +address of the stack-pointer at function entry. + +The name, typename, and mods are the same as for register-variable introductions +with the exception that stack variables can have an additional type of modification +expressed in the mods property: +- array:\: indicating that the stack variable type is an array + of n elements of the type given. + +It is the user's responsibility to ensure that stack variables do not overlap and +that names do not clash with each other or with register variables. + + +**Example: stack-variable-introductions:** + +``` + "stack-variable-introductions": [ + { + "offset": 32, + "name": "md", + "typename": "unsigned char", + "mods": ["array:16"] + }, + { + "offset": 56, + "name": "md_ctx", + "typename": "EVP_MD_CTX" + } + ] +``` + +**Format: remove-reaching-definitions:** + +The format for **remove-reaching-definitions** is a list of register variables +associated with the reaching definitions to be removed: +``` + [ + { + "var": , + "uselocs": [ hex-addresses ], + "rdeflocs": [ hex-addresses ] + }, + { + ... + ] +``` +The var property holds the name of the register for which the +addresses given in the rdeflocs property are to be removed +from the instructions with addresses given in the uselocs property. diff --git a/doc/user-guide/userdata/function-entry-points.md b/doc/user-guide/userdata/function-entry-points.md new file mode 100644 index 00000000..a24b450f --- /dev/null +++ b/doc/user-guide/userdata/function-entry-points.md @@ -0,0 +1,106 @@ +### Function Entry Points + +**Description** + +For most binaries the disassembler is able to determine all function entry points +automatically. In some cases, however, some function entry points may be missed, +and may be manually pointed out in the userdata. 
+ +**Format** + +A list of addresses that are the starting address of a function. + +**Example** +``` +{ + "userdata": { + ... + "function-entry-points": [ + "0xa0100044", + "0xa010011c", + "0xa0100292", + "0xa010029c", + "0xa0100710", + "0xa010072a", + ... + ] + } +} +``` + +**Finding Function Entry Points** + +Low function coverage may be an indicator of function entry points missed. +Function coverage is defined as the ratio of the number of instructions that +are part of some function and the total number of instructions in the code +sections (minus confirmed embedded data regions). Function coverage is +displayed in the printed output when running the disassembler (without +analysis): + +``` +> chkx analyze -d +... +Disassembly : 0.16 +Construct functions: 0.86 +Disassembly information: + Instructions : 32699 + Unknown instructions : 0 + Functions : 429 (coverage: 96.68%) + Function overlap : 993 (counting multiples: 993) + Jumptables : 16 + Data blocks : 20 +... +``` + +To aid the identification of function entry points, the disassembler prints +out a (text) file that contains a listing of all instructions not contained +in functions. E.g., +``` +> chkx analyze -d +... +> more .cch/a/_orphan.log +... 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Data block (size: 12 bytes) + + 0x9870 Code:<0x295d4> + 0x9874 FAddr:<0x9914> + 0x9878 Code:<0x9300> +================================================================================ + + 0x987c 08 40 2d e9 PUSH {R3,LR} + 0x9880 2c 30 9f e5 LDR R3, 0x98b4 + 0x9884 00 30 d3 e5 LDRB R3, [R3] + 0x9888 00 00 53 e3 CMP R3, #0 + 0x988c 08 80 bd 18 POPNE {R3,PC} + B 0x9890 20 30 9f e5 LDR R3, 0x98b8 + 0x9894 00 00 53 e3 CMP R3, #0 + 0x9898 01 00 00 0a BEQ 0x98a4 + B 0x989c 18 00 9f e5 LDR R0, 0x98bc + 0x98a0 23 ff ff eb BL 0x9534 + B 0x98a4 08 30 9f e5 LDR R3, 0x98b4 + 0x98a8 01 20 a0 e3 MOV R2, #1 + 0x98ac 00 20 c3 e5 STRB R2, [R3] + 0x98b0 08 80 bd e8 POP {R3,PC} + B 0x98b4 38 64 03 00 ANDEQ R6, R3, R8,LSR R4 + 0x98b8 00 00 00 00 ANDEQ R0, R0, R0 + 0x98bc cc dd 02 00 ANDEQ SP, R2, R12,ASR#27 + 0x98c0 08 40 2d e9 PUSH {R3,LR} + 0x98c4 34 30 9f e5 LDR R3, 0x9900 + 0x98c8 00 00 53 e3 CMP R3, #0 + 0x98cc 02 00 00 0a BEQ 0x98dc + B 0x98d0 2c 00 9f e5 LDR R0, 0x9904 + 0x98d4 2c 10 9f e5 LDR R1, 0x9908 + 0x98d8 cc ff ff eb BL 0x9810 + B 0x98dc 28 00 9f e5 LDR R0, 0x990c + 0x98e0 00 30 90 e5 LDR R3, [R0] + 0x98e4 00 00 53 e3 CMP R3, #0 + 0x98e8 08 80 bd 08 POPEQ {R3,PC} + B 0x98ec 1c 30 9f e5 LDR R3, 0x9910 + 0x98f0 00 00 53 e3 CMP R3, #0 + 0x98f4 08 80 bd 08 POPEQ {R3,PC} + B 0x98f8 33 ff 2f e1 BLX R3 + 0x98fc 08 80 bd e8 POP {R3,PC} +... +``` +Missing function entry points are easy to spot at 0x987c and 0x98c0. 
\ No newline at end of file From 5917f7d4102e69c351d19c82586162000b6d6de9 Mon Sep 17 00:00:00 2001 From: Henny Sipma Date: Sun, 11 Jan 2026 17:54:30 -0800 Subject: [PATCH 5/7] CMD: add option to show api calls to classification --- chb/app/InstrXData.py | 8 ++-- chb/cmdline/chkx | 8 ++++ chb/cmdline/commandutil.py | 89 +++++++++++++++++++++++++++----------- 3 files changed, 76 insertions(+), 29 deletions(-) diff --git a/chb/app/InstrXData.py b/chb/app/InstrXData.py index 3017943d..b7ba00cd 100644 --- a/chb/app/InstrXData.py +++ b/chb/app/InstrXData.py @@ -432,7 +432,9 @@ def has_call_target(self) -> bool: key = self.tags[0] if key.startswith("a:"): keyletters = key[2:] - return len(self.args) == len(keyletters) + 1 + return ( + len(self.args) == len(keyletters) + 1 + and self.args[-1] > 0) else: return False elif len(self.tags) >= 2 and self.tags[1] == "call": @@ -470,9 +472,9 @@ def has_indirect_call_target_exprs(self) -> bool: return (len(self.tags) == 2 and self.tags[1] == "u" and len(self.args) > 1) def call_target(self, ixd: "InterfaceDictionary") -> "CallTarget": - if self.has_call_target() and self.is_bx_call: + if self.has_call_target() and self.is_bx_call and self.args[-5] > 0: return ixd.call_target(self.args[-5]) - elif self.has_call_target(): + elif self.has_call_target() and self.args[-1] > 0: return ixd.call_target(self.args[-1]) else: raise UF.CHBError( diff --git a/chb/cmdline/chkx b/chb/cmdline/chkx index 4ea0fab4..a511cbf3 100755 --- a/chb/cmdline/chkx +++ b/chb/cmdline/chkx @@ -881,6 +881,14 @@ def parse() -> argparse.Namespace: resultsclassifyfunctions.add_argument( "classification_file", help="name of json classification file") + resultsclassifyfunctions.add_argument( + "--output", "-o", + required=True, + help="name of file to save results") + resultsclassifyfunctions.add_argument( + "--showapicalls", + action="store_true", + help="list classified functions individually in output file") 
resultsclassifyfunctions.set_defaults(func=UCC.results_classifyfunctions) # --- results functions --- diff --git a/chb/cmdline/commandutil.py b/chb/cmdline/commandutil.py index 9d9dec60..19136f83 100644 --- a/chb/cmdline/commandutil.py +++ b/chb/cmdline/commandutil.py @@ -930,6 +930,8 @@ def results_classifyfunctions(args: argparse.Namespace) -> NoReturn: xname: str = str(args.xname) classificationfile: str = str(args.classification_file) + showapicalls: bool = args.showapicalls + outputfilename: str = args.output with open(classificationfile, "r") as fp: classifier = json.load(fp) @@ -953,44 +955,76 @@ def results_classifyfunctions(args: argparse.Namespace) -> NoReturn: fns = app.appfunction_addrs classification: Dict[str, Dict[str, int]] = {} # faddr -> libcat -> count + classificationapi: Dict[str, Dict[str, Dict[str, int]]] = {} for faddr in fns: - classification.setdefault(faddr, {}) + if showapicalls: + classificationapi.setdefault(faddr, {}) + else: + classification.setdefault(faddr, {}) f = app.function(faddr) fcalls = f.call_instructions() for baddr in fcalls: for instr in fcalls[baddr]: tgtname = instr.call_target.name if tgtname in revclassifier: - category = revclassifier[tgtname] - classification[faddr].setdefault(category, 0) - classification[faddr][category] += 1 + if showapicalls: + category = revclassifier[tgtname] + classificationapi[faddr].setdefault(category, {}) + classificationapi[faddr][category].setdefault(tgtname, 0) + classificationapi[faddr][category][tgtname] += 1 + else: + category = revclassifier[tgtname] + classification[faddr].setdefault(category, 0) + classification[faddr][category] += 1 catfprevalence: Dict[str, int] = {} catcprevalence: Dict[str, int] = {} catstats: Dict[int, int] = {} singlecat: Dict[str, int] = {} doublecat: Dict[Tuple[str, str], int] = {} - for faddr in classification: - for cat in classification[faddr]: - catfprevalence.setdefault(cat, 0) - catcprevalence.setdefault(cat, 0) - catfprevalence[cat] += 1 - 
catcprevalence[cat] += classification[faddr][cat] - - numcats = len(classification[faddr]) - catstats.setdefault(numcats, 0) - catstats[numcats] += 1 - if numcats == 1: - cat = list(classification[faddr].keys())[0] - singlecat.setdefault(cat, 0) - singlecat[cat] += 1 - - if numcats == 2: - cats = sorted(list(classification[faddr].keys())) - cattuple = (cats[0], cats[1]) - doublecat.setdefault(cattuple, 0) - doublecat[cattuple] += 1 + + if showapicalls: + for faddr in classificationapi: + for cat in classificationapi[faddr]: + catfprevalence.setdefault(cat, 0) + catcprevalence.setdefault(cat, 0) + catfprevalence[cat] += 1 + catcprevalence[cat] += sum(classificationapi[faddr][cat].values()) + numcats = len(classificationapi[faddr]) + catstats.setdefault(numcats, 0) + catstats[numcats] += 1 + if numcats == 1: + cat = list(classificationapi[faddr].keys())[0] + singlecat.setdefault(cat, 0) + singlecat[cat] = 1 + + if numcats == 2: + cats = sorted(list(classificationapi[faddr].keys())) + cattuple = (cats[0], cats[1]) + doublecat.setdefault(cattuple, 0) + doublecat[cattuple] += 1 + else: + + for faddr in classification: + for cat in classification[faddr]: + catfprevalence.setdefault(cat, 0) + catcprevalence.setdefault(cat, 0) + catfprevalence[cat] += 1 + catcprevalence[cat] += classification[faddr][cat] + numcats = len(classification[faddr]) + catstats.setdefault(numcats, 0) + catstats[numcats] += 1 + if numcats == 1: + cat = list(classification[faddr].keys())[0] + singlecat.setdefault(cat, 0) + singlecat[cat] += 1 + + if numcats == 2: + cats = sorted(list(classification[faddr].keys())) + cattuple = (cats[0], cats[1]) + doublecat.setdefault(cattuple, 0) + doublecat[cattuple] += 1 for (m, c) in sorted(catstats.items()): print(str(m).rjust(5) + ": " + str(c).rjust(5)) @@ -1006,9 +1040,12 @@ def results_classifyfunctions(args: argparse.Namespace) -> NoReturn: classificationresults: Dict[str, Any] = {} classificationresults["catfprevalence"] = catfprevalence 
classificationresults["catcprevalence"] = catcprevalence - classificationresults["functions"] = classification + if showapicalls: + classificationresults["functions"] = classificationapi + else: + classificationresults["functions"] = classification - with open("classification_results.json", "w") as fp: + with open(outputfilename, "w") as fp: json.dump(classificationresults, fp, indent=2) exit(0) From c75db1f510b0ba742e490e7ca9cbf99af4d7c694 Mon Sep 17 00:00:00 2001 From: Henny Sipma Date: Mon, 12 Jan 2026 15:26:26 -0800 Subject: [PATCH 6/7] CMD: command to collect constant string arguments --- chb/cmdline/chkx | 7 ++++ chb/cmdline/commandutil.py | 5 ++- chb/cmdline/jsonresultutil.py | 5 +-- chb/cmdline/reportcmds.py | 54 +++++++++++++++++++++++++++++++ chb/invariants/FnVarDictionary.py | 3 +- 5 files changed, 70 insertions(+), 4 deletions(-) diff --git a/chb/cmdline/chkx b/chb/cmdline/chkx index a511cbf3..b0cf38be 100755 --- a/chb/cmdline/chkx +++ b/chb/cmdline/chkx @@ -1210,6 +1210,13 @@ def parse() -> argparse.Namespace: + " source for callgraph path")) report_calls.set_defaults(func=REP.report_calls_cmd) + # -- report arguments + report_arguments = reportparsers.add_parser("string_arguments") + report_arguments.add_argument("xname", help="name of executable") + report_arguments.add_argument( + "--output", "-o", required=True, help="name of json output file") + report_arguments.set_defaults(func=REP.report_string_arguments) + # -- report function api's report_functionapis = reportparsers.add_parser("function_apis") report_functionapis.add_argument("xname", help="name of executable") diff --git a/chb/cmdline/commandutil.py b/chb/cmdline/commandutil.py index 19136f83..d8dbc09c 100644 --- a/chb/cmdline/commandutil.py +++ b/chb/cmdline/commandutil.py @@ -1045,8 +1045,11 @@ def results_classifyfunctions(args: argparse.Namespace) -> NoReturn: else: classificationresults["functions"] = classification + jresult = JU.jsonok("none", classificationresults) + 
jresult["meta"]["app"] = JU.jsonappdata(xinfo, includepath=False) + with open(outputfilename, "w") as fp: - json.dump(classificationresults, fp, indent=2) + json.dump(jresult, fp, indent=2) exit(0) diff --git a/chb/cmdline/jsonresultutil.py b/chb/cmdline/jsonresultutil.py index 0b84d8f5..02e0ca49 100644 --- a/chb/cmdline/jsonresultutil.py +++ b/chb/cmdline/jsonresultutil.py @@ -77,9 +77,10 @@ def jsonok(schemaname: str, content: Dict[str, Any]) -> Dict[str, Any]: return jresult -def jsonappdata(xinfo: "XInfo") -> Dict[str, str]: +def jsonappdata(xinfo: "XInfo", includepath=True) -> Dict[str, str]: result: Dict[str, str] = {} - result["path"] = xinfo.path + if includepath: + result["path"] = xinfo.path result["file"] = xinfo.file result["md5"] = xinfo.md5 result["arch"] = xinfo.architecture diff --git a/chb/cmdline/reportcmds.py b/chb/cmdline/reportcmds.py index 73fd9dde..df442494 100644 --- a/chb/cmdline/reportcmds.py +++ b/chb/cmdline/reportcmds.py @@ -72,6 +72,7 @@ from chb.app.AppAccess import AppAccess from chb.app.BasicBlock import BasicBlock from chb.app.Instruction import Instruction + from chb.invariants.XConstant import XIntConst from chb.mips.MIPSInstruction import MIPSInstruction from chb.models.BTerm import BTerm, BTermArithmetic from chb.models.FunctionSummary import FunctionSummary @@ -602,6 +603,59 @@ def report_calls_cmd(args: argparse.Namespace) -> NoReturn: exit(1) +def report_string_arguments(args: argparse.Namespace) -> NoReturn: + + # arguments + xname: str = args.xname + outputfilename: str = args.output + + try: + (path, xfile) = UC.get_path_filename(xname) + UF.check_analysis_results(path, xfile) + except UF.CHBError as e: + print(str(e.wrap())) + exit(1) + + xinfo = XI.XInfo() + xinfo.load(path, xfile) + + app = UC.get_app(path, xfile, xinfo) + fns = app.functions + + argvals: Dict[str, Dict[str, Any]] = {} + + for (faddr, f) in fns.items(): + fcalls = f.call_instructions() + for baddr in fcalls: + for instr in fcalls[baddr]: + callee = 
instr.call_target.name + callargs = instr.call_arguments + for (index, callarg) in enumerate(callargs): + if callarg.is_string_reference: + constcallarg = cast("XprConstant", callarg).constant + intcallarg = cast("XIntConst", constcallarg) + argvals.setdefault(faddr, {}) + argvals[faddr].setdefault("call-string-args", []) + argrec = { + "iaddr": instr.iaddr, + "callee": callee, + "index": index + 1, + "value": intcallarg.string_reference() + } + argvals[faddr]["call-string-args"].append(argrec) + + result: Dict[str, Any] = {} + result["functions"] = argvals + + jresult = JU.jsonok("none", result) + jresult["meta"]["app"] = JU.jsonappdata(xinfo, includepath=False) + + with open(outputfilename, "w") as fp: + json.dump(jresult, fp, indent=2) + + exit(0) + + def report_function_apis(args: argparse.Namespace) -> NoReturn: # arguments diff --git a/chb/invariants/FnVarDictionary.py b/chb/invariants/FnVarDictionary.py index 5a044d5a..58842ffb 100644 --- a/chb/invariants/FnVarDictionary.py +++ b/chb/invariants/FnVarDictionary.py @@ -43,6 +43,7 @@ import chb.util.fileutil as UF import chb.util.IndexedTable as IT +from chb.util.loggingutil import chklogger if TYPE_CHECKING: from chb.api.InterfaceDictionary import InterfaceDictionary @@ -201,4 +202,4 @@ def initialize(self, xnode: ET.Element) -> None: t.reset() t.read_xml(xtable, "n") else: - raise UF.CHBError("Var dictionary table " + t.name + " not found") + chklogger.logger.error("Var dictionary table %s not found", t.name) From f867714a5bf85f55005956e38ba5a00be48941ba Mon Sep 17 00:00:00 2001 From: Henny Sipma Date: Thu, 22 Jan 2026 13:53:13 -0800 Subject: [PATCH 7/7] XPR: handle no-offset separately --- chb/app/CHVersion.py | 2 +- chb/invariants/XXprUtil.py | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/chb/app/CHVersion.py b/chb/app/CHVersion.py index 34978db2..ad754b54 100644 --- a/chb/app/CHVersion.py +++ b/chb/app/CHVersion.py @@ -1 +1 @@ -chbversion: str = "0.3.0-20260105" +chbversion: 
str = "0.3.0-20260122" diff --git a/chb/invariants/XXprUtil.py b/chb/invariants/XXprUtil.py index a204b473..855efea8 100644 --- a/chb/invariants/XXprUtil.py +++ b/chb/invariants/XXprUtil.py @@ -449,6 +449,8 @@ def memory_variable_to_lval_expression( offset = cast("VMemoryOffsetFieldOffset", offset) astoffset: AST.ASTOffset = field_offset_to_ast_offset( offset, xdata, iaddr, astree, anonymous=anonymous) + elif offset.is_no_offset: + astoffset = nooffset elif offset.is_array_index_offset: offset = cast("VMemoryOffsetArrayIndexOffset", offset) astoffset = array_offset_to_ast_offset( @@ -460,6 +462,11 @@ def memory_variable_to_lval_expression( return astree.mk_memref_expr( astbase, offset=astoffset, anonymous=anonymous) + elif offset.is_no_offset: + astlval = xvariable_to_ast_def_lval_expression( + base.basevar, xdata, iaddr, astree, anonymous=anonymous) + return astree.mk_memref_expr(astlval, anonymous=anonymous) + elif ( offset.is_field_offset or offset.is_array_index_offset @@ -1952,6 +1959,8 @@ def basevar_variable_to_ast_lval( offset = cast("VMemoryOffsetArrayIndexOffset", offset) astoffset = array_offset_to_ast_offset( offset, xdata, iaddr, astree, anonymous=anonymous) + elif offset.is_no_offset: + astoffset = nooffset elif offset.is_constant_value_offset: astoffset = astree.mk_scalar_index_offset(offset.offsetvalue()) else: