diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
new file mode 100644
index 0000000..071d077
--- /dev/null
+++ b/.github/workflows/test.yml
@@ -0,0 +1,48 @@
+name: Tests  # CI: build the extractor on Linux and run the dune test suite
+
+on:
+  push:
+    branches: [ main, test-suite ]
+  pull_request:
+    branches: [ main ]
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up OCaml
+        uses: ocaml/setup-ocaml@v3
+        with:
+          ocaml-compiler: 5.1.1
+          dune-cache: true
+          opam-repositories: |
+            default: https://github.com/ocaml/opam-repository.git
+
+      - name: Install dependencies  # --with-test: this job runs the test suite, so test-only deps are needed too
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y npm xz-utils libomp-dev llvm-dev
+          opam install . --deps-only --with-test --update-invariant
+          npm install --no-save typescript browserify pug-lexer pug-parser pug-walk
+
+      - name: Install QuickJS  # built into ./quickjs, where src/quickjs/dune expects it
+        run: |
+          curl -fsSL https://bellard.org/quickjs/quickjs-2021-03-27.tar.xz -o quickjs.tar.xz  # -f: fail on HTTP errors instead of saving the error page
+          tar xf quickjs.tar.xz && rm quickjs.tar.xz
+          mv quickjs-2021-03-27 quickjs
+          cd quickjs && make
+
+      - name: Install Flow  # the parser sources are symlinked into src/ (these links are git-ignored)
+        run: |
+          git clone --branch v0.183.1 --depth 1 https://github.com/facebook/flow.git flow
+          ln -s "$(pwd)/flow/src/parser" src/flow_parser
+          ln -s "$(pwd)/flow/src/third-party/sedlex" src/sedlex
+          ln -s "$(pwd)/flow/src/hack_forked/utils/collections" src/collections
+
+      - name: Run tests  # strings/ must exist for the tests/dune rule to regenerate the root translation files
+        run: |
+          mkdir -p strings
+          opam exec -- dune runtest tests/
diff --git a/.gitignore b/.gitignore
index f6b1e6b..da36571 100644
--- a/.gitignore
+++ b/.gitignore
@@ -23,3 +23,5 @@ bad/
src/flow_parser
src/sedlex
src/collections
+
+tests/integration_test_run/
diff --git a/AGENTS.md b/AGENTS.md
new file mode 100644
index 0000000..44ea1c5
--- /dev/null
+++ b/AGENTS.md
@@ -0,0 +1,89 @@
+# Agent Information - String Extractor
+
+This repository contains an OCaml-based internationalization (i18n) string extraction tool. It parses source files (JS, TS, Vue, Pug, HTML) and extracts strings for translation management.
+
+## Documentation
+
+- **[ARCHITECTURE.md](ARCHITECTURE.md)**: Contains a deep-dive into the codebase layout, directory structure, and a comprehensive API reference. **Read this file first** when:
+ - Starting a new task to understand which files are relevant.
+ - Investigating the impact of changes across the system.
+ - Looking for specific functionality or function definitions before searching.
+
+- **[DEVELOPMENT.md](DEVELOPMENT.md)**: Contains instructions for environment setup, build processes for various platforms, and release workflows. **Read this file first** when:
+ - Setting up the development environment or installing dependencies (OCaml, JS, QuickJS).
+ - Building the project for development or release.
+ - Executing the tool for manual verification or testing.
+ - Managing version numbers or release artifacts.
+
+## Project Overview
+
+- **Language**: OCaml (5.1.1) with some C++ (QuickJS bridge) and JavaScript (parsers via Browserify).
+- **Architecture**:
+ - `src/cli/`: Main entry point, command-line interface, and output generation logic.
+ - `src/parsing/`: OCaml parsers using `Angstrom` for custom formats and `Flow_parser` for JS.
+ - `src/quickjs/`: Bridge to QuickJS to run JavaScript-based parsers (TypeScript/Pug) from OCaml.
+ - `src/utils/`: Common utilities for collection, timing, and I/O.
+- **Key Libraries**: `Core`, `Eio` (concurrency), `Angstrom` (parsing), `Yojson`, `Ppx_jane`.
+
+## Essential Commands
+
+### Build
+- **Development build**: `dune build src/cli/strings.exe`
+- **Watch mode**: `dune build src/cli/strings.exe -w`
+- **Release build (MacOS)**: `DUNE_PROFILE=release dune build src/cli/strings.exe`
+- **Full release cycle**: See `DEVELOPMENT.md` for `cp`, `strip`, and Docker commands.
+
+### Run
+- After building: `./_build/default/src/cli/strings.exe [directory-to-extract-from]`
+- The CLI expects to be run from the root of a project containing a `strings/` directory (or it will create one if a `.git` folder is present).
+
+### Installation (Dev Setup)
+Refer to `DEVELOPMENT.md` for specific `opam` and `npm` setup steps, as the project has several external dependencies (Flow, QuickJS, pug-lexer, etc.).
+
+## Code Conventions & Patterns
+
+### Parsing Strategy
+1. **Direct Parsers**: Simple formats like `.strings`, `HTML`, and basic `Vue` tags are parsed using `Angstrom` in `src/parsing/`.
+2. **JS/TS Parsing**:
+   - JavaScript uses `Flow_parser` and a custom AST walker in `src/parsing/js_ast.ml`.
+ - TypeScript uses the official TS parser running inside QuickJS (`src/quickjs/`).
+3. **Pug Parsing**: Has a "fast" OCaml implementation (`src/parsing/pug.ml`) and a "slow" official Pug implementation via QuickJS (`src/quickjs/`).
+
+### Extraction Pattern
+- Content is extracted into a `Utils.Collector.t`.
+- The collector tracks found strings, potential scripts (to be further parsed), and file errors.
+- **Convention**: Strings found inside `L("...")` calls are treated as translations in JS/TS.
+
+### Concurrency
+- Uses OCaml 5's `Eio` for multicore processing.
+- Parallel traversal of directories is handled in `src/cli/strings.ml` via `Fiber.List.iter` and `Eio.Executor_pool`.
+- JS workers (QuickJS) are managed via a pool in `src/quickjs/quickjs.ml`.
+
+## Important Gotchas
+
+- **QuickJS Dependency**: Requires a compiled `quickjs` directory at the project root for building. `dune` rules in `src/quickjs/dune` copy headers and libraries from there.
+- **Generated Headers**: `src/quickjs/runtime.h` is generated from `src/quickjs/parsers.js` using `browserify` and `qjsc`.
+- **Linking**: MacOS builds use specific link flags (e.g., `ld64.lld`) defined in `src/cli/link_flags.*`.
+- **OCamlFormat**: `.ocamlformat` is present; ensure you format OCaml code before submitting.
+- **Memory Safety**: Be cautious with C++ FFI code in `src/quickjs/quickjs.cpp`, particularly regarding OCaml's GC interaction (`CAMLparam`, `CAMLreturn`, `caml_release_runtime_system`).
+
+## Testing Approach
+
+- **Inline Tests**: The project uses `ppx_inline_test`. Parsers in `src/parsing/` can be tested directly within the OCaml files or in the `tests/` directory.
+- **Test Suite**: A standard test suite is located in `tests/test_runner.ml`. It covers JS, HTML, Pug, and `.strings` file parsing.
+- **Integration Tests**: Verification can be performed by running the built binary against fixtures in `tests/fixtures/` and checking the generated output in the `strings/` directory.
+- **Debug Flags**: Use `--show-debugging` or `--debug-pug` / `--debug-html` flags in the CLI to inspect internal parsing results.
+
+## Troubleshooting
+
+### "File modified since last read"
+If you receive an error stating that a file has been **"modified since it was last read"**, it usually indicates a discrepancy between the file's filesystem timestamp and the internal state tracking.
+
+**Example Error:**
+> `Edit failed: The file '/path/to/file' was modified since it was last read. Please read the file again before trying to edit it.`
+
+**Recommended Fix:**
+1. Execute `touch filename` to reset the file's modification time to the current system time.
+2. Re-read the file using the `view` tool.
+3. Attempt the edit again.
+
diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md
new file mode 100644
index 0000000..14e9c41
--- /dev/null
+++ b/ARCHITECTURE.md
@@ -0,0 +1,80 @@
+# Architecture Documentation - String Extractor
+
+This document provides a high-level overview of the String Extractor's architecture, directory structure, and internal APIs.
+
+## Project Entry Point
+
+The main entry point of the application is **`src/cli/strings.ml`**. It handles command-line argument parsing using `Core.Command`, sets up the `Eio` runtime, and initiates the file traversal process.
+
+## Directory Structure
+
+```text
+/
+├── src/
+│ ├── cli/ # Main CLI application logic
+│ │ ├── strings.ml # CLI entry point, traversal coordination
+│ │ ├── vue.ml # Vue-specific parsing and extraction logic
+│ │ └── generate.ml # Localization file generation (.strings, .json)
+│ ├── parsing/ # Core parsers using Angstrom and Flow
+│ │ ├── basic.ml # Common parsing utilities and combinators
+│ │ ├── js_ast.ml # Flow AST walker for string extraction
+│ │ ├── js.ml # JavaScript string extraction entry point
+│ │ ├── pug.ml # Native Pug template parsing
+│ │ ├── html.ml # HTML template parsing
+│ │ ├── strings.ml # .strings file parsing logic
+│ │ └── ... # Other specialized parsers (vue blocks, styles)
+│ ├── quickjs/ # Interface to QuickJS for JS/TS/Pug parsing
+│ │ ├── quickjs.ml # OCaml FFI to QuickJS
+│ │ ├── quickjs.cpp # C++ implementation of the bridge
+│ │ └── parsers.js # JS-based parsers running in QuickJS
+│ └── utils/ # Shared utility modules
+│ ├── collector.ml # State container for collected strings/errors
+│ ├── io.ml # I/O helpers
+│ ├── timing.ml # Performance measurement
+│ └── exception.ml # Exception handling
+├── strings/ # Directory where .strings files are managed
+├── dune-project # Dune build system configuration
+└── README.md # Project overview and usage instructions
+```
+
+## Core API Reference
+
+### `src/cli/`
+- **`Strings.main`**: Coordinates the entire run, including directory traversal and result generation.
+- **`Vue.parse`**: Splits a `.vue` file into its constituent parts (template, script, style).
+- **`Generate.write_english`**: Creates `english.strings` and `english.json` from the collected strings.
+- **`Generate.write_other`**: Updates existing translations for other languages.
+
+### `src/parsing/`
+- **`Parsing.Basic`**: Provides foundational Angstrom parsers for whitespace, strings, and standard error handling.
+- **`Parsing.Js.extract_to_collector`**: Entry point for scanning JavaScript source code.
+- **`Parsing.Js_ast.extract`**: A comprehensive walker for the Flow AST that identifies and extracts strings from `L("...")` calls.
+- **`Parsing.Pug.collect`**: Traverses the native Pug AST to extract strings.
+- **`Parsing.Strings.parse`**: Parses existing `.strings` files into a lookup table.
+
+### `src/quickjs/`
+- **`Quickjs.extract_to_collector`**: Offloads extraction to QuickJS for TypeScript and advanced Pug templates.
+
+### `src/utils/`
+- **`Utils.Collector.create`**: Initializes a new string collection state for a specific file. (type `t = { path: string; strings: string Queue.t; ... }`)
+- **`Utils.Collector.blit_transfer`**: Merges results from one collector into another.
+
+## Control Flow
+1. **Initiation**: `strings.exe` starts, parses CLI flags, and identifies the target directory.
+2. **Traversal**: Uses `Eio` to recursively walk the directory tree.
+3. **Dispatch**: For each supported file extension, the corresponding parser is invoked: the native parsers in `src/parsing/`, or `src/quickjs/` for TypeScript and the "slow" Pug implementation.
+4. **Collection**: Parsers find strings (usually inside `L()`) and add them to a `Collector.t`.
+5. **Generation**: The `Generate` module (`src/cli/generate.ml`) aggregates strings from all collectors and updates the `strings/` directory.
+
+## Testing Setup
+
+The project implements a multi-layered testing strategy:
+
+1. **Inline Tests**: Using `ppx_inline_test`, logic can be tested directly within the source files. This is primarily used for parser validation in `src/parsing/`.
+2. **Standard Test Suite**: Located in `tests/test_runner.ml`, this suite uses `ppx_inline_test` (`let%test_unit`) and `ppx_assert` (`[%test_eq: ...]`) to verify:
+ - JavaScript string extraction via `Flow_parser`.
+ - HTML and Pug extraction via `SZXX` and `Angstrom`.
+ - Apple-style `.strings` file parsing.
+3. **Integration Testing**: The `tests/fixtures/` directory contains sample files of all supported types. The CLI can be run against these fixtures to verify end-to-end extraction and output generation (`.strings` and `.json` files).
+
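+The `tests/dune` file configures the test library, enables inline tests for it, and defines a `runtest` rule that runs the built CLI against `tests/fixtures/` and checks the generated French output.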
diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md
index e4e4e69..613d9d5 100644
--- a/DEVELOPMENT.md
+++ b/DEVELOPMENT.md
@@ -1,11 +1,14 @@
## Local development
### Setup
+
From the root of the repo:
+
```sh
brew install opam libomp llvm
opam switch create . ocaml-variants.5.1.1+options --no-install
+eval $(opam env)
opam install . --deps-only -t
# Remove old Flow version
@@ -28,6 +31,7 @@ cd quickjs && make && cd -
```
### MacOS - Development
+
```sh
# Build
dune build src/cli/strings.exe -w
@@ -37,6 +41,7 @@ cp _build/default/src/cli/strings.exe strings.mac && ./strings.mac
```
### MacOS - Build & Run
+
```sh
# Don't forget to update the version number in [strings.ml]
@@ -46,6 +51,7 @@ rm -f strings.mac && dune clean && DUNE_PROFILE=release dune build src/cli/strin
```
### Docker (Linux) - Build & Run
+
```sh
# Don't forget to update the version number in [strings.ml]
@@ -68,3 +74,14 @@ docker run -it --rm \
## apt-get update && apt-get install musl
## /app/strings.linux frontend
```
+
+### Testing
+
+To run the automated tests and generate the translation files, first create the `strings/` directory at the project root, then run the tests. Ensure your opam environment is initialized:
+
+```sh
+eval $(opam env)
+mkdir -p strings && opam exec -- dune runtest tests/
+```
+
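+This command builds the project, executes the test suite, and populates the `strings/` directory with `english.strings` (extracted from fixtures) and merged `french.strings`. Note that any existing `strings/french.strings` at the project root is overwritten with the test's seed translations before the merge.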
diff --git a/dune b/dune
index 7d80e55..89f3c7b 100644
--- a/dune
+++ b/dune
@@ -1,4 +1,4 @@
-(data_only_dirs node_modules quickjs)
+(data_only_dirs node_modules quickjs flow) ; treated as plain data: dune does not evaluate dune files inside these trees (flow/ is the Flow checkout cloned at the repo root)
(env
(dev
(flags (:standard -warn-error -A))
diff --git a/src/cli/link_flags.linux.dev.dune b/src/cli/link_flags.linux.dev.dune
new file mode 100644
index 0000000..6a452c1
--- /dev/null
+++ b/src/cli/link_flags.linux.dev.dune
@@ -0,0 +1 @@
+()
diff --git a/src/quickjs/dune b/src/quickjs/dune
index d40ef41..887b7ee 100644
--- a/src/quickjs/dune
+++ b/src/quickjs/dune
@@ -53,8 +53,25 @@
(rule
(targets libomp.a)
(action (bash "
- cp /usr/local/Cellar/libomp/17.0.6/lib/libomp.a . &> /dev/null \
- || cp /usr/lib/libgomp.a libomp.a
+    OUT=\"libomp.a\"
+    # First existing candidate wins. Entries may contain globs; the unquoted loop below expands them.
+    CANDIDATES=\"
+      /usr/local/Cellar/libomp/*/lib/libomp.a
+      /opt/homebrew/opt/libomp/lib/libomp.a
+      /usr/lib/x86_64-linux-gnu/libomp.a
+      /usr/lib/x86_64-linux-gnu/libgomp.a
+      /usr/lib/libgomp.a
+      /usr/lib/gcc/x86_64-linux-gnu/*/libgomp.a
+      /usr/lib/gcc/aarch64-redhat-linux/*/libgomp.a
+    \"
+    for candidate in $CANDIDATES; do
+      if [ -f \"$candidate\" ]; then
+        cp \"$candidate\" \"$OUT\"
+        exit 0
+      fi
+    done
+    echo \"Error: Could not find libomp.a or libgomp.a\" >&2
+    exit 1
"))
(mode standard)
)
diff --git a/tests/dune b/tests/dune
new file mode 100644
index 0000000..a0436c6
--- /dev/null
+++ b/tests/dune
@@ -0,0 +1,54 @@
+(library ; unit tests for the parsers, built from the .ml files in this directory
+ (name parsing_tests)
+ (inline_tests) ; lets `dune runtest` execute the let%test_unit cases of this library
+ (libraries parsing utils core eio_main)
+ (preprocess (pps ppx_jane ppx_inline_test)) ; NOTE(review): ppx_jane presumably already bundles ppx_inline_test - confirm before trimming
+)
+
+(rule
+ (alias runtest)
+ (deps
+  ../src/cli/strings.exe
+  (source_tree fixtures))
+ (action
+  (bash "
+    # End-to-end check: run the CLI on fixtures/ with a seeded French file, then verify
+    # that the existing translation survives and untranslated strings are flagged.
+    TMP_DIR=\"integration_test_run\"
+    rm -rf \"$TMP_DIR\"
+    mkdir -p \"$TMP_DIR/strings\" \"$TMP_DIR/.git\"
+    printf '\"Hello from HTML\" = \"Bonjour de HTML\";\n' > \"$TMP_DIR/strings/french.strings\"
+    cp -r fixtures \"$TMP_DIR/\"
+    # Keep the CLI output so that a crash is reported as a crash, not as a lost translation.
+    if ! (cd \"$TMP_DIR\" && ../../src/cli/strings.exe fixtures --output strings) > \"$TMP_DIR/run.log\" 2>&1; then
+      echo \"Error: strings.exe failed on the fixtures\"; cat \"$TMP_DIR/run.log\"
+      exit 1
+    fi
+
+    if ! grep -q \"Bonjour de HTML\" \"$TMP_DIR/strings/french.strings\"; then
+      echo \"Error: French translation lost in .strings\"
+      exit 1
+    fi
+    if ! grep -q \"Bonjour de HTML\" \"$TMP_DIR/strings/french.json\"; then
+      echo \"Error: French translation lost in .json\"
+      exit 1
+    fi
+    if ! grep -q \"MISSING TRANSLATION - demo.pug\" \"$TMP_DIR/strings/french.strings\"; then
+      echo \"Error: Missing translation marker not found in .strings\"
+      exit 1
+    fi
+
+    echo \"✅ French integration test passed\"
+    rm -rf \"$TMP_DIR\"
+
+    # Optional convenience: if <source root>/strings exists, regenerate it from the fixtures.
+    # NOTE: this overwrites strings/french.strings in the real source tree.
+    # The source root is assumed to be three levels up, i.e. the action is assumed to run
+    # in _build/default/tests; if strings/ is not found there, this step is skipped.
+    ROOT_SRC=\"$(cd ../../.. && pwd)\"
+    if [ -d \"$ROOT_SRC/strings\" ]; then
+      # Extraction generates 5 strings from the fixtures; 3 translations are pre-populated here.
+      printf '\"Hello from HTML\" = \"Bonjour de HTML\";\n\"Hello from JS\" = \"Bonjour de JS\";\n\"Hello from Pug\" = \"Bonjour de Pug\";\n' > \"$ROOT_SRC/strings/french.strings\"
+      ../src/cli/strings.exe \"$ROOT_SRC/tests/fixtures\" --output \"$ROOT_SRC/strings\"
+    fi
+    ")))
diff --git a/tests/fixtures/demo.html b/tests/fixtures/demo.html
new file mode 100644
index 0000000..6c9bdfd
--- /dev/null
+++ b/tests/fixtures/demo.html
@@ -0,0 +1 @@
+Hello from HTML
diff --git a/tests/fixtures/demo.js b/tests/fixtures/demo.js
new file mode 100644
index 0000000..6a9852c
--- /dev/null
+++ b/tests/fixtures/demo.js
@@ -0,0 +1 @@
+L('Hello from JS');
diff --git a/tests/fixtures/demo.pug b/tests/fixtures/demo.pug
new file mode 100644
index 0000000..e4b3a99
--- /dev/null
+++ b/tests/fixtures/demo.pug
@@ -0,0 +1 @@
+i18n Hello from Pug
diff --git a/tests/fixtures/demo.vue b/tests/fixtures/demo.vue
new file mode 100644
index 0000000..7f61388
--- /dev/null
+++ b/tests/fixtures/demo.vue
@@ -0,0 +1,13 @@
+
+Hello from Vue Template
+
+
+
diff --git a/tests/test_runner.ml b/tests/test_runner.ml
new file mode 100644
index 0000000..ee177db
--- /dev/null
+++ b/tests/test_runner.ml
@@ -0,0 +1,70 @@
+open! Core
+open Parsing
+
+let%test_unit "js_extraction_basic" =
+  (* Two L(...) calls in one source must both be collected; the comparison ignores order. *)
+  let sorted = List.sort ~compare:String.compare in
+  let collector = Utils.Collector.create ~path:"test.js" in
+  Js.extract_to_collector collector "L('Hello World'); L('Foo Bar');";
+  [%test_eq: string list] (sorted (Queue.to_list collector.strings)) (sorted ["Hello World"; "Foo Bar"])
+
+let%test_unit "js_extraction_nested" =
+  (* An L(...) call nested inside a function body and an [if] block is still found. *)
+  let collector = Utils.Collector.create ~path:"test.js" in
+  "function test() { if (true) { return L('Nested'); } }"
+  |> Js.extract_to_collector collector;
+  [%test_eq: string list] (Queue.to_list collector.strings) ["Nested"]
+
+let%test_unit "js_extraction_no_match" =
+  (* A string passed to console.log rather than L(...) must not be collected. *)
+  let collector = Utils.Collector.create ~path:"test.js" in
+  let () = Js.extract_to_collector collector "console.log('Hello');" in
+  let found = Queue.to_list collector.strings in
+  [%test_eq: string list] found []
+
+let%test_unit "strings_parsing" =
+  (* Parses an in-memory .strings source. Uses [Eio_main.run] because [eio_main] is the Eio library this test library links against. *)
+  Eio_main.run @@ fun _env ->
+  let path = "test.strings" in
+  let content = {|
+/* Comment */
+"Hello" = "Bonjour";
+"World" = "Monde";
+|} in
+  let flow = Eio.Flow.string_source content in
+  let table = Strings.parse ~path flow in
+  [%test_eq: string option] (Hashtbl.find table "Hello") (Some "Bonjour");
+  [%test_eq: string option] (Hashtbl.find table "World") (Some "Monde");
+  [%test_eq: string option] (Hashtbl.find table "Missing") None
+
+let%test_unit "french_strings_parsing" =
+  (* Accented values and {placeholder} keys must come back unchanged. [Eio_main] is the Eio library this test library links against. *)
+  Eio_main.run @@ fun _env ->
+  let content = {|
+/* Accented characters */
+"Logout" = "Déconnexion";
+"You and {count} others" = "Vous et {count} autres";
+"Settings" = "Paramètres";
+|} in
+  let flow = Eio.Flow.string_source content in
+  let table = Strings.parse ~path:"french.strings" flow in
+  [%test_eq: string option] (Hashtbl.find table "Logout") (Some "Déconnexion");
+  [%test_eq: string option] (Hashtbl.find table "You and {count} others") (Some "Vous et {count} autres");
+  [%test_eq: string option] (Hashtbl.find table "Settings") (Some "Paramètres")
+
+let%test_unit "html_extraction" =
+  (* Runs the HTML parser on a small source and checks the single string handed to the collector. *)
+  let path = "test.html" in
+  let collector = Utils.Collector.create ~path in
+  let on_ok parsed = Parsing.Html.collect collector parsed in
+  Parsing.Basic.exec_parser ~on_ok Parsing.Html.parser ~path ~language_name:"HTML" "Hello HTML";
+  [%test_eq: string list] (Queue.to_list collector.strings) ["Hello HTML"]
+
+let%test_unit "pug_extraction" =
+  (* The text following an [i18n] tag in a Pug template is collected. *)
+  let path = "test.pug" in
+  let collector = Utils.Collector.create ~path in
+  let pug_parser = Parsing.Pug.parser (Parsing.Basic.make_string_parsers ()) in
+  let on_ok parsed = Parsing.Pug.collect collector parsed in
+  Parsing.Basic.exec_parser ~on_ok pug_parser ~path ~language_name:"Pug" "i18n Hello Pug";
+  [%test_eq: string list] (Queue.to_list collector.strings) ["Hello Pug"]