microsoft · njukenanli · Jun 9, 2026 · Jun 7, 2026 · Jun 8, 2026 · Jun 8, 2026
diff --git a/data/examples/config.json b/data/examples/config.json
@@ -4,7 +4,7 @@
         "organize": true
     },
     "model_config": {
-        "model": "openai/gpt-5.4"
+        "model": "gpt-5.5"
     },
     "workspace_root": "data/examples/",
     "dataset": "data/examples/dataset.jsonl",

diff --git a/data/examples/playground/rayon-rs__rayon-5142c8d/result.json b/data/examples/playground/rayon-rs__rayon-5142c8d/result.json
diff --git a/data/examples/playground/robbert-vdh__nih-plug-28b149e/result.json b/data/examples/playground/robbert-vdh__nih-plug-28b149e/result.json
diff --git a/data/examples/result.jsonl b/data/examples/result.jsonl
diff --git a/docs/Development.md b/docs/Development.md
@@ -15,7 +15,7 @@ pip install -e .
 
 ## Run RepoLaunch
 
-We provide an example input file `data/examples/dataset.jsonl` and a run config `data/examples/config.json` in [examples](../data/examples) to help you quickly go through the launch process.
+We provide an example input file `data/examples/dataset.jsonl` and a run config `data/examples/config.json` in [examples](../data/examples) to help you quickly go through the launch process. Expected output files are `data/examples/result.jsonl` and `data/examples/playground/`.
 
 Before getting started, please set your `TAVILY_API_KEY` environment variable. We use [tavily](https://www.tavily.com/) for LLM search engine support.
 
@@ -93,7 +93,7 @@ The configs required for this step:
 
 | Field              | Type    |  Description                                                                 |
 |--------------------|---------|-----------------------------------------------------------------------------|
-| `mode`             | dict     |   default to {"setup": true, "organize": false}, set to {"setup": true, "organize": true} to do the two steps together, or set to {"setup": false, "organize": true} to do the second step separately AFTER the first step is DONE    |
+| `mode`             | dict     |   defaults to `{"setup": true, "organize": false}`; set to `{"setup": true, "organize": true}` to do the two steps together, or set to `{"setup": false, "organize": true}` to do the second step separately AFTER the first step is DONE. By default, the organize stage also runs the `testone` step, which derives the command used to run each single test; specify `"mode": {"setup": true, "organize": true, "get_pertest_cmd": false}` to disable this step in the organize stage.  |
 | `max_steps_organize` | integer |   how many steps agent can attemp to organize the commands, default 20   |
 
 
@@ -107,14 +107,15 @@ LLM API logs (input/output/token_count/cost) will be saved in `{workspace_root}/
 
 | Field            | Description                                                                                      |
 |------------------|--------------------------------------------------------------------------------------------------|
-| `instance_id`    | Unique identifier of the instance                                                                |
+| `instance_id`    | Unique identifier of the instance                                   |
 | `docker_image_layers` | {"base_image": ..., "setup_layer": list[commands]}, can convert to Dockerfile |
-| `docker_image`   | Commited Image                               |
-| `setup_commands` | Records of shell commands used to set up the environment                                            |
-| `test_commands`  | Records of shell commands used to run the tests with verbose output                                                 |
-| `duration`       | Time taken to run the process (in minutes)         |
-| `completed`      | Boolean indicating whether the execution completed successfully                                  |
-| `exception`      | Error message or `null` if no exception occurred                                                 |
+| `docker_image`   | Committed Image                                                     |
+| `setup_commands` | Records of shell commands used to set up the environment            |
+| `test_commands`  | Records of shell commands used to run the tests with verbose output |
+| `duration`       | Time taken to run the process (in minutes)                          |
+| `cost`           | Accumulative LM API token count & cost of the setup stage           |
+| `completed`      | Boolean indicating whether the execution completed successfully     |
+| `exception`      | Error message or `null` if no exception occurred                    |
 
 Summary would be saved to `{workspace_root}/setup.jsonl`
 
@@ -125,13 +126,14 @@ The `setup_commands` and `test_commands` of the first step would be noisy, with
 | Field            | Description                                                                                      |
 |------------------|--------------------------------------------------------------------------------------------------|
 | `docker_image_layers` | {"base_image": ..., "setup_layer": list[commands], "organize_layer":  list[commands]}, can convert to Dockerfile |
-| `organize_duration`       | Time taken to run the process (in minutes)         |
-| `organize_completed`      | Boolean indicating whether the organization attempt completed successfully                                  |
-| `rebuild_commands`    | Minimal commands to rebuild the repo instance                                                                |
-| `test_commands`     | Clean test commands                            |
-| `parse`   | python script to parse the test output intp testcase-status mapping                               |
-| `test_status` | Parsed testcase-status mapping in JSON                                         |
-| `pertest_command` | Command to specify a testcase to run, might do not exists                                         |
+| `organize_duration`   | Time taken to run the process (in minutes)                    |
+| `cost`                | Accumulative LM API token count & cost of the setup stage and the organize stage, respectively |
+| `organize_completed`  | Boolean indicating whether the organization attempt completed successfully   |
+| `rebuild_commands`    | Minimal commands to rebuild the repo instance                 |
+| `test_commands`       | Clean test commands                                           |
+| `parse`               | Python script to parse the test output into a testcase-status mapping  |
+| `test_status`         | Parsed testcase-status mapping in JSON                        |
+| `pertest_command`     | Command to specify a testcase to run; may not exist           |
 
 
 Summary would be saved to `{workspace_root}/organize.jsonl`

diff --git a/launch/agent/locate.py b/launch/agent/locate.py
@@ -7,6 +7,7 @@
 
 from launch.agent.state import AgentState, auto_catch
 from launch.utilities.get_repo_structure import view_repo_structure
+from launch.utilities.llm import form_llm_cost_log, update_accumulative_cost
 
 prompt = """Given this repository structure:
 ------ BEGIN REPOSITORY STRUCTURE ------
@@ -57,8 +58,10 @@ def locate_related_file(state: AgentState) -> dict:
         AgentState: Updated state with documentation content and related files
     """
     llm = state["llm"]
+    cost = state["cost"]
     logger = state["logger"]
     repo_structure = state["repo_structure"]
+
     locate_prompt = HumanMessage(
         content=prompt.format(structure=repo_structure)
     )
@@ -69,6 +72,7 @@ def locate_related_file(state: AgentState) -> dict:
         )
 
     response = llm.invoke([locate_prompt])
+    update_accumulative_cost(cost["preparation"], response)
     potential_files = [
         line.split("<file>")[1].split("</file>")[0].strip()
         for line in response.content.split("\n")
@@ -81,7 +85,7 @@ def locate_related_file(state: AgentState) -> dict:
     ]
     potential_files = list(set(potential_files))
 
-    logger.info(f"Potential files: {potential_files}")
+    logger.info(f"Potential files: {potential_files}  {form_llm_cost_log(response)}")
     logger.info("Start determine relevance of these files...")
     related_files = []
 
@@ -103,12 +107,10 @@ def locate_related_file(state: AgentState) -> dict:
 {content}
 ------ END FILE {file} ------"""
         determine_input = HumanMessage(content=determine_prompt.format(file=file_info))
-        try:
-            response = llm.invoke([determine_input])
-        except Exception:
-            logger.error(f"Error determining file: {file}")
-            continue
-        logger.info(f"File: {file} - {response.content}")
+
+        response = llm.invoke([determine_input])
+        update_accumulative_cost(cost["preparation"], response)
+        logger.info(f"File: {file} - {response.content}  {form_llm_cost_log(response)}")
         if "<rel>Yes</rel>" in response.content:
             docs += f"File: {file}\n```\n"
             docs += content + "\n"
@@ -123,6 +125,7 @@ def locate_related_file(state: AgentState) -> dict:
         "docs": docs,
         # We do not require the full repo structure later
         "repo_structure": repo_structure,
+        "cost": cost,
     }
 
 

diff --git a/launch/agent/organize/parselog.py b/launch/agent/organize/parselog.py
@@ -11,6 +11,7 @@
 from launch.agent.prompt import ReAct_prompt
 from launch.agent.state import AgentState, auto_catch
 from launch.scripts.parser import run_parser
+from launch.utilities.llm import form_llm_cost_log, update_accumulative_cost
 
 system_msg: str = """You are a developer specializing in test output analysis and parsing. Your task is to examine the test output, evaluate the current parser, and generate an improved, fully robust parser.
 
@@ -223,6 +224,7 @@ def observation_for_parselog_action(
 
     session = state["session"]
     llm = state["llm"]
+    cost = state["cost"]
     logger = state["logger"]
 
     # Get data from previous testall stage
@@ -274,7 +276,6 @@ def observation_for_parselog_action(
 
     prefix_messages = len(messages)
     step = 0
-    answer = None
 
     # Store test_output in state for testing
     state["test_output"] = test_output
@@ -292,7 +293,9 @@ def observation_for_parselog_action(
             )
 
         response = llm.invoke(input_messages)
-        logger.info("\n" + response.pretty_repr())
+        update_accumulative_cost(cost["organize"], response)
+
+        logger.info(f"\n{response.pretty_repr()}\n\n{form_llm_cost_log(response)}\n")
         messages.append(response)
 
         action = parse_parselog_action(response.content)
@@ -324,4 +327,5 @@ def observation_for_parselog_action(
         "test_status": final_test_status,
         "success": bool(final_test_status and final_parser),
         "test_output": test_output,
+        "cost": cost,
     }
diff --git a/launch/agent/organize/rebuild.py b/launch/agent/organize/rebuild.py
@@ -6,30 +6,16 @@
 import time
 from typing import Any, Literal, ClassVar  
 
-from langchain_core.messages import HumanMessage, SystemMessage
+from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage
 from pydantic import BaseModel, Field
 
 from launch.agent.action_parser import ActionParser
 from launch.agent.prompt import ReAct_prompt
 from launch.agent.state import AgentState, auto_catch
 from launch.core.runtime import SetupRuntime
 from launch.utilities.language_handlers import get_language_handler
+from launch.utilities.llm import form_llm_cost_log, update_accumulative_cost
 
-
-# system_msg = """You are a developer. You have already setup all dependencies and build the repository in the current folder.
-# However, for the maintainance of the project, you need to organize the minimal commands to re-install ONLY modified packages and build the projects again after edits to the source code / package list.
-
-# - You are inside a docker container with source code already inside the container under the current directory called /testbed
-# - The dependencies of the repository have already been set up by you before.
-# - The full history commands that you used to try to set up the repo: {commands}
-
-# You can send commands in the container for several times to try to test the commands to re-build the repo and expolre the repo freely if you need more information.
-# You do not need to include the commands to run test cases because we will do it later.
-
-# The final objective is: 
-#     to "find the minimal commands to re-install ONLY modified packages AND re-build the project" again after package list / source code edits and "output your minimal re-install & re-build commands in one line".
-# You need to finish it in {steps} steps.
-# """
 system_msg = """You are a developer. You have already set up all dependencies and successfully built the repository in the current folder.
 Now, for project maintenance, you must organize the minimal commands required to re-install only modified packages and re-build the project after any edits to the source code or package list.
 
@@ -199,7 +185,7 @@ def reload_container(state: AgentState) -> dict:
 SETUP_CONVERSATION_WINDOW = 40
 
 
-def analyze_verification_with_llm(llm, submitted_commands: str, verification_output: str) -> bool:
+def analyze_verification_with_llm(llm, submitted_commands: str, verification_output: str) -> BaseMessage:
     """
     Use LLM to analyze verification results and determine if rebuild was successful.
 
@@ -235,15 +221,8 @@ def analyze_verification_with_llm(llm, submitted_commands: str, verification_out
 
 Your response:"""
 
-    try:
-        response = llm.invoke([HumanMessage(analysis_prompt)])
-        analysis = response.content.strip().upper()
-
-        # Return True if LLM says SUCCESS, False otherwise
-        return analysis == "SUCCESS"
-    except Exception as e:
-        # Fallback to return code check if LLM analysis fails
-        return analysis == "FAILURE"
+    response = llm.invoke([HumanMessage(analysis_prompt)])
+    return response
 
 @auto_catch
 def organize_setup(state: AgentState, max_steps: int) -> dict:
@@ -259,9 +238,9 @@ def organize_setup(state: AgentState, max_steps: int) -> dict:
     """
 
     llm = state["llm"]
+    cost = state["cost"]
     logger = state["logger"]
 
-    logger.info(f"setup state: {state.get("success" , "false")}, {state["trials"]}, {state["exception"]} ... ")
     hints = "\n\n"
     history_cmds = state["instance"].get("setup_cmds", [])
     history_cmds += state["instance"].get("test_cmds", [])
@@ -308,8 +287,9 @@ def organize_setup(state: AgentState, max_steps: int) -> dict:
             )
 
         response = llm.invoke(input_messages)
+        update_accumulative_cost(cost["organize"], response)
 
-        logger.info("\n" + response.pretty_repr())
+        logger.info(f"\n{response.pretty_repr()}\n\n{form_llm_cost_log(response)}\n")
         messages.append(response)
         action = parse_setup_action(response.content)
         if action and action.action == "command":
@@ -328,9 +308,12 @@ def organize_setup(state: AgentState, max_steps: int) -> dict:
             verification_output = verification_result.to_observation()
 
             # Use LLM to analyze verification results instead of just checking return code
-            verification_success = analyze_verification_with_llm(
+            response = analyze_verification_with_llm(
                 llm, submitted_commands, verification_output
             )
+            update_accumulative_cost(cost["organize"], response)
+            logger.info(form_llm_cost_log(response))
+            verification_success = "SUCCESS" in response.content.strip().upper()
 
             if verification_success:
                 # Verification passed according to LLM analysis
@@ -356,13 +339,14 @@ def organize_setup(state: AgentState, max_steps: int) -> dict:
         logger.info("\n" + message.pretty_repr())
         messages.append(message)
 
-    logger.info("-" * 10 + "End rebuild organization conversation" + "-" * 10)
+    logger.info("-" * 10 + "End rebuild conversation" + "-" * 10)
     return {
         "session": state["session"],
         "messages": messages,
         "commands": commands,
         "setup_messages": messages[prefix_messages:],
         "setup_commands": [answer] if answer else [],
-        "success": (answer is not None)
+        "success": (answer is not None),
+        "cost": cost,
     }
 
diff --git a/launch/agent/organize/save.py b/launch/agent/organize/save.py
@@ -99,6 +99,12 @@ def save_organize_result(state: AgentState) -> dict:
         "organize_layer": state["commands"]
     }
 
+    cost = history.get("cost", {})
+    if cost:
+        cost["organize"] = state["cost"]["organize"]
+    else:
+        cost = state["cost"]
+
     result = json.dumps(
             {
                 **history,
@@ -113,6 +119,7 @@ def save_organize_result(state: AgentState) -> dict:
                 "log_parser": state.get("parser", ""),
                 "unittest_generator": state.get("unittest_generator", ""),
                 "organize_duration": duration,
+                "cost": cost,
                 "organize_completed": state.get("success", False),
                 "exception": exception,
                 "repo_structure": state["repo_structure"],

diff --git a/launch/agent/organize/testall.py b/launch/agent/organize/testall.py
@@ -12,6 +12,7 @@
 from launch.agent.prompt import ReAct_prompt
 from launch.agent.state import AgentState, auto_catch
 from launch.utilities.language_handlers import get_language_handler
+from launch.utilities.llm import form_llm_cost_log, update_accumulative_cost
 
 from launch.scripts.parser import run_parser
 
@@ -412,10 +413,10 @@ def observation_for_verify_action(
 
     hints = "\n\n"
     llm = state["llm"]
+    cost = state["cost"]
     logger = state["logger"]
     setup_commands = state["setup_commands"]
 
-    logger.info(f"setup state: {state.get("success" , "false")}, {state["exception"]} ... ")
     hints = "\n\n"
     history_cmds = state["instance"].get("setup_cmds", [])
     history_cmds += state["instance"].get("test_cmds", [])
@@ -447,8 +448,7 @@ def observation_for_verify_action(
     prefix_messages = len(messages)
     commands = state["commands"]
     step = 0
-    answer = None
-    logger.info("-" * 10 + "Start test conversation" + "-" * 10)
+    logger.info("-" * 10 + "Start organize-test conversation" + "-" * 10)
     while step < max_steps:
         step += 1
         # uses a window to avoid exceed context
@@ -458,9 +458,11 @@ def observation_for_verify_action(
             input_messages = (
                 messages[:prefix_messages] + messages[-VERIFY_CONVERSATION_WINDOW:]
             )
+
         response = llm.invoke(input_messages)
+        update_accumulative_cost(cost["organize"], response)
 
-        logger.info("\n" + response.pretty_repr())
+        logger.info(f"\n{response.pretty_repr()}\n\n{form_llm_cost_log(response)}\n")
         messages.append(response)
         action = parse_verify_action(response.content)
         observation = observation_for_verify_action(state, action)
@@ -472,7 +474,7 @@ def observation_for_verify_action(
         logger.info("\n" + message.pretty_repr())
         messages.append(message)
 
-    logger.info("-" * 10 + "End verify conversation" + "-" * 10)
+    logger.info("-" * 10 + "End organize-test conversation" + "-" * 10)
     try:
         test_status = json.loads(test_status)
     except:
@@ -487,4 +489,5 @@ def observation_for_verify_action(
         "parser": parser,
         "test_status": test_status,
         "success": bool(test_command.strip() and parser.strip() and test_status),
+        "cost": cost,
     }