当前位置：首页 > news >正文

如何借助人工智能AI模型开发一个类似OpenAI Operator的智能体实现电脑自动化操作？

news 来源：原创 2025/8/25 0:01:32

这几天关于 Manus 的新闻铺天盖地，于是研究了一下 AI 智能体的实现思路，发现 OpenAI 的 Operator 智能体已经实现了很强的功能，但是每月 200 美金的价格高不可攀，而 Manus 的邀请码据说炒到了几万块！就想能不能求助人工智能，让它指导我自己写一个。我把问题抛给了豆包、DeepSeek、Claude，问题如下：

如何使用大语言模型API开发一个类似OpenAI Operator这样的智能体实现本地电脑自动化操作？

豆包、Deepseek虽然也给出了实现思路和程序样本，但是发现Claude的回复要专业得多，文档深度和程序结构都非常清晰，代码更容易落地执行，实在是太强悍太厉害了！所以分享出来。下面是Claude生成的回复内容和对应文档：

我可以帮你了解如何使用大语言模型API开发一个类似于OpenAI Operator的智能体，实现本地电脑自动化操作。
开发智能体自动化系统架构

要实现这样的自动化智能体，你需要构建几个关键组件：

- 与大语言模型的接口
- 本地操作系统交互层
- 工具集成框架
- 状态管理和上下文追踪
- 安全性控制

让我为你创建一个详细的开发指南：

LLM驱动的本地自动化智能体开发指南

系统架构概述

开发一个类似OpenAI Operator的本地自动化智能体需要几个核心组件协同工作：
希望这个指南能帮助你开始构建自己的本地自动化智能体！
用户 ↔ 界面层 ↔ 核心控制器 ↔ LLM API
↕
工具管理器
↕
本地系统接口 / 外部集成

1. 核心组件

1.1 LLM接口层

这一层负责与大语言模型API通信：

import anthropic  # or another LLM provider's SDK


class LLMInterface:
    """Stateful wrapper around the Anthropic Messages API.

    The running conversation is kept in ``self.history`` so that every call
    to :meth:`query` is sent together with all earlier turns.
    """

    def __init__(self, api_key, model="claude-3-7-sonnet-20250219", max_tokens=4096):
        """Create the API client.

        Args:
            api_key: Key handed to ``anthropic.Anthropic``.
            model: Model identifier sent with every request.
            max_tokens: Upper bound on generated tokens per reply. The
                Messages API requires this field on every request.
        """
        self.client = anthropic.Anthropic(api_key=api_key)
        self.model = model
        self.max_tokens = max_tokens
        self.history = []

    def query(self, prompt, system_prompt=None, tools=None):
        """Send ``prompt`` plus the stored history and return the raw response.

        Args:
            prompt: Content of the new user turn.
            system_prompt: Optional system prompt; omitted from the request
                when empty.
            tools: Optional list of tool definitions; when given, the model
                is allowed to decide by itself whether to call one.

        Returns:
            The response object returned by ``client.messages.create``.
        """
        messages = self.history + [{"role": "user", "content": prompt}]

        request = {
            "model": self.model,
            "max_tokens": self.max_tokens,
            "messages": messages,
        }
        # Only send optional fields that actually carry a value; sending
        # ``system=None`` is not the same as leaving the field out.
        if system_prompt:
            request["system"] = system_prompt
        if tools:
            request["tools"] = tools
            # tool_choice is an object in the Messages API, not a bare string.
            request["tool_choice"] = {"type": "auto"}

        response = self.client.messages.create(**request)

        # Record the turn only after the call succeeded, so a failed request
        # does not leave a dangling user message in the history.
        self.history.append({"role": "user", "content": prompt})
        self.history.append({"role": "assistant", "content": response.content})
        return response

1.2 工具管理器

这个组件管理可用于自动化的各种工具：

class ToolManager:
    """Registry of callable tools that can be exposed to an LLM."""

    def __init__(self):
        # Maps tool name -> {"name", "description", "function", "parameters_schema"}.
        self.tools = {}

    def register_tool(self, name, description, function, parameters_schema):
        """Register a tool, replacing any existing tool with the same name.

        Args:
            name: Unique tool name.
            description: Human-readable description shown to the model.
            function: Callable invoked with the tool parameters as keywords.
            parameters_schema: JSON-Schema dict describing the parameters.
        """
        self.tools[name] = {
            "name": name,
            "description": description,
            "function": function,
            "parameters_schema": parameters_schema,
        }

    def execute_tool(self, tool_name, parameters):
        """Run a registered tool and wrap its outcome.

        Args:
            tool_name: Name used at registration time.
            parameters: Mapping of keyword arguments; ``None`` is treated as
                "no arguments" so parameterless tools can be called.

        Returns:
            ``{"result": value}`` on success, ``{"error": message}`` when the
            tool is unknown or raises.
        """
        if tool_name not in self.tools:
            return {"error": f"Tool {tool_name} not found"}

        tool_function = self.tools[tool_name]["function"]
        try:
            return {"result": tool_function(**(parameters or {}))}
        except Exception as e:
            # Tools are arbitrary callables; report the failure to the caller
            # instead of aborting the agent loop.
            return {"error": str(e)}

    def get_tools_schema(self, provider="openai"):
        """Return the tool definitions in the shape a given LLM API expects.

        Args:
            provider: ``"openai"`` (default, the original output shape:
                ``{"type": "function", "function": {...}}``) or
                ``"anthropic"`` (``{"name", "description", "input_schema"}``).

        Raises:
            ValueError: If ``provider`` is not one of the supported values.
        """
        if provider == "openai":
            return [
                {
                    "type": "function",
                    "function": {
                        "name": tool["name"],
                        "description": tool["description"],
                        "parameters": tool["parameters_schema"],
                    },
                }
                for tool in self.tools.values()
            ]
        if provider == "anthropic":
            return [
                {
                    "name": tool["name"],
                    "description": tool["description"],
                    "input_schema": tool["parameters_schema"],
                }
                for tool in self.tools.values()
            ]
        raise ValueError(f"Unsupported provider: {provider!r}")

1.3 本地系统接口

这一层负责执行具体的本地系统操作：

import os
import subprocess
import pyautogui
import platform
import psutil


class SystemInterface:
    """Local operating-system actions exposed to the agent as tools."""

    def __init__(self):
        self.os_type = platform.system()

    def execute_command(self, command, timeout=None):
        """Run a shell command and capture its output.

        SECURITY: the command string is passed to the system shell
        (``shell=True``). Anything that reaches this method — including text
        produced by an LLM — runs with the privileges of this process, so
        callers should vet commands before invoking it.

        Args:
            command: Shell command line.
            timeout: Optional limit in seconds; ``None`` (default) waits
                indefinitely, as before.

        Returns:
            ``{"stdout", "stderr", "return_code"}`` on completion, or
            ``{"error": message}`` if the command could not be run or timed out.
        """
        try:
            result = subprocess.run(
                command,
                shell=True,
                capture_output=True,
                text=True,
                timeout=timeout,
            )
        except Exception as e:
            return {"error": str(e)}
        return {
            "stdout": result.stdout,
            "stderr": result.stderr,
            "return_code": result.returncode,
        }

    def list_files(self, path="."):
        """Return ``{"files": [...]}`` for ``path`` or ``{"error": message}``."""
        try:
            return {"files": os.listdir(path)}
        except Exception as e:
            return {"error": str(e)}

    def get_system_info(self):
        """Return OS name plus CPU, memory and root-disk usage percentages."""
        # Root of the current drive: "/" on POSIX, e.g. "C:\\" on Windows.
        root = os.path.abspath(os.sep)
        return {
            "os": self.os_type,
            # A non-zero interval is needed: the first non-blocking call has
            # no previous sample to compare with and reports 0.0.
            "cpu_percent": psutil.cpu_percent(interval=0.1),
            "memory_percent": psutil.virtual_memory().percent,
            "disk_usage": psutil.disk_usage(root).percent,
        }

    def mouse_click(self, x, y):
        """Click the mouse at screen coordinates ``(x, y)``."""
        pyautogui.click(x, y)
        return {"status": "success"}

    def type_text(self, text):
        """Type ``text`` through simulated key presses.

        NOTE(review): ``pyautogui.write`` presumably only handles characters
        that map to keyboard keys; non-ASCII text (e.g. Chinese) may need a
        clipboard-based approach — confirm before relying on it.
        """
        pyautogui.write(text)
        return {"status": "success"}

    def take_screenshot(self):
        """Save a screenshot to ``screenshot.png`` (overwriting) and return its path."""
        screenshot = pyautogui.screenshot()
        path = "screenshot.png"
        screenshot.save(path)
        return {"path": path}

2. 智能体核心控制器

核心控制器协调所有组件，处理用户输入，调用LLM进行决策，并执行相应的工具：

import json


class AgentController:
    """Coordinate the LLM, the tool registry and the local system interface."""

    def __init__(self, api_key, system_prompt=None):
        """Build the components and register the default tools.

        Args:
            api_key: API key forwarded to ``LLMInterface``.
            system_prompt: Optional override for the default system prompt.
        """
        self.llm = LLMInterface(api_key)
        self.tool_manager = ToolManager()
        self.system = SystemInterface()
        self.system_prompt = system_prompt or self._default_system_prompt()
        self._register_default_tools()

    def _default_system_prompt(self):
        """Return the built-in system prompt."""
        return (
            "你是一个能够操作本地计算机的智能助手。用户会告诉你他们想完成的任务，\n"
            "你需要思考如何使用可用的工具来实现这些任务。在操作前，请先分析任务并制定计划，\n"
            "然后一步步执行。如果遇到问题，尝试诊断并解决。优先考虑用户数据的安全性。"
        )

    def _register_default_tools(self):
        """Register the shell, file-listing, screenshot, mouse and keyboard tools."""
        # Shell command tool
        self.tool_manager.register_tool(
            name="execute_command",
            description="在系统Shell中执行命令",
            function=self.system.execute_command,
            parameters_schema={
                "type": "object",
                "properties": {
                    "command": {"type": "string", "description": "要执行的命令"}
                },
                "required": ["command"],
            },
        )
        # File listing tool
        self.tool_manager.register_tool(
            name="list_files",
            description="列出目录中的文件",
            function=self.system.list_files,
            parameters_schema={
                "type": "object",
                "properties": {
                    "path": {"type": "string", "description": "要列出文件的目录路径"}
                },
            },
        )
        # Screenshot tool
        self.tool_manager.register_tool(
            name="take_screenshot",
            description="截取当前屏幕",
            function=self.system.take_screenshot,
            parameters_schema={"type": "object", "properties": {}},
        )
        # Mouse click tool
        self.tool_manager.register_tool(
            name="mouse_click",
            description="在指定位置点击鼠标",
            function=self.system.mouse_click,
            parameters_schema={
                "type": "object",
                "properties": {
                    "x": {"type": "integer", "description": "屏幕X坐标"},
                    "y": {"type": "integer", "description": "屏幕Y坐标"},
                },
                "required": ["x", "y"],
            },
        )
        # Keyboard input tool
        self.tool_manager.register_tool(
            name="type_text",
            description="模拟键盘输入文本",
            function=self.system.type_text,
            parameters_schema={
                "type": "object",
                "properties": {
                    "text": {"type": "string", "description": "要输入的文本"}
                },
                "required": ["text"],
            },
        )

    @staticmethod
    def _to_messages_api_tools(schema):
        """Flatten ``{"type": "function", "function": {...}}`` entries.

        The Anthropic Messages API expects ``name`` / ``description`` /
        ``input_schema`` at the top level of each tool. Entries that are not
        in the wrapped shape are passed through untouched.
        """
        converted = []
        for entry in schema:
            wrapped = entry.get("function") if isinstance(entry, dict) else None
            if wrapped is None:
                converted.append(entry)
                continue
            converted.append(
                {
                    "name": wrapped["name"],
                    "description": wrapped["description"],
                    "input_schema": wrapped["parameters"],
                }
            )
        return converted

    @staticmethod
    def _extract_tool_calls(response):
        """Collect requested tool calls from either supported response shape.

        Returns:
            ``(calls, block_style)`` where ``calls`` is a list of
            ``(call_id, tool_name, parameters)`` tuples and ``block_style`` is
            True when they came from ``tool_use`` content blocks.
        """
        # Shape 1: a ``tool_calls`` attribute with JSON-encoded arguments.
        legacy_calls = getattr(response, "tool_calls", None)
        if legacy_calls:
            return (
                [
                    (
                        getattr(call, "id", None),
                        call.function.name,
                        json.loads(call.function.arguments),
                    )
                    for call in legacy_calls
                ],
                False,
            )

        # Shape 2: content blocks of type "tool_use" carrying a parsed input.
        content = getattr(response, "content", None)
        calls = []
        if isinstance(content, (list, tuple)):
            for block in content:
                if getattr(block, "type", None) == "tool_use":
                    calls.append((block.id, block.name, dict(block.input or {})))
        return calls, True

    def process_user_request(self, request):
        """Send a request to the LLM, run any tools it asks for, and summarise.

        Args:
            request: The user's natural-language request.

        Returns:
            A dict with ``thinking`` (content of the first reply), ``actions``
            (one entry per executed tool with its parameters and result) and
            ``summary`` (content of the follow-up reply, or of the first reply
            when no tool was requested).
        """
        tools_schema = self._to_messages_api_tools(
            self.tool_manager.get_tools_schema()
        )

        response = self.llm.query(
            prompt=request,
            system_prompt=self.system_prompt,
            tools=tools_schema,
        )

        tool_calls, block_style = self._extract_tool_calls(response)
        if not tool_calls:
            # No tool requested: the reply itself is the answer.
            return {
                "thinking": response.content,
                "actions": [],
                "summary": response.content,
            }

        results = []
        tool_result_blocks = []
        for call_id, tool_name, parameters in tool_calls:
            result = self.tool_manager.execute_tool(tool_name, parameters)
            results.append(
                {"tool": tool_name, "parameters": parameters, "result": result}
            )
            tool_result_blocks.append(
                {
                    "type": "tool_result",
                    "tool_use_id": call_id,
                    "content": json.dumps(result, ensure_ascii=False),
                }
            )

        summary_prompt = f"我执行了你的指令，以下是结果:\n{json.dumps(results, ensure_ascii=False, indent=2)}\n请分析这些结果并给出下一步建议。"

        if block_style:
            # Every tool_use block must be answered by a tool_result block at
            # the start of the next user message; free text follows them.
            follow_up = tool_result_blocks + [{"type": "text", "text": summary_prompt}]
        else:
            follow_up = summary_prompt

        # NOTE(review): the follow-up reply may itself request more tools;
        # those are not executed here (single tool round, as originally).
        summary = self.llm.query(
            follow_up,
            system_prompt=self.system_prompt,
            tools=tools_schema,
        ).content

        return {
            "thinking": response.content,
            "actions": results,
            "summary": summary,
        }

3. 用户界面层

可以选择不同的界面实现方式：

3.1 命令行界面

class CommandLineInterface:
    """Interactive read-eval-print loop around an agent controller."""

    def __init__(self, api_key):
        """Create the interface.

        Args:
            api_key: Either an API key string, from which a new
                ``AgentController`` is built, or an already constructed
                controller object exposing ``process_user_request``.
        """
        if isinstance(api_key, str):
            self.agent = AgentController(api_key)
        else:
            # A ready-made controller was supplied; reuse it as-is.
            self.agent = api_key

    def run(self):
        """Prompt for requests until the user types ``exit`` or closes stdin."""
        print("🤖 本地自动化智能体已启动，输入'exit'退出")

        while True:
            try:
                user_input = input("\n✍️ 输入你的请求: ").strip()
            except (EOFError, KeyboardInterrupt):
                # Ctrl-D / Ctrl-C / closed pipe: leave the loop cleanly.
                print("\n👋 再见!")
                break

            if not user_input:
                # Nothing to send; ask again instead of querying the LLM.
                continue
            if user_input.lower() == 'exit':
                print("👋 再见!")
                break

            print("\n🧠 智能体思考中...")
            result = self.agent.process_user_request(user_input)

            print("\n💭 思考过程:")
            print(result["thinking"])

            if result["actions"]:
                print("\n🛠️ 执行的操作:")
                for i, action in enumerate(result["actions"], 1):
                    print(f"  {i}. {action['tool']}({action['parameters']})")
                    print(f"     结果: {action['result']}")

            print("\n📝 总结:")
            print(result["summary"])


# Usage example
if __name__ == "__main__":
    api_key = "your_api_key_here"
    cli = CommandLineInterface(api_key)
    cli.run()

3.2 Web界面

使用Flask或Streamlit等框架可以创建简单的Web界面。

4. 高级功能扩展

4.1 视觉理解

通过集成计算机视觉模型，智能体可以理解屏幕内容：

from PIL import Image
import base64
import io
import requests


class VisionHelper:
    """Capture the screen and ask a vision-capable model to describe it."""

    def __init__(self, api_key):
        self.api_key = api_key

    def analyze_screenshot(self):
        """Take a screenshot and return the model's description of it."""
        # Encode the capture as base64 PNG, the form the API call below sends.
        png_buffer = io.BytesIO()
        capture = pyautogui.screenshot()
        capture.save(png_buffer, format="PNG")
        encoded_png = base64.b64encode(png_buffer.getvalue()).decode()

        return self._call_vision_api(encoded_png)

    def _call_vision_api(self, image_base64):
        """Send a base64 PNG with a fixed instruction; return the reply content."""
        image_block = {
            "type": "image",
            "source": {
                "type": "base64",
                "media_type": "image/png",
                "data": image_base64,
            },
        }
        instruction_block = {
            "type": "text",
            "text": "描述这个屏幕上的内容，特别关注界面元素、按钮、文本和其他可交互的组件。",
        }

        # Anthropic's vision-capable Messages API is used here as the example.
        client = anthropic.Anthropic(api_key=self.api_key)
        reply = client.messages.create(
            model="claude-3-7-sonnet-20250219",
            max_tokens=1000,
            messages=[{"role": "user", "content": [image_block, instruction_block]}],
        )
        return reply.content

4.2 工作流自动化

创建和保存常用工作流以便重复使用：

import json


class WorkflowManager:
    """Persist named workflows (ordered lists of requests) in a JSON file."""

    def __init__(self, storage_path="workflows.json"):
        """Load existing workflows from ``storage_path`` if the file exists."""
        self.storage_path = storage_path
        self.workflows = self._load_workflows()

    def _load_workflows(self):
        """Read the storage file; return ``{}`` when it is missing or unusable."""
        try:
            # Explicit UTF-8 so the file reads the same on every platform.
            with open(self.storage_path, "r", encoding="utf-8") as f:
                data = json.load(f)
        except (FileNotFoundError, json.JSONDecodeError):
            return {}
        # Every other method treats the store as a name -> workflow mapping.
        return data if isinstance(data, dict) else {}

    def _save_workflows(self):
        """Write all workflows back to the storage file."""
        with open(self.storage_path, "w", encoding="utf-8") as f:
            # Keep non-ASCII names readable in the file instead of \\u escapes.
            json.dump(self.workflows, f, indent=2, ensure_ascii=False)

    def create_workflow(self, name, description, steps):
        """Create or overwrite the workflow ``name`` and persist it.

        Args:
            name: Workflow identifier.
            description: Short description of the workflow.
            steps: Ordered list of requests to feed to the agent.
        """
        self.workflows[name] = {"description": description, "steps": steps}
        self._save_workflows()

    def get_workflow(self, name):
        """Return the stored workflow dict, or ``None`` if unknown."""
        return self.workflows.get(name)

    def execute_workflow(self, name, agent_controller):
        """Run every step of a workflow through ``agent_controller``.

        Returns:
            A list with one ``process_user_request`` result per step, or
            ``{"error": message}`` when the workflow does not exist.
        """
        workflow = self.get_workflow(name)
        if not workflow:
            return {"error": f"Workflow '{name}' not found"}
        return [
            agent_controller.process_user_request(step)
            for step in workflow["steps"]
        ]

    def list_workflows(self):
        """Return a mapping of workflow name to its description."""
        return {
            name: workflow["description"]
            for name, workflow in self.workflows.items()
        }

    def delete_workflow(self, name):
        """Delete ``name``; return True if it existed, False otherwise."""
        if name not in self.workflows:
            return False
        del self.workflows[name]
        self._save_workflows()
        return True

4.3 任务规划与分解

智能体需要能够将复杂任务分解为更小的步骤，以便更好地执行：

import json
import re


class TaskPlanner:
    """Ask an LLM to break a task into tool calls, then execute the plan."""

    # First fenced block in a reply, with or without a "json" language tag.
    _FENCED_JSON = re.compile(r"```(?:json)?\s*(.*?)\s*```", re.DOTALL)

    # Prompt templates for str.format(); literal braces are doubled.
    _PLAN_PROMPT = (
        '\n我需要完成以下任务: "{task}"\n\n'
        "可用的工具有:\n"
        "{tools}\n\n"
        "请帮我将这个任务分解为逻辑步骤，每一步都应该可以通过使用上述工具来完成。\n"
        "对于每一步，请指定:\n"
        "1. 步骤描述\n"
        "2. 要使用的工具\n"
        "3. 该工具的参数\n"
        "4. 成功标准\n\n"
        "以JSON格式返回计划:\n"
        "```json\n"
        "{{\n"
        '  "task": "任务描述",\n'
        '  "steps": [\n'
        "    {{\n"
        '      "description": "步骤1描述",\n'
        '      "tool": "工具名称",\n'
        '      "parameters": {{参数JSON对象}},\n'
        '      "success_criteria": "成功标准描述"\n'
        "    }},\n"
        "    // 更多步骤...\n"
        "  ]\n"
        "}}\n"
        "```\n"
    )
    _SUCCESS_PROMPT = (
        '\n我执行了步骤: "{description}"\n'
        "结果是: {result}\n\n"
        '成功标准是: "{criteria}"\n\n'
        '这个步骤是否成功完成了? 请回答 "成功" 或 "失败", 并简要解释原因。\n'
    )
    _FIX_PROMPT = (
        '\n步骤 "{description}" 失败了。\n'
        "结果是: {result}\n\n"
        "请提供一种修复或替代方案，包括:\n\n"
        "1. 修改后的工具选择\n"
        "2. 修改后的参数\n"
        "3. 为什么这个修改会解决问题\n\n"
        "以JSON格式返回修复计划:\n\n"
        "```json\n"
        "{{\n"
        '  "tool": "工具名称",\n'
        '  "parameters": {{参数JSON对象}},\n'
        '  "explanation": "解释"\n'
        "}}\n"
        "```\n"
    )

    def __init__(self, llm_interface):
        """Store the LLM interface; it must expose ``query(prompt)``."""
        self.llm = llm_interface

    @staticmethod
    def _response_text(response):
        """Return the reply as plain text.

        ``response.content`` may be a string or a sequence of content blocks
        carrying a ``text`` attribute/key; blocks without text are skipped.
        """
        content = getattr(response, "content", response)
        if isinstance(content, str):
            return content
        if isinstance(content, (list, tuple)):
            pieces = []
            for block in content:
                text = block.get("text") if isinstance(block, dict) else getattr(block, "text", None)
                if text:
                    pieces.append(text)
            return "".join(pieces)
        return str(content)

    @classmethod
    def _extract_json(cls, text):
        """Parse the fenced JSON block in ``text``, or the whole text if none."""
        match = cls._FENCED_JSON.search(text)
        return json.loads(match.group(1) if match else text)

    def plan_task(self, task_description, available_tools):
        """Break a complex task into a sequence of tool steps.

        Args:
            task_description: What the user wants done.
            available_tools: JSON-serialisable description of usable tools.

        Returns:
            The parsed plan dict, or ``{"error": ..., "raw_response": ...}``
            when the reply contains no valid JSON.
        """
        prompt = self._PLAN_PROMPT.format(
            task=task_description,
            tools=json.dumps(available_tools, indent=2),
        )
        reply_text = self._response_text(self.llm.query(prompt))
        try:
            return self._extract_json(reply_text)
        except json.JSONDecodeError:
            return {"error": "无法解析计划", "raw_response": reply_text}

    def execute_plan(self, plan, agent_controller):
        """Execute each step of ``plan`` and let the LLM judge the outcome.

        For a step judged as failed, the LLM is asked once for a replacement
        tool call, which is executed and recorded under ``fix_attempt``.

        Args:
            plan: Dict with a ``steps`` list as produced by :meth:`plan_task`.
            agent_controller: Object whose ``tool_manager.execute_tool`` runs
                the tools.

        Returns:
            A list with one result dict per step.

        Raises:
            KeyError: If ``plan`` has no ``steps`` or a step lacks ``tool`` or
                ``description``.
        """
        steps = plan["steps"]
        results = []

        for i, step in enumerate(steps):
            print(f"执行步骤 {i+1}/{len(steps)}: {step['description']}")

            result = agent_controller.tool_manager.execute_tool(
                step["tool"], step.get("parameters") or {}
            )
            step_result = {
                "step": i + 1,
                "description": step["description"],
                "result": result,
            }
            results.append(step_result)

            # Ask the LLM whether the result meets the success criteria.
            success_prompt = self._SUCCESS_PROMPT.format(
                description=step["description"],
                result=json.dumps(result, ensure_ascii=False),
                criteria=step.get("success_criteria", ""),
            )
            verdict = self._response_text(self.llm.query(success_prompt))
            step_result["success_evaluation"] = verdict

            if "失败" not in verdict:
                continue

            # The step failed: request a single alternative tool call.
            fix_prompt = self._FIX_PROMPT.format(
                description=step["description"],
                result=json.dumps(result, ensure_ascii=False),
            )
            fix_text = self._response_text(self.llm.query(fix_prompt))
            try:
                fix_plan = self._extract_json(fix_text)
                print(f"尝试修复: {fix_plan.get('explanation', '')}")
                fix_result = agent_controller.tool_manager.execute_tool(
                    fix_plan["tool"], fix_plan.get("parameters") or {}
                )
                step_result["fix_attempt"] = {"plan": fix_plan, "result": fix_result}
            except Exception as e:
                # Best effort: record why the fix could not be applied.
                step_result["fix_attempt"] = {
                    "error": str(e),
                    "raw_response": fix_text,
                }

        return results


4.4 安全管理

为防止危险操作，实现安全检查机制：
import json
import re


class SafetyManager:
    """Two-stage safety check for shell commands: keyword list, then LLM."""

    # First fenced block in a reply, with or without a "json" language tag.
    _FENCED_JSON = re.compile(r"```(?:json)?\s*(.*?)\s*```", re.DOTALL)

    # Prompt template for str.format(); literal braces are doubled.
    _ANALYSIS_PROMPT = (
        "\n请分析以下系统命令的安全性:\n"
        '"{command}"\n\n'
        "请考虑:\n"
        "1. 这个命令是否会删除或修改重要文件?\n"
        "2. 这个命令是否会改变系统配置?\n"
        "3. 这个命令是否会影响系统运行或网络连接?\n"
        "4. 这个命令是否会泄露敏感信息?\n\n"
        "以JSON格式回答:\n"
        "```json\n"
        "{{\n"
        '  "safe": true或false,\n'
        '  "reason": "详细解释为什么安全或不安全",\n'
        '  "potential_impact": "如果执行可能产生的影响",\n'
        '  "needs_confirmation": true或false\n'
        "}}\n"
        "```\n"
    )

    def __init__(self, llm_interface):
        """Store the LLM interface and the list of sensitive command words."""
        self.llm = llm_interface
        # "rmdir" is listed explicitly because matching is by whole word.
        self.sensitive_operations = [
            "rm", "del", "format", "shutdown", "reboot",
            "drop", "truncate", "sudo", "rmdir",
        ]

    @staticmethod
    def _response_text(response):
        """Return the reply as plain text (string or list of text blocks)."""
        content = getattr(response, "content", response)
        if isinstance(content, str):
            return content
        if isinstance(content, (list, tuple)):
            pieces = []
            for block in content:
                text = block.get("text") if isinstance(block, dict) else getattr(block, "text", None)
                if text:
                    pieces.append(text)
            return "".join(pieces)
        return str(content)

    def analyze_command(self, command):
        """Assess whether ``command`` is safe to run.

        Args:
            command: Shell command line to inspect.

        Returns:
            A dict with at least ``safe``, ``reason`` and
            ``needs_confirmation``. Commands containing a sensitive word are
            rejected without consulting the LLM; if the LLM reply cannot be
            parsed the command is reported as unsafe and needing confirmation.
        """
        # Stage 1: whole-word match, so "rm" does not fire on "firmware" and
        # "del" does not fire on "model". Path separators and dots split
        # words, so "/bin/rm" and "shutdown.exe" still match.
        words = set(re.findall(r"[a-z0-9_]+", command.lower()))
        for op in self.sensitive_operations:
            if op in words:
                return {
                    "safe": False,
                    "reason": f"命令包含敏感操作: {op}",
                    "needs_confirmation": True,
                }

        # Stage 2: ask the LLM for a deeper assessment.
        prompt = self._ANALYSIS_PROMPT.format(command=command)
        reply_text = self._response_text(self.llm.query(prompt))
        try:
            match = self._FENCED_JSON.search(reply_text)
            return json.loads(match.group(1) if match else reply_text)
        except json.JSONDecodeError:
            # Fail closed: an unreadable verdict requires human confirmation.
            return {
                "safe": False,
                "reason": "无法确定安全性，需要人工确认",
                "needs_confirmation": True,
                "raw_response": reply_text,
            }


5. 实际应用案例

5.1 自动化文件整理
def setup_file_organization_agent(agent_controller):
    """Configure ``agent_controller`` as a file-organisation agent.

    Registers the ``analyze_directory``, ``create_directory`` and
    ``move_file`` tools and replaces the controller's system prompt.

    Args:
        agent_controller: Controller exposing ``tool_manager.register_tool``
            and a writable ``system_prompt`` attribute.

    Returns:
        The same controller, for chaining.
    """
    # Local imports keep this snippet self-contained.
    import os
    import shutil
    from collections import Counter

    def analyze_directory(path):
        """Count directory entries per file extension (entries without one are skipped)."""
        # One listing pass instead of re-listing the directory per extension.
        extensions = (os.path.splitext(entry)[1] for entry in os.listdir(path))
        return {"file_counts": dict(Counter(ext for ext in extensions if ext))}

    def create_directory(path):
        """Create ``path`` (and parents); succeed if it already exists."""
        os.makedirs(path, exist_ok=True)
        return {"success": True}

    def move_file(source, destination):
        """Move ``source`` to ``destination``."""
        return {
            "success": bool(shutil.move(source, destination)),
            "source": source,
            "destination": destination,
        }

    # Register the file-related tools.
    agent_controller.tool_manager.register_tool(
        name="analyze_directory",
        description="分析目录中的文件类型和数量",
        function=analyze_directory,
        parameters_schema={
            "type": "object",
            "properties": {
                "path": {"type": "string", "description": "要分析的目录路径"}
            },
            "required": ["path"],
        },
    )
    agent_controller.tool_manager.register_tool(
        name="create_directory",
        description="创建新目录",
        function=create_directory,
        parameters_schema={
            "type": "object",
            "properties": {
                "path": {"type": "string", "description": "要创建的目录路径"}
            },
            "required": ["path"],
        },
    )
    agent_controller.tool_manager.register_tool(
        name="move_file",
        description="移动文件到指定目录",
        function=move_file,
        parameters_schema={
            "type": "object",
            "properties": {
                "source": {"type": "string", "description": "源文件路径"},
                "destination": {"type": "string", "description": "目标路径"},
            },
            "required": ["source", "destination"],
        },
    )

    # Replace the system prompt with the file-organisation instructions.
    agent_controller.system_prompt = (
        "你是一个专门负责文件整理的助手。你可以帮助用户:\n"
        "1. 分析目录中的文件类型和数量\n"
        "2. 按照文件类型、日期或其他标准创建组织结构\n"
        "3. 移动文件到适当的位置\n"
        "4. 重命名文件以保持一致性\n"
        "\n"
        "在执行任何文件移动或重命名操作前，请先分析目录结构并制定组织计划。\n"
        "确保不会删除任何文件，只进行移动操作。\n"
    )
    return agent_controller


# Usage example
if __name__ == "__main__":
    api_key = "your_api_key_here"
    agent = AgentController(api_key)
    file_agent = setup_file_organization_agent(agent)

    # The agent can now handle file-organisation requests.
    result = file_agent.process_user_request("请帮我整理下载文件夹，按文件类型分类")

5.2 自动化应用程序操作

def setup_app_automation_agent(agent_controller):
    """Configure ``agent_controller`` as an application-automation agent.

    Registers the ``find_ui_element`` and ``start_application`` tools and
    replaces the controller's system prompt.

    Args:
        agent_controller: Controller exposing ``tool_manager.register_tool``
            and a writable ``system_prompt`` attribute.

    Returns:
        The same controller, for chaining.
    """

    def find_ui_element(element_image_path=None, element_text=None):
        """Locate a UI element on screen and return its bounding boxes.

        Each location is ``[left, top, width, height]`` or ``None`` when a
        lookup returned nothing. With no arguments the list is empty.
        """
        if element_image_path:
            matches = pyautogui.locateAllOnScreen(element_image_path)
        elif element_text:
            # NOTE(review): each comma-separated entry is handed to
            # locateOnScreen, which presumably expects an image file rather
            # than literal text — confirm the intended lookup.
            matches = [
                pyautogui.locateOnScreen(text) for text in element_text.split(',')
            ]
        else:
            matches = []
        # Materialise the matches as plain lists of ints so the result can be
        # serialised and sent back to the LLM.
        return {
            "locations": [
                None if box is None else [int(value) for value in box]
                for box in matches
            ]
        }

    def start_application(app_name):
        """Launch ``app_name`` as a child process."""
        return {
            "success": subprocess.Popen(app_name) is not None,
            "app_name": app_name,
        }

    # Register the application-specific tools.
    agent_controller.tool_manager.register_tool(
        name="find_ui_element",
        description="在屏幕上查找UI元素",
        function=find_ui_element,
        parameters_schema={
            "type": "object",
            "properties": {
                "element_image_path": {"type": "string", "description": "UI元素图像的路径"},
                "element_text": {"type": "string", "description": "要查找的UI元素文本"},
            },
        },
    )
    agent_controller.tool_manager.register_tool(
        name="start_application",
        description="启动应用程序",
        function=start_application,
        parameters_schema={
            "type": "object",
            "properties": {
                "app_name": {"type": "string", "description": "要启动的应用程序名称或路径"}
            },
            "required": ["app_name"],
        },
    )

    # Replace the system prompt with the app-automation instructions.
    agent_controller.system_prompt = (
        "你是一个专门负责应用程序自动化的助手。你可以帮助用户:\n"
        "1. 启动应用程序\n"
        "2. 在应用程序界面上找到并点击特定元素\n"
        "3. 输入文本\n"
        "4. 执行特定的操作序列\n"
        "5. 在多个应用程序之间协调工作\n"
        "\n"
        "在执行操作前，请先确认应用程序是否已经启动，并分析当前屏幕状态。\n"
        "对于不确定的操作，先尝试找到UI元素再执行点击等操作。\n"
    )
    return agent_controller


# Usage example
if __name__ == "__main__":
    api_key = "your_api_key_here"
    agent = AgentController(api_key)
    app_agent = setup_app_automation_agent(agent)

    # The agent can now handle application-automation requests.
    result = app_agent.process_user_request("打开浏览器，访问google.com并搜索'Python自动化'")

6. 安全与隐私注意事项

在开发和使用本地自动化智能体时，需要考虑以下安全和隐私问题：

6.1 权限控制

最小权限原则：只授予智能体完成任务所需的最小权限
敏感操作确认：对删除文件、修改系统设置等操作实现二次确认机制
操作日志：记录所有执行的命令和操作，便于审计和回溯

6.2 数据隐私

本地处理：尽量在本地处理敏感数据，减少数据传输
数据筛选：向LLM发送数据前进行筛选，移除不必要的敏感信息
加密传输：确保与LLM API的通信使用加密连接

6.3 安全审查

定期代码审查：检查是否存在安全漏洞或逻辑错误
定期更新依赖：确保使用的库和框架没有已知安全漏洞
模拟攻击测试：测试智能体是否能够抵抗恶意指令

7. 完整项目结构示例

local_automation_agent/
├── README.md
├── requirements.txt
├── src/
│   ├── __init__.py
│   ├── agent/
│   │   ├── __init__.py
│   │   ├── controller.py      # 核心控制器
│   │   ├── llm_interface.py   # LLM接口
│   │   ├── tool_manager.py    # 工具管理器
│   │   └── system_interface.py # 系统接口
│   ├── tools/
│   │   ├── __init__.py
│   │   ├── file_tools.py      # 文件操作工具
│   │   ├── system_tools.py    # 系统操作工具
│   │   ├── browser_tools.py   # 浏览器操作工具
│   │   └── ui_tools.py        # UI操作工具
│   ├── safety/
│   │   ├── __init__.py
│   │   ├── command_analyzer.py # 命令安全分析
│   │   └── permission_manager.py # 权限管理
│   ├── ui/
│   │   ├── __init__.py
│   │   ├── cli.py             # 命令行界面
│   │   └── web_app.py         # Web界面
│   └── utils/
│       ├── __init__.py
│       ├── workflow_manager.py # 工作流管理
│       └── task_planner.py    # 任务规划器
├── config/
│   ├── default_config.json    # 默认配置
│   └── tools_config.json      # 工具配置
├── examples/
│   ├── file_organization.py   # 文件整理示例
│   └── app_automation.py      # 应用程序自动化示例
└── main.py                    # 主入口

8. 安装与依赖

8.1 所需Python库

# requirements.txt
anthropic>=0.7.0  # or another LLM API client
pyautogui>=0.9.53
pillow>=9.0.0
psutil>=5.9.0
flask>=2.0.0  # if the web interface is used
requests>=2.27.0
pynput>=1.7.6
opencv-python>=4.5.0  # for advanced UI element detection
python-dotenv>=0.19.0

8.2 安装步骤

# Create a virtual environment
python -m venv venv
source venv/bin/activate  # on Windows use: venv\Scripts\activate

# Install dependencies
pip install -r requirements.txt

9. 未来发展方向

9.1 增强交互能力

语音接口：添加语音识别和合成，实现语音指令
自然语言理解改进：优化智能体对用户意图的理解能力
上下文记忆：改进对过去操作和系统状态的记忆能力

9.2 增强自主性

任务学习：学习用户经常执行的操作序列
错误恢复：更智能地从操作错误中恢复
主动建议：基于系统状态主动提供优化建议

9.3 多平台支持

跨平台兼容：同时支持Windows、macOS和Linux
移动端集成：扩展到Android和iOS平台
云端协作：实现本地智能体与云服务的无缝协作

10. 实际项目启动示例

# main.py
import os
import json
from dotenv import load_dotenv
from src.agent.controller import AgentController
from src.ui.cli import CommandLineInterface
from src.ui.web_app import WebInterface


def main():
    """Entry point: load settings, build the chosen interface and run it.

    Reads ``LLM_API_KEY`` from the environment (after loading ``.env``) and
    ``config/default_config.json`` for the interface choice; falls back to
    the command-line interface when the config is missing or invalid.
    """
    # Load environment variables from a .env file, if present.
    load_dotenv()

    api_key = os.getenv("LLM_API_KEY")
    if not api_key:
        print("错误: 未设置LLM_API_KEY环境变量")
        return

    try:
        with open("config/default_config.json", "r", encoding="utf-8") as f:
            config = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        print("警告: 未找到配置文件或格式错误，使用默认配置")
        config = {"interface": "cli", "debug": False}

    if config.get("interface") == "web":
        agent = AgentController(api_key)
        ui = WebInterface(agent, debug=config.get("debug", False))
    else:
        # CommandLineInterface takes the API key and builds its own
        # controller; handing it a controller object would wrap that object
        # as if it were the key.
        ui = CommandLineInterface(api_key)

    ui.run()


if __name__ == "__main__":
    main()

结论

通过结合大语言模型和本地自动化工具，你可以创建一个强大的智能体系统，帮助用户完成各种计算机任务。关键是构建好核心组件之间的通信机制，以及提供足够的安全保障。随着大语言模型能力的不断提升，这类智能体的应用场景将越来越广泛。