多模态 AI 是具备全感官交互能力的智能系统,集感知、理解、内容生成于一体,全面支持文本、图片、音频、视频等多类型信息的输入输出。

实际使用时,我们可以以图片模态为例熟悉具体用法,需要说明的是,图片模态的调用逻辑和写法,与其他模态完全一致,掌握一种就能举一反三,无需重复学习不同模态的调用方式。

LangChain 的 ChatPromptTemplate 提示词模板在参数注入上也具备很高的灵活性:图片的 URL 链接、Base64 编码以及本地文件路径等多种形式,都可以作为参数动态注入到模板中,适配不同的开发场景需求,无论是在线图片还是本地图片,都能轻松处理。
# Build a multimodal chat prompt: a system instruction plus a user turn whose
# content is a list holding a text part and an image part.
# Requires: from langchain_core.prompts import ChatPromptTemplate
prompt_template = ChatPromptTemplate.from_messages([
    {"role": "system", "content": "你是专业的多模态内容分析助手"},
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "用中文简短描述图片内容"},
            # "{image_url}" is a template variable that invoke() fills in.
            {"type": "image_url", "image_url": {"url": "{image_url}"}},
        ],
    },
])

# Inject the concrete image location into the template.
prompt_value = prompt_template.invoke({"image_url": "图片地址"})
了解完基础用法后,就进入实战环节,我们可以通过具体的开发案例,感受多模态AI的实际应用价值,将理论知识转化为可落地的功能。
import asyncio
import os

from dotenv import load_dotenv
from langchain.chat_models import init_chat_model
from langchain_core.prompts import ChatPromptTemplate

# Read the provider settings from the .env file into the process environment.
load_dotenv()

# Environment variables for the selected provider all start with this prefix.
prefix = "QWEN"

# Create a chat model whose model name, API key and base URL are supplied at
# call time through the "configurable" section of the run config.
llm = init_chat_model(
    model_provider="openai",
    configurable_fields=["model", "api_key", "base_url"],
    config_prefix=prefix,
    temperature=0.5,
    max_tokens=200,
)

# Resolve the model name. Prefer <PREFIX>_MODEL; when it is not set, fall back
# to the first entry of the comma-separated <PREFIX>_MODELS list, which is the
# variable the accompanying .env file defines.
model_name = os.getenv(f"{prefix}_MODEL")
if not model_name:
    candidates = (
        name.strip() for name in os.getenv(f"{prefix}_MODELS", "").split(",")
    )
    model_name = next((name for name in candidates if name), None)

# Each configurable field is addressed as "<prefix>_<field>".
config = {
    "configurable": {
        f"{prefix}_model": model_name,
        f"{prefix}_api_key": os.getenv(f"{prefix}_API_KEY"),
        f"{prefix}_base_url": os.getenv(f"{prefix}_BASE_URL"),
    }
}

# Multimodal prompt: a system instruction plus a user turn that carries a text
# part and an image part.
prompt_template = ChatPromptTemplate.from_messages([
    {"role": "system", "content": "你是专业的多模态内容分析助手"},
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "用中文简短描述图片内容"},
            # "{image_url}" is a template variable that invoke() fills in.
            {"type": "image_url", "image_url": {"url": "{image_url}"}},
        ],
    },
])

prompt_value = prompt_template.invoke({"image_url": "图片地址"})

# Send the rendered prompt to the model and print the text of its reply.
res = llm.invoke(prompt_value, config=config)
print(res.content)
在实战中,我们可以将图片识别这一核心功能,集成到基于PySide6开发的AI应用界面中,最终实现对图片中文字、链接等信息的精准识别,让应用具备更实用的交互能力,满足日常开发中的实际需求。
下面是 .env 文件,即各家大模型的参数配置文件(API Key、接口地址、控制台地址和模型列表):
# Large-model provider settings, one group per provider.
# Each group defines <PREFIX>_API_KEY, <PREFIX>_BASE_URL, <PREFIX>_CONSOLE_URL
# and <PREFIX>_MODELS (a comma-separated list of model names).

# ========== Alibaba - Tongyi Qianwen (Qwen) ==========
QWEN_API_KEY="你的API KEY"
QWEN_BASE_URL='https://dashscope.aliyuncs.com/compatible-mode/v1'
QWEN_CONSOLE_URL='https://dashscope.console.aliyun.com/'
QWEN_MODELS='qwen3.6-plus,qwen-3.5-plus,qwen-3.5-72b-instruct'

# ========== Moonshot AI - Kimi ==========
KIMI_API_KEY=''
KIMI_BASE_URL='https://api.moonshot.cn/v1'
KIMI_CONSOLE_URL='https://platform.moonshot.cn/'
KIMI_MODELS='moonshot-v1-8k,moonshot-v1-128k,moonshot-v1-256k'

# ========== MiniMax ==========
MINIMAX_API_KEY='你的API KEY'
MINIMAX_BASE_URL='https://api.minimaxi.com/v1'
MINIMAX_CONSOLE_URL='https://platform.minimax.chat/'
MINIMAX_MODELS='minimax-m2.7,abab-6.5-pro,minimax-m2-her'

# ========== Zhipu AI - ChatGLM ==========
ZHIPU_API_KEY=''
ZHIPU_BASE_URL='https://open.bigmodel.cn/api/paas/v4'
ZHIPU_CONSOLE_URL='https://open.bigmodel.cn/'
ZHIPU_MODELS='glm-4.5-flash,glm-4.5-pro,glm-4-air'

# ========== ByteDance - Doubao ==========
DOUBAO_API_KEY=''
DOUBAO_BASE_URL='https://ark.cn-beijing.volces.com/api/v3'
DOUBAO_CONSOLE_URL='https://console.volcengine.com/ark/'
# IMPORTANT: Doubao does not accept model names directly; replace the values
# below with the inference endpoint IDs created in Volcano Engine Ark.
DOUBAO_MODELS='doubao-pro-32k,doubao-lite-32k,doubao-pro-128k'

# ========== Baidu - ERNIE Bot ==========
ERNIE_API_KEY=''
ERNIE_BASE_URL='https://qianfan.baidubce.com/v2'
ERNIE_CONSOLE_URL='https://console.bce.baidu.com/qianfan/'
ERNIE_MODELS='ernie-4.0-turbo-8k,ernie-3.5-turbo-128k,ernie-speed-128k'

# ========== Tencent - Hunyuan ==========
HUNYUAN_API_KEY=''
HUNYUAN_BASE_URL='https://api.hunyuan.cloud.tencent.com/v1'
HUNYUAN_CONSOLE_URL='https://console.cloud.tencent.com/hunyuan/'
HUNYUAN_MODELS='hunyuan-turbo,hunyuan-pro,hunyuan-lite'

# ========== DeepSeek ==========
DEEPSEEK_API_KEY=''
DEEPSEEK_BASE_URL='https://api.deepseek.com/v1'
DEEPSEEK_CONSOLE_URL='https://platform.deepseek.com/'
DEEPSEEK_MODELS='deepseek-chat,deepseek-reasoner'
下面是 ocr_page.py 的完整代码。运行前记得先安装 PySide6 开发包(pip install pyside6);代码还依赖本地模块 env_util 中的 EnvUtil 来读取 .env 配置。
import os
import sys

from PySide6.QtCore import Qt, Signal, QThread, QTimer
from PySide6.QtWidgets import (
    QWidget,
    QVBoxLayout,
    QHBoxLayout,
    QLabel,
    QLineEdit,
    QPushButton,
    QComboBox,
    QScrollArea,
    QFrame,
    QTextEdit,
    QApplication,
)

from env_util import EnvUtil


class OCRPage(QWidget):
    """Page that sends an image address to a chat model and shows the reply.

    The page offers a model selector, an image-address input, "recognize" and
    "clear" buttons, and a read-only result area. The model call runs in an
    ``OCRWorker`` thread; a timer animates a progress text while it runs.
    """

    def __init__(self, parent=None):
        """Load the model list, build the widgets and create the first model.

        Args:
            parent: Optional parent widget, passed on to ``QWidget``.
        """
        super().__init__(parent)

        # Background worker of the request in flight; None when idle.
        self.worker = None

        # Timer that drives the animated progress text.
        self.thinking_timer = QTimer()
        self.thinking_timer.timeout.connect(self.update_thinking)
        self.dot_count = 0
        self.is_thinking = False

        # Model name -> {"api_key", "base_url", "provider"}; all_models keeps
        # the names in load order for the combo box.
        self.env_util = EnvUtil()
        self.model_configs = {}
        self.all_models = []
        self.load_models_from_env()

        layout = QVBoxLayout(self)
        layout.setContentsMargins(0, 0, 0, 0)
        layout.setSpacing(0)

        content = QWidget()
        content_layout = QVBoxLayout(content)
        content_layout.setSpacing(20)

        title = QLabel("图片内容识别")
        title.setStyleSheet("font-size: 20px; font-weight: bold; color: #333;")

        # --- model selector -------------------------------------------------
        model_group = QVBoxLayout()
        model_label = QLabel("选择模型")
        model_label.setStyleSheet(
            "font-weight: 600; color: #444; margin-bottom: 6px;"
        )
        self.model_combo = QComboBox()
        self.model_combo.addItems(self.all_models)
        self.model_combo.setStyleSheet("""
            QComboBox {
                background-color: #ffffff;
                color: #333333;
                border: 1px solid #d1d5db;
                border-radius: 4px;
                padding: 6px 10px;
            }
        """)
        self.model_combo.setMinimumHeight(32)
        model_group.addWidget(model_label)
        model_group.addWidget(self.model_combo)

        # --- image address input --------------------------------------------
        image_url_group = QVBoxLayout()
        image_url_label = QLabel("图片地址")
        image_url_label.setStyleSheet(
            "font-weight: 600; color: #444; margin-bottom: 6px;"
        )
        self.image_url_input = QLineEdit()
        self.image_url_input.setPlaceholderText("请输入或粘贴图片地址")
        self.image_url_input.setStyleSheet("""
            QLineEdit {
                background-color: #ffffff;
                border: 1px solid #d1d5db;
                border-radius: 4px;
                padding: 6px 10px;
                font-size: 14px;
            }
        """)
        image_url_group.addWidget(image_url_label)
        image_url_group.addWidget(self.image_url_input)

        # --- action buttons -------------------------------------------------
        button_group = QHBoxLayout()
        self.recognize_btn = QPushButton("识 别")
        self.recognize_btn.setFixedWidth(120)
        self.recognize_btn.setStyleSheet("""
            QPushButton {
                background-color: #1890ff;
                color: white;
                border: none;
                border-radius: 4px;
                padding: 8px 12px;
                font-size: 14px;
            }
            QPushButton:hover {
                background-color: #40a9ff;
            }
            QPushButton:disabled {
                background-color: #d9d9d9;
            }
        """)
        self.recognize_btn.clicked.connect(self.on_recognize_clicked)

        self.clear_btn = QPushButton("清 空")
        self.clear_btn.setFixedWidth(120)
        self.clear_btn.setStyleSheet("""
            QPushButton {
                background-color: #f5f5f5;
                color: #333;
                border: 1px solid #d9d9d9;
                border-radius: 4px;
                padding: 8px 12px;
                font-size: 14px;
            }
            QPushButton:hover {
                background-color: #e6e6e6;
            }
        """)
        self.clear_btn.clicked.connect(self.on_clear_clicked)

        button_group.addWidget(self.recognize_btn)
        button_group.addWidget(self.clear_btn)
        button_group.addStretch()

        # --- result area ----------------------------------------------------
        result_group = QVBoxLayout()
        result_label = QLabel("识别结果")
        result_label.setStyleSheet(
            "font-weight: 600; color: #444; margin-bottom: 6px;"
        )
        self.result_text = QTextEdit()
        self.result_text.setReadOnly(True)
        self.result_text.setMinimumHeight(200)
        self.result_text.setPlaceholderText("识别结果将显示在这里...")
        self.result_text.setStyleSheet("""
            QTextEdit {
                background-color: #ffffff;
                border: 1px solid #d1d5db;
                border-radius: 4px;
                padding: 10px;
                font-size: 14px;
                color: #333;
            }
        """)
        result_group.addWidget(result_label)
        result_group.addWidget(self.result_text)

        content_layout.addWidget(title)
        content_layout.addLayout(model_group)
        content_layout.addLayout(image_url_group)
        content_layout.addLayout(button_group)
        # Stretch factor 1 lets the result area take the spare vertical space.
        content_layout.addLayout(result_group, 1)
        content_layout.setContentsMargins(40, 20, 40, 15)

        scroll = QScrollArea()
        scroll.setWidget(content)
        scroll.setWidgetResizable(True)
        scroll.setFrameShape(QFrame.NoFrame)
        layout.addWidget(scroll)

        self.init_chat_model()

    def closeEvent(self, event):
        """Stop a running worker and the progress timer before closing."""
        if self.worker and self.worker.isRunning():
            self.worker.stop()
            self.worker.wait()
        self.thinking_timer.stop()
        event.accept()

    def load_models_from_env(self):
        """Fill ``model_configs`` and ``all_models`` from the env settings.

        Every model listed for a provider is registered with that provider's
        API key and base URL.
        """
        providers = [
            "QWEN", "KIMI", "MINIMAX", "ZHIPU",
            "DOUBAO", "ERNIE", "HUNYUAN", "DEEPSEEK",
        ]
        config = self.env_util.load_config(providers)
        for provider, provider_config in config.items():
            for model in provider_config["models"]:
                self.model_configs[model] = {
                    "api_key": provider_config["api_key"],
                    "base_url": provider_config["base_url"],
                    "provider": provider.lower(),
                }
                self.all_models.append(model)

    def init_chat_model(self, model_name=None):
        """Create ``self.model`` for the given model name.

        Args:
            model_name: Model to use; defaults to the combo box selection.
                Names missing from ``model_configs`` are ignored and leave
                ``self.model`` untouched.
        """
        from langchain.chat_models import init_chat_model

        if model_name is None:
            model_name = self.model_combo.currentText()
        if model_name in self.model_configs:
            config = self.model_configs[model_name]
            self.model = init_chat_model(
                model=model_name,
                model_provider="openai",
                api_key=config["api_key"],
                base_url=config["base_url"],
                temperature=0.5,
            )

    def update_thinking(self):
        """Advance the progress text by one step (zero to three dots)."""
        self.dot_count = (self.dot_count + 1) % 4
        dots = "." * self.dot_count
        self.result_text.setPlainText("识别中" + dots)

    def on_recognize_clicked(self):
        """Start a recognition request for the entered image address.

        Does nothing when the input is empty, a request is already running,
        or the selected model has no configuration.
        """
        image_url = self.image_url_input.text().strip()
        if not image_url or self.worker:
            return

        model_name = self.model_combo.currentText()
        if model_name not in self.model_configs:
            return
        self.init_chat_model(model_name)

        self.dot_count = 0
        self.is_thinking = True
        self.thinking_timer.start(500)
        self.recognize_btn.setEnabled(False)
        self.image_url_input.setEnabled(False)

        self.worker = OCRWorker(self.model, image_url)
        self.worker.result_received.connect(self.on_result_received)
        self.worker.finished.connect(self.on_finished)
        self.worker.start()

    def on_result_received(self, result_text):
        """Show the text delivered by the worker and stop the animation."""
        if self.thinking_timer.isActive():
            self.thinking_timer.stop()
        self.result_text.setPlainText(result_text)

    def on_finished(self):
        """Restore the idle UI state once the worker thread has finished."""
        self.thinking_timer.stop()
        self.is_thinking = False
        self.recognize_btn.setEnabled(True)
        self.image_url_input.setEnabled(True)
        # Make sure the thread has fully exited before the last reference to
        # it is dropped, so the thread object is never destroyed mid-run.
        if self.worker is not None:
            self.worker.wait()
        self.worker = None

    def on_clear_clicked(self):
        """Clear both the image address input and the result area."""
        self.image_url_input.clear()
        self.result_text.clear()


class OCRWorker(QThread):
    """Thread that sends one image to the chat model and reports the reply.

    The reply text, or an error message, is delivered through
    ``result_received``. Completion is reported by the ``finished`` signal
    that ``QThread`` itself provides when ``run`` returns.
    """

    result_received = Signal(str)

    def __init__(self, model, image_url):
        """Store the chat model and the image address to analyse."""
        super().__init__()
        self.model = model
        self.image_url = image_url
        self._is_running = True

    def stop(self):
        """Mark the worker as stopped so that a late result is not emitted."""
        self._is_running = False

    def run(self):
        """Thread entry point: run the async request on a fresh event loop."""
        import asyncio

        try:
            asyncio.run(self.async_run())
        except Exception as e:
            print(f"OCRWorker error: {e}")
            self.result_received.emit(f"识别出错: {str(e)}")

    async def async_run(self):
        """Render the prompt, call the model and emit the reply text.

        Errors are printed and, unless the worker was stopped, reported
        through ``result_received`` as well.
        """
        from langchain_core.prompts import ChatPromptTemplate

        try:
            prompt_template = ChatPromptTemplate.from_messages([
                {"role": "system", "content": "用中文简短描述图片内容"},
                {"role": "user", "content": [{"image_url": "{image_url}"}]},
            ])
            prompt_value = prompt_template.invoke(
                {"image_url": self.image_url}
            )
            res = await self.model.ainvoke(prompt_value)
            if self._is_running:
                self.result_received.emit(res.content)
        except Exception as e:
            print(f"async_run error: {e}")
            if self._is_running:
                self.result_received.emit(f"识别出错: {str(e)}")


if __name__ == "__main__":
    app = QApplication(sys.argv)
    ocr_page = OCRPage()
    ocr_page.setWindowTitle("图片内容识别")
    ocr_page.resize(800, 600)
    ocr_page.show()
    sys.exit(app.exec())
#人工智能##AI妙生图##langchain##程序员##python##热门##热搜##今日头条##热搜##智能体##编程#
相关文章









猜你喜欢
成员 网址收录40418 企业收录2986 印章生成263660 电子证书1157 电子名片68 自媒体110024