Yesterday I used Qwen3-ASR to transcribe a video file. Installation itself went quickly, but actual use surfaced quite a few problems, all of which I eventually worked through with an AI assistant's help. This post collects every problem I hit and the fixes, so that others can avoid them ahead of time.
Environment requirements:

| Item | Requirement |
|---|---|
| OS | Windows 10/11 |
| Python | 3.9+ |
| RAM | 8 GB+ (CPU mode) |
| Disk space | 10 GB+ |
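Before installing anything, a few lines of standard-library Python can confirm the machine meets these requirements. This is my own convenience check, not part of the Qwen3-ASR setup:

```python
import platform
import shutil
import sys

# Rough self-check against the table above
print("OS:", platform.system(), platform.release())   # expect Windows 10/11
print("Python:", sys.version.split()[0])              # expect 3.9+
free_gb = shutil.disk_usage(".").free / 2**30
print(f"Free disk: {free_gb:.1f} GB")                 # expect 10 GB+
```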
```bash
# Create a fresh environment
conda create -n qwen3-asr python=3.10 -y

# Activate it
conda activate qwen3-asr

# Uninstall any old PyTorch builds (if present)
pip uninstall torch torchvision torchaudio -y

# Clear the pip cache
pip cache purge

# Install the CPU build (Tsinghua mirror for faster downloads in China)
pip install torch torchvision torchaudio -i https://pypi.tuna.tsinghua.edu.cn/simple

# Install the ASR package and audio helpers
pip install qwen-asr pydub ffmpeg-python
```
python -c "import torch; print('PyTorch版本:', torch.__version__)"
python -c "from qwen_asr import Qwen3ASRModel; print('qwen_asr 导入成功')"
Next, download the model:

```python
# download_model.py
from huggingface_hub import snapshot_download

model_path = "./Qwen3-ASR-0.6B"
snapshot_download(
    repo_id="Qwen/Qwen3-ASR-0.6B",
    local_dir=model_path
)
print(f"Model downloaded to: {model_path}")
```
This downloads the model files from HuggingFace into the local directory `./Qwen3-ASR-0.6B`, including `config.json`, `pytorch_model.bin`, and the other required files.

Next, extract the audio track and convert it to the format the model expects:

```python
# mp4_to_wav.py
from pydub import AudioSegment

def mp4_to_wav(mp4_path, wav_path):
    """Convert an MP4 file to WAV."""
    audio = AudioSegment.from_file(mp4_path, format="mp4")
    audio = audio.set_frame_rate(16000)  # 16 kHz sample rate
    audio = audio.set_channels(1)        # mono
    audio.export(wav_path, format="wav")
    print(f"Conversion finished: {wav_path}")

if __name__ == "__main__":
    mp4_file = r"C:\Resource\Data\Qwen-Asr\input.mp4"
    wav_file = r"C:\Resource\Data\Qwen-Asr\temp_audio.wav"
    mp4_to_wav(mp4_file, wav_file)
```
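Since ffmpeg-python was installed earlier, an alternative is to let ffmpeg do the conversion directly; it streams the file instead of loading it fully into memory, which helps with long videos. A sketch based on the public ffmpeg-python API:

```python
# Alternative converter using ffmpeg-python (a sketch, not the author's script)
import ffmpeg

def mp4_to_wav_ffmpeg(mp4_path, wav_path):
    (
        ffmpeg
        .input(mp4_path)
        .output(wav_path, ar=16000, ac=1)  # 16 kHz, mono, same as above
        .overwrite_output()
        .run(quiet=True)
    )
```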
If the audio is long, split it into chunks first:

```python
import math

from pydub import AudioSegment

def split_audio_if_needed(audio_path, max_duration=120):
    """Split the audio into chunks if it exceeds max_duration seconds."""
    audio = AudioSegment.from_wav(audio_path)
    duration = len(audio) / 1000  # pydub lengths are in milliseconds
    if duration <= max_duration:
        return [audio_path], duration
    chunks = []
    num_chunks = math.ceil(duration / max_duration)  # avoids an empty trailing chunk
    for i in range(num_chunks):
        start = i * max_duration * 1000
        end = min((i + 1) * max_duration * 1000, len(audio))
        chunk = audio[start:end]
        chunk_path = f"temp_audio_chunk_{i}.wav"
        chunk.export(chunk_path, format="wav")
        chunks.append(chunk_path)
    return chunks, duration
```
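Fixed-length cuts can land mid-word and hurt accuracy at chunk boundaries. If that becomes a problem, pydub can split on silence instead. Here is a sketch using `pydub.silence.split_on_silence`; the thresholds are guesses you will want to tune for your material:

```python
from pydub import AudioSegment
from pydub.silence import split_on_silence

def split_on_pauses(audio_path):
    """Cut at natural pauses instead of fixed offsets (thresholds are guesses)."""
    audio = AudioSegment.from_wav(audio_path)
    pieces = split_on_silence(
        audio,
        min_silence_len=500,             # a pause of >= 500 ms counts as a cut point
        silence_thresh=audio.dBFS - 16,  # relative to the clip's average loudness
        keep_silence=200,                # keep 200 ms of padding on each side
    )
    paths = []
    for i, piece in enumerate(pieces):
        path = f"temp_audio_pause_{i}.wav"
        piece.export(path, format="wav")
        paths.append(path)
    return paths
```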
Use `inspect.signature` to check the correct signature of the `transcribe` method:

```bash
python -c "from qwen_asr import Qwen3ASRModel; import inspect; print(inspect.signature(Qwen3ASRModel.transcribe))"
```

Output:

```
(self, audio: Union[str, Tuple[numpy.ndarray, int], List[...]],
 context: Union[str, List[str]] = '',
 language: Union[str, List[Optional[str]], NoneType] = None,
 return_time_stamps: bool = False) -> List[ASRTranscription]
```
| Parameter | Correct usage | Common mistake |
|---|---|---|
| Timestamp flag | return_time_stamps | return_timestamps |
| Batch size | not supported | passing batch_size |
| Return type | List[ASRTranscription] | treating it as a plain string |
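Because the accepted parameters can shift between qwen-asr releases, a defensive option is to filter keyword arguments against the live signature before calling. This helper is my own sketch, not part of the library:

```python
import inspect

def safe_transcribe(model, audio, **kwargs):
    """Drop any kwargs that this qwen-asr version's transcribe() does not accept."""
    allowed = inspect.signature(model.transcribe).parameters
    kept = {k: v for k, v in kwargs.items() if k in allowed}
    return model.transcribe(audio, **kept)
```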
With the signature pinned down, basic usage looks like this:

```python
from qwen_asr import Qwen3ASRModel

# Load the model
model = Qwen3ASRModel.from_pretrained(
    "./Qwen3-ASR-0.6B",
    device_map="cpu",
    torch_dtype="auto"
)

# Transcribe an audio file
result = model.transcribe(
    "audio.wav",
    return_time_stamps=False
)

# Extract the text (the return value is a list)
text = " ".join([item.text for item in result])
print(text)
```
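The signature also exposes `context` and `language`. A hedged example follows; whether `"zh"` is a valid language tag for this model, and how much the context string helps, are my assumptions, so check the model card before relying on them:

```python
# context and language come straight from the signature shown earlier;
# the "zh" tag and the context hint are assumptions to verify.
result = model.transcribe(
    "audio.wav",
    context="PyTorch, conda, Qwen3-ASR",  # terms likely to appear in the audio
    language="zh",
    return_time_stamps=False,
)
print(" ".join(item.text for item in result))
```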
Here are the problems I ran into, with fixes.

Problem 1: `ModuleNotFoundError: No module named 'torch'`

Cause: PyTorch is not installed, or the wrong environment is active.

Fix:

```bash
conda activate qwen3-asr
pip install torch -i https://pypi.tuna.tsinghua.edu.cn/simple
```
Problem 2: `TypeError: transcribe() got an unexpected keyword argument 'batch_size'`

Cause: the `transcribe()` method does not accept a `batch_size` parameter.

Fix: remove the parameter and pass only the audio path.
Problem 3: `TypeError: transcribe() got an unexpected keyword argument 'return_timestamps'`

Cause: the parameter name is `return_time_stamps` (with an underscore between every word).

Fix:

```python
# Wrong
model.transcribe(audio, return_timestamps=False)
# Correct
model.transcribe(audio, return_time_stamps=False)
```
Problem 4: the result prints as a list of objects instead of text

Cause: the return value is `List[ASRTranscription]`; you have to iterate over it and pull out each item's `.text`.

Fix:

```python
def extract_text_from_result(result):
    """Normalize the transcribe() return value to a plain string."""
    if isinstance(result, list):
        return " ".join([item.text for item in result])
    elif hasattr(result, 'text'):
        return result.text
    else:
        return str(result)
```
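Used together with the earlier snippet:

```python
result = model.transcribe("audio.wav", return_time_stamps=False)
print(extract_text_from_result(result))
```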
Putting it all together, the complete CPU transcription script:

```python
# mp4_to_text_cpu.py
import math
import os
import threading
import time

from pydub import AudioSegment
from qwen_asr import Qwen3ASRModel


class ProgressDisplay:
    """Spinner showing elapsed time (and chunk progress) on a single line."""

    def __init__(self):
        self.stop_flag = False
        self.start_time = None
        self.thread = None
        self.current_chunk = 0
        self.total_chunks = 1

    def start(self, total_chunks=1):
        self.start_time = time.time()
        self.stop_flag = False
        self.total_chunks = total_chunks
        self.thread = threading.Thread(target=self._show_progress, daemon=True)
        self.thread.start()

    def update_chunk(self, chunk_num):
        self.current_chunk = chunk_num

    def stop(self):
        self.stop_flag = True
        if self.thread:
            self.thread.join(timeout=0.5)
        print("\n")

    def _show_progress(self):
        symbols = ['⠋', '⠙', '⠹', '⠸', '⠼', '⠴', '⠦', '⠧', '⠇', '⠏']
        i = 0
        while not self.stop_flag:
            elapsed = time.time() - self.start_time
            if self.total_chunks > 1:
                print(f"\r {symbols[i]} processing... chunk {self.current_chunk}/{self.total_chunks} | elapsed: {elapsed:.1f}s", end='', flush=True)
            else:
                print(f"\r {symbols[i]} processing... elapsed: {elapsed:.1f}s", end='', flush=True)
            i = (i + 1) % len(symbols)
            time.sleep(0.1)


def split_audio_if_needed(audio_path, max_duration=120):
    """Split the audio into chunks if it exceeds max_duration seconds."""
    audio = AudioSegment.from_wav(audio_path)
    duration = len(audio) / 1000
    if duration <= max_duration:
        return [audio_path], duration
    chunks = []
    num_chunks = math.ceil(duration / max_duration)  # avoids an empty trailing chunk
    for i in range(num_chunks):
        start = i * max_duration * 1000
        end = min((i + 1) * max_duration * 1000, len(audio))
        chunk = audio[start:end]
        chunk_path = f"temp_audio_chunk_{i}.wav"
        chunk.export(chunk_path, format="wav")
        chunks.append(chunk_path)
    return chunks, duration


def extract_text_from_result(result):
    """Normalize the transcribe() return value to a plain string."""
    if isinstance(result, list):
        return " ".join([item.text for item in result if hasattr(item, 'text')])
    elif hasattr(result, 'text'):
        return result.text
    else:
        return str(result)


def main():
    # Environment check
    print("=" * 50)
    print(" Environment check")
    print("=" * 50)
    import torch
    print(f"PyTorch version: {torch.__version__}")
    print(f"CUDA available: {torch.cuda.is_available()}")
    print("=" * 50)

    # Load the model
    print("\n Loading model")
    model_path = r"./Qwen3-ASR-0.6B"
    if not os.path.exists(model_path):
        print(f" Model directory not found: {model_path}")
        return
    model = Qwen3ASRModel.from_pretrained(
        model_path,
        device_map="cpu",
        torch_dtype="auto"
    )
    print(" Model loaded!")

    # Prepare the audio
    print("\n Audio preparation")
    audio_path = r"C:\Resource\Data\Qwen-Asr\temp_audio.wav"
    if not os.path.exists(audio_path):
        print(f" Audio file not found: {audio_path}")
        return
    audio_chunks, total_duration = split_audio_if_needed(audio_path, max_duration=120)
    print(f"Total duration: {total_duration:.1f}s, chunks: {len(audio_chunks)}")

    # Transcribe
    print("\n Transcribing")
    progress = ProgressDisplay()
    progress.start(total_chunks=len(audio_chunks))
    all_results = []
    start_time = time.time()
    for i, chunk in enumerate(audio_chunks):
        progress.update_chunk(i + 1)
        try:
            result = model.transcribe(chunk, return_time_stamps=False)
            text = extract_text_from_result(result)
            all_results.append(text)
        except Exception as e:
            print(f"\n Chunk {i+1} failed: {e}")
            all_results.append("[transcription failed]")
    progress.stop()
    total_time = time.time() - start_time

    # Print the result
    print("\n" + "=" * 50)
    print(" Transcription result")
    print("=" * 50)
    full_text = " ".join(all_results)
    print(full_text)

    # Save the result
    output_path = r"C:\Resource\Data\Qwen-Asr\result.txt"
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(full_text)

    print("\n" + "=" * 50)
    print(" Statistics")
    print("=" * 50)
    print(f"Audio duration: {total_duration:.1f}s")
    print(f"Processing time: {total_time:.1f}s")
    print(f"Real-time factor: {total_time/total_duration:.2f}x")
    print(f"Result saved to: {output_path}")
    print("=" * 50)


if __name__ == "__main__":
    main()
```
Quick start, end to end:

```bash
# 1. Activate the environment
conda activate qwen3-asr

# 2. Convert the audio (MP4 -> WAV)
python mp4_to_wav.py

# 3. Run the transcription
python mp4_to_text_cpu.py

# 4. View the result (use `type result.txt` in cmd.exe)
cat result.txt
```
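If you have a folder of videos, the pieces above compose into a simple batch loop. A sketch that assumes `mp4_to_wav`, `extract_text_from_result`, and a loaded `model` from the earlier scripts are in scope:

```python
import glob
import os

# Hypothetical batch driver reusing the functions defined earlier in this post
for mp4 in glob.glob(r"C:\Resource\Data\Qwen-Asr\*.mp4"):
    wav = mp4[:-4] + ".wav"
    mp4_to_wav(mp4, wav)
    result = model.transcribe(wav, return_time_stamps=False)
    with open(mp4[:-4] + ".txt", "w", encoding="utf-8") as f:
        f.write(extract_text_from_result(result))
    print("done:", os.path.basename(mp4))
```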
Rough performance reference:

| Mode | Real-time factor (processing time / audio duration) | Best for |
|---|---|---|
| CPU | 2-5x | no GPU, small batches |
| GPU | 0.3-0.8x | large batches, production |
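One knob that sometimes helps CPU throughput is PyTorch's thread count. `torch.set_num_threads` is a real PyTorch API, but the best value depends on your machine, so treat this as something to experiment with rather than a guaranteed speedup:

```python
import os
import torch

# Let PyTorch use all logical cores; try the physical core count as well,
# since on some machines that is faster for inference.
torch.set_num_threads(os.cpu_count() or 4)
```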
Key points when using Qwen3-ASR:

- The timestamp parameter is `return_time_stamps`, not `return_timestamps`.
- `transcribe()` does not accept `batch_size`; check which parameters (`context`, `language`, and so on) your version actually supports.
- The return value is `List[ASRTranscription]`; extract `.text` from each item.

With the steps in this post you should be able to deploy Qwen3-ASR locally and run speech recognition smoothly. When in doubt, check the API with `inspect.signature` first to avoid parameter errors.

This post is distilled from real debugging sessions; I hope it saves you some detours!