AI Speech and Image Recognition Guide
🎤 Speech Recognition
1. OpenAI Whisper (recommended)
Features:
- Runs fully offline, so audio never leaves your machine
- Supports 99 languages
- Highly accurate
- Free and open source
Installation:
pip install openai-whisper  # ffmpeg must also be installed on the system
Basic usage:
import whisper

# Load a model (tiny, base, small, medium, large)
model = whisper.load_model("base")

# Transcribe an audio file
result = model.transcribe("audio.mp3")
print(result["text"])

# Chinese is supported
result = model.transcribe("chinese_audio.mp3", language="zh")
print(result["text"])

# Get timestamps (segments are always in the result;
# verbose=True merely prints progress while decoding)
result = model.transcribe("audio.mp3", verbose=True)
for segment in result["segments"]:
    print(f"[{segment['start']:.2f}s - {segment['end']:.2f}s] {segment['text']}")
2. Google Speech-to-Text
Features:
- Cloud service with high accuracy
- Supports real-time streaming
- Automatic punctuation
Installation:
pip install google-cloud-speech
Usage example:
from google.cloud import speech
import io

def transcribe_file(speech_file):
    """Transcribe a local audio file."""
    client = speech.SpeechClient()
    with io.open(speech_file, "rb") as audio_file:
        content = audio_file.read()
    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="zh-TW",  # Traditional Chinese
        enable_automatic_punctuation=True,
    )
    response = client.recognize(config=config, audio=audio)
    for result in response.results:
        print(f"Transcript: {result.alternatives[0].transcript}")
        print(f"Confidence: {result.alternatives[0].confidence}")
3. Real-Time Speech Recognition
Installation:
pip install SpeechRecognition pyaudio
Live microphone recognition:
import speech_recognition as sr

def live_speech_recognition():
    recognizer = sr.Recognizer()
    mic = sr.Microphone()
    print("Adjusting for ambient noise...")
    with mic as source:
        recognizer.adjust_for_ambient_noise(source, duration=1)
    print("Start speaking...")
    while True:
        try:
            with mic as source:
                # Wait up to 1 s for speech; capture at most 5 s per phrase
                audio = recognizer.listen(source, timeout=1, phrase_time_limit=5)
            # Use the free Google Web Speech API
            text = recognizer.recognize_google(audio, language="zh-TW")
            print(f"You said: {text}")
            # Whisper also works here:
            # text = recognizer.recognize_whisper(audio, language="chinese")
        except sr.WaitTimeoutError:
            pass  # no speech within the timeout; keep listening
        except sr.UnknownValueError:
            pass  # speech was unintelligible
        except sr.RequestError as e:
            print(f"Error: {e}")
        except KeyboardInterrupt:
            print("\nStopping recognition")
            break

if __name__ == "__main__":
    live_speech_recognition()
📷 Image Recognition
1. YOLOv8 (Object Detection)
Features:
- Real-time detection
- High accuracy
- Works on video streams
Installation:
pip install ultralytics
Usage example:
from ultralytics import YOLO
import cv2

# Load a pretrained model
model = YOLO('yolov8n.pt')  # n=nano, s=small, m=medium, l=large, x=extra large

# Image detection
def detect_image(image_path):
    results = model(image_path)
    for r in results:
        for box in r.boxes:
            # Box coordinates
            x1, y1, x2, y2 = box.xyxy[0]
            # Class and confidence score
            conf = box.conf[0]
            cls = box.cls[0]
            print(f"Detected: {model.names[int(cls)]} (confidence: {conf:.2f}) "
                  f"at ({x1:.0f}, {y1:.0f})-({x2:.0f}, {y2:.0f})")
    # Save the annotated result
    results[0].save(filename='result.jpg')

# Real-time video detection
def detect_video(video_path):
    cap = cv2.VideoCapture(video_path)  # or 0 for a webcam
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        results = model(frame)
        annotated_frame = results[0].plot()
        cv2.imshow('YOLOv8 Detection', annotated_frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
    cap.release()
    cv2.destroyAllWindows()
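To keep the annotated output instead of only displaying it, the plotted frames can be written back out with OpenCV's VideoWriter. A sketch under the assumption that the input's fps and size should carry over; the 'mp4v' codec and output name are arbitrary choices:

def detect_video_to_file(video_path, output_path='annotated.mp4'):
    """Run YOLOv8 on every frame and save the annotated video (sketch)."""
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS) or 30  # fall back if fps is unreadable
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    writer = cv2.VideoWriter(
        output_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (width, height)
    )
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        # plot() returns the annotated frame at the same size as the input
        writer.write(model(frame)[0].plot())
    cap.release()
    writer.release()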
2. Image Classification with Transformers
Installation:
pip install transformers torch pillow
Usage example:
from transformers import pipeline
from PIL import Image

# Build a classifier
classifier = pipeline("image-classification",
                      model="google/vit-base-patch16-224")

def classify_image(image_path):
    image = Image.open(image_path)
    results = classifier(image)
    print("Image classification results:")
    for item in results[:5]:  # show the top 5 results
        print(f"  {item['label']}: {item['score']:.3f}")
    return results
# Object detection
detector = pipeline("object-detection",
                    model="facebook/detr-resnet-50")

def detect_objects(image_path):
    image = Image.open(image_path)
    results = detector(image)
    print("Detected objects:")
    for item in results:
        print(f"  {item['label']}: {item['score']:.2f}")
        print(f"  Box: {item['box']}")
    return results
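The pipeline returns each box as a dict with xmin/ymin/xmax/ymax keys, so detections can be drawn directly with PIL. A minimal sketch; the function name, colors, and 0.9 score cutoff are my own choices, not part of the original:

from PIL import ImageDraw

def draw_detections(image_path, output_path="detections.jpg", min_score=0.9):
    """Draw the detector's boxes and labels onto the image (sketch)."""
    image = Image.open(image_path).convert("RGB")
    draw = ImageDraw.Draw(image)
    for item in detector(image):
        if item["score"] < min_score:
            continue  # skip low-confidence detections
        box = item["box"]
        draw.rectangle(
            [box["xmin"], box["ymin"], box["xmax"], box["ymax"]],
            outline="red", width=3,
        )
        draw.text((box["xmin"], box["ymin"] - 10),
                  f"{item['label']} {item['score']:.2f}", fill="red")
    image.save(output_path)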
3. Face Recognition
Installation:
pip install face-recognition opencv-python
Usage example:
import face_recognition
import cv2
import numpy as np

def face_detection_and_recognition():
    # Load a known face
    known_image = face_recognition.load_image_file("person1.jpg")
    known_encoding = face_recognition.face_encodings(known_image)[0]
    known_face_encodings = [known_encoding]
    known_face_names = ["Person 1"]

    # Open the webcam
    video_capture = cv2.VideoCapture(0)
    while True:
        ret, frame = video_capture.read()
        if not ret:
            break
        # Convert BGR (OpenCV) to RGB (face_recognition); cvtColor also
        # yields a contiguous array, which dlib requires
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Locate and encode every face in the frame
        face_locations = face_recognition.face_locations(rgb_frame)
        face_encodings = face_recognition.face_encodings(rgb_frame, face_locations)

        for (top, right, bottom, left), face_encoding in zip(face_locations, face_encodings):
            # Compare against the known faces
            matches = face_recognition.compare_faces(known_face_encodings, face_encoding)
            name = "Unknown"
            # Use face distance to pick the best match
            face_distances = face_recognition.face_distance(known_face_encodings, face_encoding)
            best_match_index = np.argmin(face_distances)
            if matches[best_match_index]:
                name = known_face_names[best_match_index]
            # Draw a box and label
            cv2.rectangle(frame, (left, top), (right, bottom), (0, 255, 0), 2)
            cv2.putText(frame, name, (left, top - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.75, (0, 255, 0), 2)
        cv2.imshow('Video', frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
    video_capture.release()
    cv2.destroyAllWindows()
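Face encoding is the slow step, so a common trick (used in the face_recognition project's own examples) is to detect on a downscaled copy and map the boxes back to full size. A sketch of that idea; the 1/4 scale factor is an arbitrary speed/accuracy trade-off:

def fast_face_locations(frame, scale=0.25):
    """Detect faces on a downscaled copy and scale the boxes back up (sketch)."""
    small = cv2.resize(frame, (0, 0), fx=scale, fy=scale)
    rgb_small = cv2.cvtColor(small, cv2.COLOR_BGR2RGB)
    factor = int(1 / scale)
    return [
        (top * factor, right * factor, bottom * factor, left * factor)
        for (top, right, bottom, left) in face_recognition.face_locations(rgb_small)
    ]

The returned locations can then be passed to face_recognition.face_encodings on the full-resolution RGB frame, keeping the recognition quality while cutting detection time.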
🔄 Multimodal Applications
1. CLIP - Image-Text Matching
Installation:
pip install transformers torch pillow
Usage example:
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

def image_text_similarity(image_path, text_descriptions):
    """Score an image against a list of text descriptions."""
    image = Image.open(image_path)
    # Prepare the inputs
    inputs = processor(
        text=text_descriptions,
        images=image,
        return_tensors="pt",
        padding=True
    )
    # Compute similarities
    with torch.no_grad():
        outputs = model(**inputs)
    logits_per_image = outputs.logits_per_image
    probs = logits_per_image.softmax(dim=1)
    # Show the results
    for desc, prob in zip(text_descriptions, probs[0]):
        print(f"{desc}: {prob:.2%}")
    # Return the most likely description
    max_idx = int(probs.argmax())
    return text_descriptions[max_idx]

# Example usage (note: this CLIP checkpoint was trained on English text,
# so English descriptions work best)
descriptions = [
    "a cat sleeping",
    "a dog playing with a ball",
    "a person running",
    "a car on the road"
]
best_match = image_text_similarity("test_image.jpg", descriptions)
print(f"\nBest match: {best_match}")
2. Video Understanding (Combining Speech and Vision)
import whisper
import cv2
from ultralytics import YOLO
import numpy as np

class VideoAnalyzer:
    def __init__(self):
        self.whisper_model = whisper.load_model("base")
        self.yolo_model = YOLO('yolov8n.pt')

    def analyze_video(self, video_path):
        """Run a full analysis of the video's content."""
        # Extract and transcribe the audio
        audio_text = self.transcribe_audio(video_path)
        # Analyze the visual content
        visual_summary = self.analyze_visual(video_path)
        return {
            "audio_transcript": audio_text,
            "visual_summary": visual_summary
        }

    def transcribe_audio(self, video_path):
        """Extract and transcribe the audio track."""
        # Extract audio with ffmpeg (must be installed on the system)
        import subprocess
        audio_path = "temp_audio.wav"
        subprocess.run([
            "ffmpeg", "-i", video_path,
            "-vn", "-acodec", "pcm_s16le",
            "-ar", "16000", "-ac", "1",
            audio_path, "-y"
        ], check=True)
        result = self.whisper_model.transcribe(audio_path)
        return result["text"]

    def analyze_visual(self, video_path, frame_interval=30):
        """Analyze the visual content by sampling frames."""
        cap = cv2.VideoCapture(video_path)
        frame_count = 0
        detected_objects = {}
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            # Analyze one frame every frame_interval frames
            if frame_count % frame_interval == 0:
                results = self.yolo_model(frame)
                for r in results:
                    for box in r.boxes:
                        cls = int(box.cls[0])
                        label = self.yolo_model.names[cls]
                        detected_objects[label] = detected_objects.get(label, 0) + 1
            frame_count += 1
        cap.release()
        return detected_objects

# Example usage
analyzer = VideoAnalyzer()
results = analyzer.analyze_video("sample_video.mp4")
print("Audio transcript:", results["audio_transcript"])
print("Visual summary:", results["visual_summary"])
📦 Installation Guide
Basic environment setup
# Create a virtual environment
python -m venv ai_recognition_env
source ai_recognition_env/bin/activate  # Linux/Mac
# or
ai_recognition_env\Scripts\activate  # Windows

# Upgrade pip
pip install --upgrade pip
完整安裝套件
# 語音辨識套件
pip install openai-whisper
pip install SpeechRecognition
pip install pyaudio # 可能需要額外安裝 portaudio
# 影像辨識套件
pip install ultralytics # YOLO
pip install transformers # Hugging Face models
pip install torch torchvision # PyTorch
pip install opencv-python # OpenCV
pip install face-recognition # 臉部辨識
# 工具套件
pip install pillow # 圖片處理
pip install numpy # 數值運算
pip install matplotlib # 視覺化
Docker container setup
FROM python:3.9-slim
WORKDIR /app

# Install system dependencies (build-essential and cmake are needed to
# compile dlib when installing face-recognition on a slim image)
RUN apt-get update && apt-get install -y \
    build-essential \
    cmake \
    ffmpeg \
    libsm6 \
    libxext6 \
    libxrender-dev \
    libgomp1 \
    wget \
    && rm -rf /var/lib/apt/lists/*

# Install Python packages
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .
CMD ["python", "app.py"]
💡 Real-World Application Examples
1. Smart Meeting System
Features:
- Live speech-to-text minutes
- Speaker identification
- Key-point summarization
class SmartMeetingSystem:
    def __init__(self):
        self.whisper_model = whisper.load_model("medium")
        self.speakers = {}

    def process_meeting(self, audio_file):
        # Transcribe the meeting
        result = self.whisper_model.transcribe(
            audio_file,
            language="zh",
            verbose=True
        )
        # Build a timestamped transcript
        transcript = []
        for segment in result["segments"]:
            transcript.append({
                "start": segment["start"],
                "end": segment["end"],
                "text": segment["text"],
                "speaker": self.identify_speaker(segment)  # see the stub below
            })
        return transcript
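identify_speaker is left unimplemented above. Whisper segments carry no speaker information, so real speaker identification needs a dedicated diarization model (the pyannote.audio project is one well-known option). A placeholder stub so the class runs end to end, followed by hypothetical usage:

    def identify_speaker(self, segment):
        """Stub: always returns a single label. Replace with a real
        diarization model; Whisper itself cannot tell speakers apart."""
        return "Speaker 1"

# Hypothetical usage
system = SmartMeetingSystem()
for seg in system.process_meeting("meeting.wav"):
    print(f"[{seg['start']:.1f}s] {seg['speaker']}: {seg['text'].strip()}")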
2. Smart Security System
Features:
- Face-recognition access control
- Anomalous-behavior detection
- Real-time alert notifications
class SecuritySystem:
    def __init__(self):
        self.yolo_model = YOLO('yolov8x.pt')
        self.known_faces = self.load_known_faces()
        # NOTE: a stock YOLOv8 checkpoint only detects COCO object classes;
        # action labels like these require a custom-trained model
        self.alert_actions = ['fighting', 'falling', 'running']

    def monitor_camera(self, camera_id):
        cap = cv2.VideoCapture(camera_id)
        while True:
            ret, frame = cap.read()
            if not ret:
                break  # stop instead of spinning if the camera drops out
            # Object and behavior detection
            results = self.yolo_model(frame)
            # Check for anomalous behavior
            for r in results:
                for box in r.boxes:
                    label = self.yolo_model.names[int(box.cls[0])]
                    if label in self.alert_actions:
                        self.send_alert(f"Anomalous behavior detected: {label}")
            # Face recognition
            faces = self.detect_faces(frame)
            for face in faces:
                if not self.is_authorized(face):
                    self.send_alert("Unauthorized person detected")
3. Accessibility Assistant
Features:
- Scene descriptions for visually impaired users
- Text-to-speech
- Sign language translation
class AccessibilityAssistant:
    # assumes the imports from the CLIP and YOLO sections above
    def __init__(self):
        self.clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        self.clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        self.yolo_model = YOLO('yolov8n.pt')

    def describe_scene(self, image_path):
        """Describe a scene for a visually impaired user."""
        image = Image.open(image_path)
        # Detect objects
        results = self.yolo_model(image)
        objects = []
        for r in results:
            for box in r.boxes:
                label = self.yolo_model.names[int(box.cls[0])]
                objects.append(label)
        # Build a scene description
        description = f"The scene contains: {', '.join(set(objects))}"
        # Use CLIP for a broader scene classification
        scene_types = [
            "an indoor scene", "an outdoor scene", "a street", "a park",
            "an office", "a home environment", "a shop"
        ]
        inputs = self.clip_processor(
            text=scene_types,
            images=image,
            return_tensors="pt",
            padding=True
        )
        with torch.no_grad():
            outputs = self.clip_model(**inputs)
        probs = outputs.logits_per_image.softmax(dim=1)
        best_scene = scene_types[int(probs.argmax())]
        description += f", and it appears to be {best_scene}"
        return description
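The text-to-speech feature listed above has no code in the original. One option is the offline pyttsx3 library (pip install pyttsx3); a minimal sketch that reads the generated description aloud, with a hypothetical file name:

import pyttsx3

def speak(text):
    """Read text aloud with the system's default TTS voice."""
    engine = pyttsx3.init()
    engine.say(text)
    engine.runAndWait()

# Hypothetical usage
assistant = AccessibilityAssistant()
speak(assistant.describe_scene("street_photo.jpg"))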
4. Content Creation Assistant
Features:
- Automatic video subtitle generation
- Content tag suggestions
- Highlight extraction
class ContentCreatorAssistant:
    def __init__(self):
        self.whisper_model = whisper.load_model("base")
        self.yolo_model = YOLO('yolov8n.pt')

    def generate_subtitles(self, video_path, output_srt):
        """Generate an SRT subtitle file."""
        result = self.whisper_model.transcribe(video_path)
        with open(output_srt, 'w', encoding='utf-8') as f:
            for i, segment in enumerate(result["segments"], 1):
                # SRT format: index, time range, text, blank line
                f.write(f"{i}\n")
                f.write(f"{self.format_time(segment['start'])} --> {self.format_time(segment['end'])}\n")
                f.write(f"{segment['text'].strip()}\n\n")

    def format_time(self, seconds):
        """Convert seconds to the SRT time format (HH:MM:SS,mmm)."""
        hours = int(seconds // 3600)
        minutes = int((seconds % 3600) // 60)
        seconds = seconds % 60
        return f"{hours:02d}:{minutes:02d}:{seconds:06.3f}".replace('.', ',')

    def suggest_tags(self, video_path, num_frames=10):
        """Suggest tags for a video by sampling frames."""
        cap = cv2.VideoCapture(video_path)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # max(1, ...) guards against videos with fewer frames than samples
        sample_interval = max(1, total_frames // num_frames)
        all_objects = {}
        for i in range(0, total_frames, sample_interval):
            cap.set(cv2.CAP_PROP_POS_FRAMES, i)
            ret, frame = cap.read()
            if not ret:
                continue
            results = self.yolo_model(frame)
            for r in results:
                for box in r.boxes:
                    label = self.yolo_model.names[int(box.cls[0])]
                    all_objects[label] = all_objects.get(label, 0) + 1
        cap.release()
        # Sort and return the most frequent labels
        sorted_tags = sorted(all_objects.items(), key=lambda x: x[1], reverse=True)
        return [tag for tag, _ in sorted_tags[:10]]
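A brief usage sketch tying the two methods together (the file names are placeholders):

assistant = ContentCreatorAssistant()
assistant.generate_subtitles("clip.mp4", "clip.srt")
print("Suggested tags:", assistant.suggest_tags("clip.mp4"))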
🎯 Next Steps
- Starter exercises:
  - Begin with Whisper speech-to-text
  - Try YOLOv8 object detection
  - Combine the two into a simple application
- Advanced projects:
  - Build a real-time translation system
  - Develop a smart surveillance application
  - Create an accessibility assistant tool
- Performance optimization:
  - Learn model quantization techniques
  - Use GPU acceleration
  - Deploy to edge devices
- Continued learning:
  - Follow the latest papers and techniques
  - Contribute to open-source projects
  - Join AI community discussions
📝 Licensing and Caveats
- Check the license terms of any open-source model you use
- Follow privacy regulations when handling personal data
- Confirm model licenses before commercial use
- Mind API usage limits and costs