The Complete Guide to Web Crawling with Mojo

🚀 Installing Mojo

System Requirements

  • Operating system: Ubuntu 20.04+ or macOS 12+
  • Hardware: x86-64 or ARM64 architecture
  • Memory: at least 4 GB of RAM

Installation Steps

Method 1: Official Installer (Recommended)

# 1. Download and run the installer from the Modular website
curl -s https://get.modular.com | sh -

# 2. Install Mojo
modular install mojo

# 3. Set up environment variables
echo 'export MODULAR_HOME="$HOME/.modular"' >> ~/.bashrc
echo 'export PATH="$MODULAR_HOME/pkg/packages.modular.com_mojo/bin:$PATH"' >> ~/.bashrc
source ~/.bashrc

# 4. Verify the installation
mojo --version

Method 2: MAX Platform

# 1. Register for and download the MAX Platform
# 2. Install MAX
sudo dpkg -i max-*.deb

# 3. Authenticate with MAX
max auth login

# 4. Install Mojo
max install mojo

# 5. Verify
mojo --version

Development Environment Setup

VS Code Extension

# Install Mojo language support
code --install-extension modular-mojotools.mojo

Jupyter Notebook Support

# Install Jupyter
pip install jupyter

# Register the Mojo kernel
max install jupyter

# Start Jupyter
jupyter notebook

🛠️ Environment Preparation

Installing Python Dependencies

# Install the packages required for crawling
pip install requests beautifulsoup4 lxml html5lib aiohttp

# Optional packages
pip install selenium pandas numpy

Project Structure

mojo-crawler/
├── crawler.mojo          # Main crawler file
├── utils.mojo            # Utility functions
├── config.mojo           # Configuration
├── requirements.txt      # Python dependencies
└── README.md             # Documentation
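
For reference, the requirements.txt in this layout simply lists the packages installed above; a minimal sketch (no version pins, since this guide does not specify any):

# requirements.txt
requests
beautifulsoup4
lxml
html5lib
aiohttp
# optional
selenium
pandas
numpy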

🕷️ Basic Crawler Examples

1. A Simple Web Page Crawler

# crawler.mojo
from python import Python

def main():
    """基礎爬蟲範例"""
    
    # Import the Python modules
    let requests = Python.import_module("requests")
    let bs4 = Python.import_module("bs4")
    
    # Set the request headers
    let headers = Python.dict()
    headers["User-Agent"] = "Mozilla/5.0 (compatible; MojoCrawler/1.0)"
    
    try:
        # Send the HTTP request
        let url = "https://example.com"
        let response = requests.get(url, headers=headers, timeout=10)
        
        if response.status_code == 200:
            print("✅ 請求成功")
            
            # Parse the HTML
            let soup = bs4.BeautifulSoup(response.content, "html.parser")
            
            # Extract the page title
            let title = soup.find("title")
            if title:
                print("標題:", title.get_text())
            
            # Extract all links
            let links = soup.find_all("a", href=True)
            print(f"找到 {len(links)} 個連結")
            
            for i in range(min(5, len(links))):  # show only the first 5
                let link = links[i]
                print(f"  {i+1}. {link.get_text()}: {link['href']}")
                
        else:
            print("❌ 請求失敗, 狀態碼:", response.status_code)
            
    except Exception as e:
        print("錯誤:", e)

2. A JSON API Crawler

def crawl_json_api():
    """爬取 JSON API 數據"""
    
    let requests = Python.import_module("requests")
    let json = Python.import_module("json")
    
    let headers = Python.dict()
    headers["Accept"] = "application/json"
    headers["User-Agent"] = "MojoCrawler/1.0"
    
    try:
        # Fetch data from the API
        let api_url = "https://jsonplaceholder.typicode.com/posts"
        let response = requests.get(api_url, headers=headers)
        
        if response.status_code == 200:
            let data = response.json()
            print(f"📊 獲取到 {len(data)} 筆數據")
            
            # Process the first 3 records
            for i in range(min(3, len(data))):
                let post = data[i]
                print(f"\n📝 貼文 {i+1}:")
                print(f"  標題: {post['title']}")
                print(f"  用戶ID: {post['userId']}")
                print(f"  內容: {post['body'][:50]}...")
                
        else:
            print("❌ API 請求失敗")
            
    except Exception as e:
        print("錯誤:", e)

🚀 Advanced Crawler Examples

1. An Object-Oriented Crawler

from python import Python
from memory import Reference

struct WebCrawler:
    """高性能網頁爬蟲"""
    var session: PythonObject
    var headers: PythonObject
    var delay: Float64
    var max_retries: Int
    
    fn __init__(inout self):
        """初始化爬蟲"""
        let requests = Python.import_module("requests")
        self.session = requests.Session()
        
        # Set default request headers
        self.headers = Python.dict()
        self.headers["User-Agent"] = "Mozilla/5.0 (compatible; MojoCrawler/2.0)"
        self.headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
        self.headers["Accept-Language"] = "zh-TW,zh;q=0.9,en;q=0.8"
        self.headers["Accept-Encoding"] = "gzip, deflate, br"
        self.headers["Connection"] = "keep-alive"
        
        self.delay = 1.0
        self.max_retries = 3
    
    fn fetch_url(self, url: String) raises -> PythonObject:
        """獲取 URL 內容(帶重試機制)"""
        let time = Python.import_module("time")
        
        for retry in range(self.max_retries):
            try:
                let response = self.session.get(
                    url, 
                    headers=self.headers, 
                    timeout=10,
                    allow_redirects=True
                )
                
                if response.status_code == 200:
                    return response
                elif response.status_code == 429:  # Too Many Requests
                    print(f"⚠️ 請求過於頻繁,等待 {(retry + 1) * 2} 秒...")
                    time.sleep((retry + 1) * 2)
                else:
                    print(f"❌ HTTP {response.status_code}")
                    
            except Exception as e:
                print(f"🔄 重試 {retry + 1}/{self.max_retries}: {e}")
                if retry < self.max_retries - 1:
                    time.sleep(self.delay * (retry + 1))
        
        raise Error("所有重試都失敗了")
    
    fn parse_html(self, html_content: PythonObject) -> PythonObject:
        """解析 HTML 內容"""
        let bs4 = Python.import_module("bs4")
        return bs4.BeautifulSoup(html_content, "html.parser")
    
    fn extract_data(self, soup: PythonObject) -> PythonObject:
        """提取結構化數據"""
        let data = Python.dict()
        
        # Extract the title
        let title = soup.find("title")
        data["title"] = title.get_text().strip() if title else "No title"
        
        # Extract the meta description
        let meta_desc = soup.find("meta", attrs={"name": "description"})
        data["description"] = meta_desc.get("content", "") if meta_desc else ""
        
        # Extract all headings
        let headings = Python.list()
        for level in range(1, 7):  # h1-h6
            let tags = soup.find_all(f"h{level}")
            for i in range(len(tags)):
                let heading = tags[i]
                headings.append({
                    "level": level,
                    "text": heading.get_text().strip()
                })
        data["headings"] = headings
        
        # Extract the links
        let links = Python.list()
        let link_tags = soup.find_all("a", href=True)
        for i in range(len(link_tags)):
            let link = link_tags[i]
            links.append({
                "text": link.get_text().strip(),
                "url": link["href"]
            })
        data["links"] = links
        
        # Extract the images
        let images = Python.list()
        let img_tags = soup.find_all("img", src=True)
        for i in range(len(img_tags)):
            let img = img_tags[i]
            images.append({
                "src": img["src"],
                "alt": img.get("alt", "")
            })
        data["images"] = images
        
        return data

2. A Batch Crawling Example

def batch_crawl_demo():
    """批量爬蟲示例"""
    
    let time = Python.import_module("time")
    let json = Python.import_module("json")
    
    var crawler = WebCrawler()
    
    # List of URLs to crawl
    let urls = [
        "https://example.com",
        "https://httpbin.org/html",
        "https://httpbin.org/json"
    ]
    
    let results = Python.list()
    
    print("🚀 開始批量爬取...")
    
    for i in range(len(urls)):
        let url = urls[i]
        print(f"\n📄 正在處理第 {i+1}/{len(urls)} 個: {url}")
        
        try:
            # Fetch the page
            let response = crawler.fetch_url(url)
            
            # Parse the content
            if "application/json" in str(response.headers.get("content-type", "")):
                # JSON data
                let data = response.json()
                results.append({
                    "url": url,
                    "type": "json",
                    "data": data
                })
            else:
                # HTML data
                let soup = crawler.parse_html(response.content)
                let extracted_data = crawler.extract_data(soup)
                results.append({
                    "url": url,
                    "type": "html",
                    "data": extracted_data
                })
            
            print("✅ 處理完成")
            
        except Exception as e:
            print(f"❌ 處理失敗: {e}")
            results.append({
                "url": url,
                "type": "error",
                "error": str(e)
            })
        
        # Delay between requests
        if i < len(urls) - 1:
            time.sleep(crawler.delay)
    
    # Save the results
    try:
        with open("crawl_results.json", "w", encoding="utf-8") as f:
            json.dump(results, f, ensure_ascii=False, indent=2)
        print("\n💾 結果已保存到 crawl_results.json")
    except Exception as e:
        print(f"💥 保存失敗: {e}")
    
    print(f"\n🎉 批量爬取完成! 共處理 {len(urls)} 個 URL")

🛠️ Useful Utility Functions

1. URL Helpers

def normalize_url(base_url: String, relative_url: String) -> String:
    """規範化 URL"""
    let urllib = Python.import_module("urllib.parse")
    return str(urllib.urljoin(base_url, relative_url))

def is_valid_url(url: String) -> Bool:
    """檢查 URL 是否有效"""
    let urllib = Python.import_module("urllib.parse")
    let parsed = urllib.urlparse(url)
    return bool(parsed.netloc and parsed.scheme)

2. Data Processing Helpers

def clean_text(text: PythonObject) -> String:
    """清理文本數據"""
    let re = Python.import_module("re")
    
    # Collapse extra whitespace
    cleaned = re.sub(r'\s+', ' ', str(text))
    
    # Remove special characters (keep word characters, whitespace, and CJK)
    cleaned = re.sub(r'[^\w\s\u4e00-\u9fff]', '', cleaned)
    
    return str(cleaned).strip()

def extract_emails(text: String) -> PythonObject:
    """提取電子郵件地址"""
    let re = Python.import_module("re")
    let pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
    return re.findall(pattern, text)

def extract_phone_numbers(text: String) -> PythonObject:
    """提取電話號碼(臺灣格式)"""
    let re = Python.import_module("re")
    let patterns = [
        r'09\d{8}',           # Mobile numbers
        r'0\d{1,2}-\d{7,8}',  # Landlines
        r'\(\d{2,3}\)\d{7,8}' # Numbers with an area code in parentheses
    ]
    
    let results = Python.list()
    for pattern in patterns:
        let matches = re.findall(pattern, text)
        results.extend(matches)
    
    return results

3. Data Storage Helpers

def save_to_csv(data: PythonObject, filename: String):
    """保存數據到 CSV"""
    let pandas = Python.import_module("pandas")
    
    try:
        let df = pandas.DataFrame(data)
        df.to_csv(filename, index=False, encoding='utf-8-sig')
        print(f"💾 數據已保存到 {filename}")
    except Exception as e:
        print(f"💥 CSV 保存失敗: {e}")

def save_to_json(data: PythonObject, filename: String):
    """保存數據到 JSON"""
    let json = Python.import_module("json")
    
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        print(f"💾 數據已保存到 {filename}")
    except Exception as e:
        print(f"💥 JSON 保存失敗: {e}")

📋 Best Practices

1. Respect robots.txt

def check_robots_txt(base_url: String) -> Bool:
    """檢查 robots.txt"""
    let urllib = Python.import_module("urllib.robotparser")
    
    let robots_url = base_url.rstrip('/') + '/robots.txt'
    let rp = urllib.RobotFileParser()
    rp.set_url(robots_url)
    
    try:
        rp.read()
        return rp.can_fetch('*', base_url)
    except:
        return True  # If robots.txt cannot be read, assume crawling is allowed
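
The same check in pure Python (the Mojo code above reaches these modules through Python.import_module), extended to test a specific page URL rather than only the site root; can_crawl is an illustrative helper, not part of the guide's code:

from urllib.parse import urljoin, urlparse
from urllib.robotparser import RobotFileParser

def can_crawl(page_url, user_agent="MojoCrawler/1.0"):
    """Check whether robots.txt allows fetching a specific page URL."""
    parts = urlparse(page_url)
    robots_url = urljoin(f"{parts.scheme}://{parts.netloc}", "/robots.txt")
    rp = RobotFileParser()
    rp.set_url(robots_url)
    try:
        rp.read()
    except OSError:
        return True  # if robots.txt cannot be read, assume crawling is allowed
    return rp.can_fetch(user_agent, page_url)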

2. Rate Limiting

struct RateLimiter:
    """請求速率限制器"""
    var last_request_time: Float64
    var min_interval: Float64
    
    fn __init__(inout self, requests_per_second: Float64):
        self.last_request_time = 0.0
        self.min_interval = 1.0 / requests_per_second
    
    fn wait_if_needed(inout self):
        """如有需要則等待"""
        let time = Python.import_module("time")
        let current_time = float(time.time())
        
        let time_since_last = current_time - self.last_request_time
        if time_since_last < self.min_interval:
            let wait_time = self.min_interval - time_since_last
            time.sleep(wait_time)
        
        self.last_request_time = float(time.time())

3. Error Handling

def robust_crawl(url: String) -> PythonObject:
    """具有強健錯誤處理的爬蟲函數"""
    let requests = Python.import_module("requests")
    let time = Python.import_module("time")
    
    let max_retries = 3
    let backoff_factor = 2
    
    for attempt in range(max_retries):
        try:
            let response = requests.get(
                url,
                timeout=10,
                headers={"User-Agent": "MojoCrawler/1.0"}
            )
            
            if response.status_code == 200:
                return response
            elif response.status_code == 429:
                let wait_time = backoff_factor ** attempt
                print(f"⏳ 請求限制,等待 {wait_time} 秒...")
                time.sleep(wait_time)
            else:
                print(f"❌ HTTP {response.status_code}")
                
        except Exception as e:
            print(f"🔄 嘗試 {attempt + 1}: {e}")
            if attempt < max_retries - 1:
                time.sleep(backoff_factor ** attempt)
    
    raise Error("所有嘗試都失敗了")

💡 FAQ

Q: What advantages does Mojo have over Python?

A: Mojo's advantages for crawling:

  • Performance: 10-100x faster than Python
  • Memory efficiency: better memory management
  • Parallelism: native support for parallel computation
  • Compatibility: Python packages can be used directly

Q: How do I handle anti-crawling measures?

A: Common strategies:

# 1. Rotate User-Agent strings
let user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36"
]

# 2. Use proxies
let proxies = {
    "http": "http://proxy:port",
    "https": "https://proxy:port"
}

# 3. Mimic human behavior
time.sleep(random.uniform(1, 3))
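
To show how these fragments fit together, here is a minimal pure-Python sketch (the Mojo examples in this guide reach the same libraries through Python.import_module). The polite_get helper is hypothetical, and PROXIES is left as None with a placeholder shown in the comment:

import random
import time

import requests

USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
]

# Replace with a real mapping, e.g. {"http": "http://proxy:port", "https": "http://proxy:port"}
PROXIES = None

def polite_get(url):
    """GET a URL with a random User-Agent, optional proxies, and a human-like delay."""
    headers = {"User-Agent": random.choice(USER_AGENTS)}
    time.sleep(random.uniform(1, 3))  # pause 1-3 seconds to mimic real browsing
    return requests.get(url, headers=headers, proxies=PROXIES, timeout=10)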

Q: How do I handle JavaScript-rendered pages?

A: Use Selenium:

def crawl_js_page(url: String) -> PythonObject:
    # Import the webdriver submodule directly (importing "selenium" alone does not load it)
    let webdriver = Python.import_module("selenium.webdriver")
    
    let driver = webdriver.Chrome()
    driver.get(url)
    
    # Wait for the page to load
    let time = Python.import_module("time")
    time.sleep(3)
    
    let content = driver.page_source
    driver.quit()
    
    return content
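
If you prefer not to rely on a fixed sleep, a slightly more robust pure-Python variant (assuming Selenium 4+ with Chrome and chromedriver installed) runs headless and waits for an element explicitly; it mirrors the example above and is only a sketch:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

def crawl_js_page(url):
    """Render a JavaScript-heavy page headlessly and return its HTML source."""
    options = Options()
    options.add_argument("--headless=new")  # no visible browser window
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        # Wait up to 10 seconds for the <body> element instead of sleeping blindly.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )
        return driver.page_source
    finally:
        driver.quit()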

Q: How do I run a distributed crawl?

A: You can combine it with Celery or RQ:

# Task distribution
def distribute_urls(urls: PythonObject, num_workers: Int) -> PythonObject:
    let chunks = Python.list()
    let chunk_size = len(urls) // num_workers
    
    for i in range(num_workers):
        let start = i * chunk_size
        let end = start + chunk_size if i < num_workers - 1 else len(urls)
        chunks.append(urls[start:end])
    
    return chunks
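
As a concrete illustration of handing those chunks to workers, here is a minimal pure-Python sketch using RQ. It assumes a Redis server on localhost and a hypothetical crawl_urls task defined in a tasks module; Celery would follow the same pattern with its own task decorator:

from redis import Redis
from rq import Queue

from tasks import crawl_urls  # hypothetical worker function that crawls a list of URLs

queue = Queue(connection=Redis())  # assumes Redis on localhost:6379

def enqueue_chunks(chunks):
    """Enqueue one crawl job per URL chunk; RQ workers process them in parallel."""
    return [queue.enqueue(crawl_urls, chunk) for chunk in chunks]

Start one or more workers with the rq worker command from the same directory so they can import the tasks module.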

🎯 Running the Complete Example

# 1. Create the project directory
mkdir mojo-crawler && cd mojo-crawler

# 2. Create and run the crawler
echo '# the complete code from above' > crawler.mojo
mojo crawler.mojo

# 3. Inspect the results
cat crawl_results.json

📚 Further Learning Resources


Note: Always comply with the target site's terms of service and robots.txt rules, and keep your data collection reasonable and lawful.