高頻交易 Rust 終極效能優化指南

完整涵蓋：位元運算、查表法、CPU 綁定、記憶體優化、Cache 優化、系統調校

1. 位元運算優化

基本運算替換

#![allow(unused)]
fn main() {
// ============ 乘法/除法 ============
// ❌ 慢
let a = x * 2;
let b = x * 4;
let c = x * 8;
let d = x / 2;
let e = x / 16;

// ✅ 快（快 3-10 倍）
let a = x << 1;      // 乘以 2
let b = x << 2;      // 乘以 4
let c = x << 3;      // 乘以 8
let d = x >> 1;      // 除以 2
let e = x >> 4;      // 除以 16

// ============ 模運算 ============
// ❌ 慢
let remainder = x % 8;
let r2 = x % 16;
let r3 = x % 32;

// ✅ 快
let remainder = x & 7;    // % 8  (7 = 2^3 - 1)
let r2 = x & 15;          // % 16 (15 = 2^4 - 1)
let r3 = x & 31;          // % 32 (31 = 2^5 - 1)

// ============ 奇偶判斷 ============
// ❌ 慢
if x % 2 == 0 { /* 偶數 */ }

// ✅ 快
if (x & 1) == 0 { /* 偶數 */ }

// ============ 判斷 2 的冪次 ============
fn is_power_of_two(x: u32) -> bool {
    x != 0 && (x & (x - 1)) == 0
}

// ============ 取絕對值（整數）============
fn abs_i32(x: i32) -> i32 {
    let mask = x >> 31;  // 負數全為 1，正數全為 0
    (x ^ mask) - mask
}

fn abs_i64(x: i64) -> i64 {
    let mask = x >> 63;
    (x ^ mask) - mask
}

// ============ 交換變數 ============
// ❌ 需要臨時變數
let temp = a;
a = b;
b = temp;

// ✅ XOR 交換（無需額外記憶體）
a ^= b;
b ^= a;
a ^= b;

// ============ 最小/最大值（無分支）============
fn min_branchless(a: i32, b: i32) -> i32 {
    b ^ ((a ^ b) & -((a < b) as i32))
}

fn max_branchless(a: i32, b: i32) -> i32 {
    a ^ ((a ^ b) & -((a < b) as i32))
}

// ============ 位元操作進階 ============
// 找到最低位的 1
fn lowest_set_bit(x: u64) -> u64 {
    x & x.wrapping_neg()
}

// 清除最低位的 1
fn clear_lowest_bit(x: u64) -> u64 {
    x & (x - 1)
}

// 計算 trailing zeros（使用 CPU 指令）
fn trailing_zeros(x: u64) -> u32 {
    x.trailing_zeros()  // 編譯成 BSF/TZCNT 指令
}

// 計算 leading zeros
fn leading_zeros(x: u64) -> u32 {
    x.leading_zeros()   // 編譯成 BSR/LZCNT 指令
}

// 計算 1 的數量（popcount）
fn count_ones(x: u64) -> u32 {
    x.count_ones()      // 編譯成 POPCNT 指令
}

// 位元反轉
fn reverse_bits(x: u32) -> u32 {
    x.reverse_bits()    // 硬體加速
}

// ============ 條件選擇（無分支）============
// 選擇 a 或 b，根據 condition
fn select(condition: bool, a: i32, b: i32) -> i32 {
    let mask = -(condition as i32);  // true: -1, false: 0
    (a & mask) | (b & !mask)
}

// ============ 符號擴展 ============
// 將 8-bit 符號數擴展到 32-bit
fn sign_extend_8to32(x: u8) -> i32 {
    ((x as i8) as i32)
}

// ============ 快速整數平方根（近似）============
fn isqrt_approx(x: u32) -> u32 {
    if x == 0 { return 0; }
    let mut z = x;
    let mut result = 0u32;
    let mut bit = 1u32 << 30;
    
    while bit > x {
        bit >>= 2;
    }
    
    while bit != 0 {
        if x >= result + bit {
            z = x - (result + bit);
            result = (result >> 1) + bit;
        } else {
            result >>= 1;
        }
        bit >>= 2;
    }
    result
}
}

實際交易應用

#![allow(unused)]
fn main() {
// 價格 tick 計算
const TICK_SIZE: u32 = 25;  // 2^5 = 32 的倍數，可用位移

fn price_to_tick(price: u32) -> u32 {
    price >> 5  // 除以 32，等同於 price / TICK_SIZE
}

fn tick_to_price(tick: u32) -> u32 {
    tick << 5   // 乘以 32
}

// 訂單 ID 編碼/解碼
// 高 32 位：時間戳，低 32 位：序號
fn encode_order_id(timestamp: u32, sequence: u32) -> u64 {
    ((timestamp as u64) << 32) | (sequence as u64)
}

fn decode_timestamp(order_id: u64) -> u32 {
    (order_id >> 32) as u32
}

fn decode_sequence(order_id: u64) -> u32 {
    (order_id & 0xFFFFFFFF) as u32
}

// 快速判斷買賣方向（bit flag）
const BUY_FLAG: u8 = 0b0000_0001;
const MARKET_ORDER: u8 = 0b0000_0010;
const IOC_FLAG: u8 = 0b0000_0100;

fn is_buy_order(flags: u8) -> bool {
    (flags & BUY_FLAG) != 0
}

fn is_market_order(flags: u8) -> bool {
    (flags & MARKET_ORDER) != 0
}

// 組合多個標誌
fn create_flags(is_buy: bool, is_market: bool, is_ioc: bool) -> u8 {
    ((is_buy as u8) * BUY_FLAG) |
    ((is_market as u8) * MARKET_ORDER) |
    ((is_ioc as u8) * IOC_FLAG)
}
}

2. 查表法 (Lookup Table)

基礎查表

#![allow(unused)]
fn main() {
// ============ 預計算常用值 ============
// 位元計數表（8-bit）
static POPCOUNT_TABLE: [u8; 256] = {
    let mut table = [0u8; 256];
    let mut i = 0;
    while i < 256 {
        table[i] = (i as u8).count_ones() as u8;
        i += 1;
    }
    table
};

fn popcount_lookup(mut x: u32) -> u32 {
    let mut count = 0u32;
    while x != 0 {
        count += POPCOUNT_TABLE[(x & 0xFF) as usize] as u32;
        x >>= 8;
    }
    count
}

// ============ 對數表 ============
static LOG2_TABLE: [f64; 256] = {
    let mut table = [0.0; 256];
    let mut i = 1;
    while i < 256 {
        table[i] = (i as f64).log2();
        i += 1;
    }
    table
};

// ============ 平方根表 ============
static SQRT_TABLE: [f64; 1024] = {
    let mut table = [0.0; 1024];
    let mut i = 0;
    while i < 1024 {
        table[i] = (i as f64).sqrt();
        i += 1;
    }
    table
};

// ============ 手續費計算表 ============
static FEE_TABLE: [f64; 1000] = {
    let mut table = [0.0; 1000];
    let mut i = 0;
    while i < 1000 {
        table[i] = if i < 100 {
            i as f64 * 0.001
        } else if i < 500 {
            i as f64 * 0.0008
        } else {
            i as f64 * 0.0005
        };
        i += 1;
    }
    table
};

#[inline(always)]
fn calculate_fee(volume: u32) -> f64 {
    if volume < 1000 {
        FEE_TABLE[volume as usize]
    } else {
        volume as f64 * 0.0005
    }
}
}

交易應用查表

#![allow(unused)]
fn main() {
// ============ 價格等級查表 ============
// 假設價格範圍 10000-20000，精度 0.01
const PRICE_MIN: u32 = 1000000;  // 10000.00 * 100
const PRICE_MAX: u32 = 2000000;  // 20000.00 * 100
const PRICE_RANGE: usize = (PRICE_MAX - PRICE_MIN) as usize + 1;

static PRICE_LEVEL_TABLE: [u16; PRICE_RANGE] = {
    let mut table = [0u16; PRICE_RANGE];
    let mut i = 0;
    while i < PRICE_RANGE {
        let price = PRICE_MIN + i as u32;
        table[i] = ((price - PRICE_MIN) / 100) as u16;  // 每 1.00 一個等級
        i += 1;
    }
    table
};

#[inline(always)]
fn get_price_level(price: u32) -> u16 {
    if price < PRICE_MIN || price > PRICE_MAX {
        return 0;
    }
    PRICE_LEVEL_TABLE[(price - PRICE_MIN) as usize]
}

// ============ 波動率區間查表 ============
const VOLATILITY_BUCKETS: usize = 100;
static VOLATILITY_ADJUSTMENT: [f64; VOLATILITY_BUCKETS] = {
    let mut table = [0.0; VOLATILITY_BUCKETS];
    let mut i = 0;
    while i < VOLATILITY_BUCKETS {
        let vol = i as f64 * 0.01;  // 0.00 到 1.00
        table[i] = if vol < 0.2 {
            1.0
        } else if vol < 0.5 {
            0.95
        } else {
            0.85
        };
        i += 1;
    }
    table
};

// ============ 時間衰減表（選擇權）============
static TIME_DECAY: [f64; 365] = {
    let mut table = [0.0; 365];
    let mut i = 0;
    while i < 365 {
        let days = i as f64;
        table[i] = (-days / 365.0).exp();
        i += 1;
    }
    table
};
}

動態查表（執行時建立）

#![allow(unused)]
fn main() {
use std::collections::HashMap;
use std::sync::Arc;

// 使用 FxHashMap（更快的 hash）
use rustc_hash::FxHashMap;

struct OrderBook {
    // 價格 -> 訂單列表索引
    price_index: FxHashMap<u32, Vec<usize>>,
    orders: Vec<Order>,
}

impl OrderBook {
    fn new() -> Self {
        Self {
            price_index: FxHashMap::default(),
            orders: Vec::with_capacity(10000),
        }
    }
    
    fn add_order(&mut self, price: u32, order: Order) {
        let idx = self.orders.len();
        self.orders.push(order);
        self.price_index.entry(price).or_insert_with(Vec::new).push(idx);
    }
    
    #[inline(always)]
    fn get_orders_at_price(&self, price: u32) -> Option<&Vec<usize>> {
        self.price_index.get(&price)
    }
}
}

3. 分支預測優化

#![allow(unused)]
fn main() {
// ============ Likely/Unlikely 提示 ============
#![feature(core_intrinsics)]
use std::intrinsics::{likely, unlikely};

fn process_order(order: &Order) {
    // 大部分訂單是買單
    if unsafe { likely(order.is_buy) } {
        execute_buy(order);
    } else {
        execute_sell(order);
    }
    
    // 極少數情況會出錯
    if unsafe { unlikely(order.quantity == 0) } {
        handle_error();
        return;
    }
    
    normal_processing(order);
}

// ============ 無分支版本 ============
// 條件賦值
fn get_fee_rate(is_vip: bool) -> f64 {
    // ❌ 有分支
    // if is_vip { 0.0005 } else { 0.001 }
    
    // ✅ 無分支
    0.001 - (is_vip as u32 as f64 * 0.0005)
}

// 條件選擇
fn select_value(condition: bool, true_val: i32, false_val: i32) -> i32 {
    let index = condition as usize;
    [false_val, true_val][index]
}

// ============ 用陣列取代 if-else ============
// ❌ 多重分支
fn get_tier_name(tier: u8) -> &'static str {
    if tier == 0 { "Bronze" }
    else if tier == 1 { "Silver" }
    else if tier == 2 { "Gold" }
    else if tier == 3 { "Platinum" }
    else { "Unknown" }
}

// ✅ 查表
const TIER_NAMES: [&str; 5] = ["Bronze", "Silver", "Gold", "Platinum", "Unknown"];

fn get_tier_name_fast(tier: u8) -> &'static str {
    TIER_NAMES[tier.min(4) as usize]
}

// ============ 位元操作取代分支 ============
// 計算絕對差值
fn abs_diff_branched(a: i32, b: i32) -> i32 {
    if a > b { a - b } else { b - a }
}

fn abs_diff_branchless(a: i32, b: i32) -> i32 {
    let diff = a - b;
    let mask = diff >> 31;
    (diff ^ mask) - mask
}

// Min/Max
fn min_i32(a: i32, b: i32) -> i32 {
    b ^ ((a ^ b) & -((a < b) as i32))
}

fn max_i32(a: i32, b: i32) -> i32 {
    a ^ ((a ^ b) & -((a < b) as i32))
}

// ============ 提前返回（短路）============
fn validate_and_execute(order: &Order) -> Result<(), Error> {
    // 快速失敗路徑
    if order.quantity == 0 { return Err(Error::InvalidQuantity); }
    if order.price == 0 { return Err(Error::InvalidPrice); }
    
    // 主要邏輯
    execute_order(order)
}
}

4. CPU 綁定與排程優化

CPU Affinity

// Cargo.toml
// [dependencies]
// core_affinity = "0.8"
// libc = "0.2"

use core_affinity::{self, CoreId};
use std::thread;

// ============ 基本 CPU 綁定 ============
fn pin_thread_to_core(core_id: usize) {
    let core_ids = core_affinity::get_core_ids().unwrap();
    if core_id < core_ids.len() {
        core_affinity::set_for_current(core_ids[core_id]);
        println!("執行緒綁定到核心 {}", core_id);
    }
}

fn main() {
    // 主執行緒綁到核心 0
    pin_thread_to_core(0);
    
    // 交易執行緒綁到核心 2
    let trading_thread = thread::spawn(|| {
        pin_thread_to_core(2);
        trading_loop();
    });
    
    // 市場資料執行緒綁到核心 3
    let market_data_thread = thread::spawn(|| {
        pin_thread_to_core(3);
        market_data_loop();
    });
    
    trading_thread.join().unwrap();
    market_data_thread.join().unwrap();
}

// ============ 使用 libc 直接設定 ============
use libc::{cpu_set_t, sched_setaffinity, CPU_SET, CPU_ZERO};

fn pin_to_cpu_libc(cpu: usize) -> Result<(), String> {
    unsafe {
        let mut cpuset: cpu_set_t = std::mem::zeroed();
        CPU_ZERO(&mut cpuset);
        CPU_SET(cpu, &mut cpuset);
        
        let result = sched_setaffinity(
            0,  // 0 = 當前執行緒
            std::mem::size_of::<cpu_set_t>(),
            &cpuset
        );
        
        if result != 0 {
            return Err(format!("無法綁定 CPU {}", cpu));
        }
    }
    Ok(())
}

// ============ 綁定多個 CPU ============
fn pin_to_multiple_cpus(cpus: &[usize]) -> Result<(), String> {
    unsafe {
        let mut cpuset: cpu_set_t = std::mem::zeroed();
        CPU_ZERO(&mut cpuset);
        
        for &cpu in cpus {
            CPU_SET(cpu, &mut cpuset);
        }
        
        let result = sched_setaffinity(
            0,
            std::mem::size_of::<cpu_set_t>(),
            &cpuset
        );
        
        if result != 0 {
            return Err("無法設定 CPU affinity".to_string());
        }
    }
    Ok(())
}

// ============ 查詢當前綁定 ============
use libc::{sched_getaffinity, CPU_ISSET};

fn get_current_affinity() -> Vec<usize> {
    unsafe {
        let mut cpuset: cpu_set_t = std::mem::zeroed();
        sched_getaffinity(0, std::mem::size_of::<cpu_set_t>(), &mut cpuset);
        
        let mut cores = Vec::new();
        for cpu in 0..256 {
            if CPU_ISSET(cpu, &cpuset) {
                cores.push(cpu);
            }
        }
        cores
    }
}

Real-Time Priority

#![allow(unused)]
fn main() {
use libc::{sched_param, sched_setscheduler, SCHED_FIFO, SCHED_RR};

// ============ 設定 Real-Time 優先權 ============
fn set_realtime_priority(priority: i32) -> Result<(), String> {
    // priority: 1-99，數字越大優先權越高
    // 一般建議：70-90
    
    if priority < 1 || priority > 99 {
        return Err("優先權必須在 1-99 之間".to_string());
    }
    
    unsafe {
        let param = sched_param {
            sched_priority: priority,
        };
        
        // SCHED_FIFO: 先進先出，執行到完成或主動讓出
        let result = sched_setscheduler(0, SCHED_FIFO, &param);
        
        if result != 0 {
            return Err("需要 CAP_SYS_NICE 權限或 sudo".to_string());
        }
    }
    
    Ok(())
}

// ============ 使用 SCHED_RR（時間片輪詢）============
fn set_realtime_rr(priority: i32, timeslice_ms: u32) -> Result<(), String> {
    unsafe {
        let param = sched_param {
            sched_priority: priority,
        };
        
        // SCHED_RR: Round-Robin，有時間片限制
        let result = sched_setscheduler(0, SCHED_RR, &param);
        
        if result != 0 {
            return Err("無法設定 SCHED_RR".to_string());
        }
    }
    
    Ok(())
}

// ============ 完整的執行緒設定 ============
fn setup_realtime_thread(cpu_id: usize, priority: i32) -> Result<(), String> {
    // 1. 綁定 CPU
    pin_to_cpu_libc(cpu_id)?;
    
    // 2. 設定 RT 優先權
    set_realtime_priority(priority)?;
    
    // 3. 鎖定記憶體（避免 page fault）
    unsafe {
        if libc::mlockall(libc::MCL_CURRENT | libc::MCL_FUTURE) != 0 {
            return Err("無法鎖定記憶體".to_string());
        }
    }
    
    Ok(())
}
}

5. 記憶體與 Cache 優化

Cache-Friendly 資料結構

#![allow(unused)]
fn main() {
// ============ Structure of Arrays (SoA) ============
// ❌ Array of Structures (AoS) - Cache miss 多
struct Order {
    id: u64,
    price: f64,
    quantity: u32,
    timestamp: u64,
}

struct OrderBookAoS {
    orders: Vec<Order>,  // 每次存取跳來跳去
}

// ✅ Structure of Arrays - Cache 友善
struct OrderBookSoA {
    ids: Vec<u64>,
    prices: Vec<f64>,
    quantities: Vec<u32>,
    timestamps: Vec<u64>,
}

impl OrderBookSoA {
    // 只需要價格時，只載入 prices 到 cache
    fn get_best_price(&self) -> Option<f64> {
        self.prices.first().copied()
    }
    
    // SIMD 可以一次處理多個價格
    fn calculate_vwap(&self) -> f64 {
        let total_value: f64 = self.prices.iter()
            .zip(self.quantities.iter())
            .map(|(&p, &q)| p * q as f64)
            .sum();
        let total_qty: u32 = self.quantities.iter().sum();
        total_value / total_qty as f64
    }
}

// ============ Cache Line 大小對齊 ============
const CACHE_LINE_SIZE: usize = 64;

#[repr(align(64))]
struct AlignedData {
    value: u64,
    _padding: [u8; 56],  // 填充到 64 bytes
}

// ============ 避免 False Sharing ============
use crossbeam::utils::CachePadded;

struct SharedCounters {
    // ❌ False sharing - 兩個執行緒寫入同一 cache line
    // counter1: AtomicU64,
    // counter2: AtomicU64,
    
    // ✅ 各自在不同 cache line
    counter1: CachePadded<AtomicU64>,
    counter2: CachePadded<AtomicU64>,
}

// ============ 記憶體池（避免頻繁分配）============
struct OrderPool {
    pool: Vec<Order>,
    free_indices: Vec<usize>,
    capacity: usize,
}

impl OrderPool {
    fn new(capacity: usize) -> Self {
        let mut pool = Vec::with_capacity(capacity);
        for i in 0..capacity {
            pool.push(Order::default());
        }
        
        Self {
            pool,
            free_indices: (0..capacity).collect(),
            capacity,
        }
    }
    
    fn allocate(&mut self) -> Option<&mut Order> {
        if let Some(idx) = self.free_indices.pop() {
            Some(&mut self.pool[idx])
        } else {
            None
        }
    }
    
    fn deallocate(&mut self, order: &Order) {
        // 找到索引並回收
        let idx = order as *const Order as usize - 
                  self.pool.as_ptr() as usize;
        let idx = idx / std::mem::size_of::<Order>();
        self.free_indices.push(idx);
    }
}

// ============ 預分配與重用 ============
struct MessageBuffer {
    buffer: Vec<u8>,
}

impl MessageBuffer {
    fn new() -> Self {
        Self {
            buffer: Vec::with_capacity(4096),
        }
    }
    
    fn prepare(&mut self, size: usize) {
        self.buffer.clear();
        self.buffer.reserve(size);
    }
    
    fn get_buffer(&mut self) -> &mut Vec<u8> {
        &mut self.buffer
    }
}

// ============ Arena Allocator ============
use bumpalo::Bump;

fn use_arena() {
    let arena = Bump::new();
    
    // 所有分配都在 arena 中，一次性釋放
    let orders: Vec<_> = (0..1000)
        .map(|i| arena.alloc(Order { id: i, ..Default::default() }))
        .collect();
    
    // 離開 scope 時一次釋放所有記憶體
}
}

記憶體預熱

#![allow(unused)]
fn main() {
// ============ Cache 預熱 ============
fn warmup_cache<T>(data: &[T]) {
    // 觸發所有 cache line 載入
    for item in data {
        std::hint::black_box(item);
    }
}

fn warmup_order_book(book: &OrderBookSoA) {
    // 預熱所有價格資料
    for price in &book.prices {
        std::hint::black_box(price);
    }
}

// ============ Prefetch（手動）============
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

unsafe fn prefetch_data<T>(ptr: *const T) {
    #[cfg(target_arch = "x86_64")]
    {
        _mm_prefetch(ptr as *const i8, _MM_HINT_T0);  // 載入到 L1 cache
    }
}

// 使用範例
fn process_orders_with_prefetch(orders: &[Order]) {
    for i in 0..orders.len() {
        // 預取下一個訂單
        if i + 1 < orders.len() {
            unsafe {
                prefetch_data(&orders[i + 1]);
            }
        }
        
        process_order(&orders[i]);
    }
}
}

6. Huge Pages 設定

系統設定（不需重編 kernel）

#!/bin/bash
# setup_hugepages.sh

# ============ 查看目前狀態 ============
cat /proc/meminfo | grep -i huge

# ============ 方法 1: Transparent Huge Pages (THP) ============
# 自動啟用，最簡單
echo always | sudo tee /sys/kernel/mm/transparent_hugepage/enabled
echo always | sudo tee /sys/kernel/mm/transparent_hugepage/defrag

# 查看狀態
cat /sys/kernel/mm/transparent_hugepage/enabled
# 應該顯示: [always] madvise never

# ============ 方法 2: 預分配 Huge Pages ============
# 分配 1024 個 2MB 頁面 = 2GB
echo 1024 | sudo tee /proc/sys/vm/nr_hugepages

# 分配 1GB 頁面（需要 CPU 支援）
echo 2 | sudo tee /proc/sys/vm/nr_hugepages_1GB

# 查看分配結果
cat /proc/meminfo | grep HugePages_Total
cat /proc/meminfo | grep HugePages_Free

# ============ 永久設定 ============
# 編輯 /etc/sysctl.conf
sudo tee -a /etc/sysctl.conf <<EOF
# Huge Pages 設定
vm.nr_hugepages = 1024
vm.hugetlb_shm_group = 1000  # 你的 group ID
EOF

# 套用設定
sudo sysctl -p

# ============ 掛載 hugetlbfs ============
sudo mkdir -p /mnt/huge
sudo mount -t hugetlbfs nodev /mnt/huge

# 永久掛載（加到 /etc/fstab）
echo "nodev /mnt/huge hugetlbfs defaults 0 0" | sudo tee -a /etc/fstab

Rust 中使用 Huge Pages

#![allow(unused)]
fn main() {
use libc::{mmap, munmap, MAP_ANONYMOUS, MAP_PRIVATE, MAP_HUGETLB, PROT_READ, PROT_WRITE};
use std::ptr;

// ============ 手動分配 Huge Pages ============
struct HugePageBuffer {
    ptr: *mut u8,
    size: usize,
}

impl HugePageBuffer {
    fn new(size: usize) -> Result<Self, String> {
        // size 應該是 2MB 的倍數
        let size = (size + 2 * 1024 * 1024 - 1) & !(2 * 1024 * 1024 - 1);
        
        unsafe {
            let ptr = mmap(
                ptr::null_mut(),
                size,
                PROT_READ | PROT_WRITE,
                MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
                -1,
                0
            );
            
            if ptr == libc::MAP_FAILED {
                return Err("無法分配 huge pages，檢查系統設定".to_string());
            }
            
            Ok(Self {
                ptr: ptr as *mut u8,
                size,
            })
        }
    }
    
    fn as_slice_mut(&mut self) -> &mut [u8] {
        unsafe {
            std::slice::from_raw_parts_mut(self.ptr, self.size)
        }
    }
}

impl Drop for HugePageBuffer {
    fn drop(&mut self) {
        unsafe {
            munmap(self.ptr as *mut libc::c_void, self.size);
        }
    }
}

// 使用範例
fn use_huge_pages() -> Result<(), String> {
    let mut buffer = HugePageBuffer::new(4 * 1024 * 1024)?;  // 4MB
    let slice = buffer.as_slice_mut();
    
    // 使用 buffer
    slice[0] = 42;
    
    Ok(())
}

// ============ 使用 madvise 提示 ============
use libc::{madvise, MADV_HUGEPAGE};

fn suggest_huge_pages(ptr: *mut u8, size: usize) {
    unsafe {
        madvise(ptr as *mut libc::c_void, size, MADV_HUGEPAGE);
    }
}

// ============ Global Allocator 使用 Huge Pages ============
// Cargo.toml: huge_pages = "0.2"
// 注意：這個會影響所有記憶體分配

// 方法 1: 完全替換 allocator
#[global_allocator]
static ALLOC: huge_pages::HugePageAllocator = huge_pages::HugePageAllocator;

// 方法 2: 只針對特定資料結構
use huge_pages::HugePageVec;

fn use_huge_page_vec() {
    let mut vec = HugePageVec::<u64>::new();
    vec.extend(0..1_000_000);
}
}

7. 資料結構對齊

#![allow(unused)]
fn main() {
// ============ Rust 自動優化 ============
struct Auto {
    a: u8,      // 1 byte
    b: u64,     // 8 bytes  
    c: u16,     // 2 bytes
}
// Rust 會重排成：b, c, a, padding
// 實際大小：16 bytes（已優化）

// ============ C 語言佈局（FFI 用）============
#[repr(C)]
struct CLayout {
    a: u8,      // 位置固定
    // padding: 7 bytes
    b: u64,
    c: u16,
    // padding: 6 bytes
}
// 大小：24 bytes（未優化）

// ============ 緊密打包（無 padding）============
#[repr(packed)]
struct Packed {
    a: u8,
    b: u64,
    c: u16,
}
// 大小：11 bytes
// ⚠️ 警告：存取 b 可能很慢（未對齊）

// ⚠️ 取引用會有問題
fn use_packed() {
    let p = Packed { a: 1, b: 2, c: 3 };
    
    // ❌ 編譯錯誤或警告
    // let r = &p.b;
    
    // ✅ 複製值
    let val = p.b;  // OK
}

// ============ 指定對齊 ============
#[repr(align(16))]
struct Aligned16 {
    data: u64,
}
// 大小：16 bytes（對齊到 16）

#[repr(align(64))]  // Cache line 大小
struct CacheLineAligned {
    data: u64,
}
// 大小：64 bytes

// ============ 組合使用 ============
#[repr(C, align(32))]
struct Combined {
    a: u32,
    b: u32,
}
// C 佈局 + 32-byte 對齊

// ============ 檢查大小和對齊 ============
use std::mem::{size_of, align_of};

fn check_layout() {
    println!("Auto: size={}, align={}", 
        size_of::<Auto>(), align_of::<Auto>());
    
    println!("CLayout: size={}, align={}", 
        size_of::<CLayout>(), align_of::<CLayout>());
    
    println!("Packed: size={}, align={}", 
        size_of::<Packed>(), align_of::<Packed>());
    
    println!("Aligned16: size={}, align={}", 
        size_of::<Aligned16>(), align_of::<Aligned16>());
}

// ============ 實用的對齊巨集 ============
macro_rules! cache_aligned {
    ($name:ident, $field:ty) => {
        #[repr(align(64))]
        struct $name {
            value: $field,
            _padding: [u8; 64 - std::mem::size_of::<$field>()],
        }
    };
}

cache_aligned!(AlignedCounter, std::sync::atomic::AtomicU64);

// ============ SIMD 對齊 ============
#[repr(align(32))]  // AVX 需要 32-byte 對齊
struct SimdBuffer {
    data: [f32; 8],
}

#[repr(align(64))]  // AVX-512 需要 64-byte 對齊
struct Simd512Buffer {
    data: [f32; 16],
}
}

8. 浮點數優化

#![allow(unused)]
fn main() {
// ============ 乘法取代除法 ============
// ❌ 慢
let result = price / 100.0;           // ~10 cycles
let r2 = value / 3.14159;

// ✅ 快
let result = price * 0.01;            // ~3 cycles
let r2 = value * 0.318309886;         // 1/π 預計算

// ============ 整數運算取代浮點 ============
// 價格用整數（單位：分或 tick）
struct Price {
    value: i64,  // 價格 * 10000，精度到 0.0001
}

impl Price {
    fn from_float(price: f64) -> Self {
        Self { value: (price * 10000.0) as i64 }
    }
    
    fn to_float(&self) -> f64 {
        self.value as f64 * 0.0001  // 乘法比除法快
    }
    
    fn add(&self, other: &Self) -> Self {
        Self { value: self.value + other.value }
    }
    
    fn multiply(&self, qty: i64) -> i64 {
        self.value * qty
    }
}

// ============ 避免浮點比較 ============
// ❌ 不精確
if price == 123.45 { }

// ✅ 用整數
let price_int = (price * 10000.0) as i64;
if price_int == 1234500 { }

// ✅ 容差比較
const EPSILON: f64 = 1e-9;
if (price - 123.45).abs() < EPSILON { }

// ============ 快速數學函數 ============
// ❌ 精確但慢
let sqrt_val = x.sqrt();
let ln_val = x.ln();

// ✅ 快速近似（需要 libm 或自己實作）
fn fast_sqrt(x: f32) -> f32 {
    // 使用 CPU 指令
    unsafe { std::arch::x86_64::_mm_cvtss_f32(
        std::arch::x86_64::_mm_sqrt_ss(
            std::arch::x86_64::_mm_set_ss(x)
        )
    )}
}

// 或用位元技巧（Quake III 演算法）
fn fast_inv_sqrt(x: f32) -> f32 {
    let i = x.to_bits();
    let i = 0x5f3759df - (i >> 1);
    let y = f32::from_bits(i);
    y * (1.5 - 0.5 * x * y * y)  // 一次牛頓迭代
}

// ============ FMA (Fused Multiply-Add) ============
// a * b + c 一次完成，更快更精確
fn use_fma(a: f64, b: f64, c: f64) -> f64 {
    a.mul_add(b, c)  // 編譯成 FMA 指令
}

// ============ 避免不必要的型別轉換 ============
// ❌ 慢
let result = (x as f64 * y as f64) as i32;

// ✅ 快
let result = (x * y) as i32;

// ============ 使用 SIMD 版本 ============
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

unsafe fn sqrt_4(values: [f32; 4]) -> [f32; 4] {
    let v = _mm_loadu_ps(values.as_ptr());
    let result = _mm_sqrt_ps(v);
    let mut out = [0.0f32; 4];
    _mm_storeu_ps(out.as_mut_ptr(), result);
    out
}
}

9. SIMD 平行化

#![allow(unused)]
fn main() {
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

// ============ AVX2 範例：處理 4 個 f64 ============
unsafe fn sum_f64_simd(data: &[f64]) -> f64 {
    let mut sum = _mm256_setzero_pd();  // 4 個 f64
    
    let chunks = data.chunks_exact(4);
    let remainder = chunks.remainder();
    
    for chunk in chunks {
        let v = _mm256_loadu_pd(chunk.as_ptr());
        sum = _mm256_add_pd(sum, v);
    }
    
    // 水平相加
    let mut result = [0.0; 4];
    _mm256_storeu_pd(result.as_mut_ptr(), sum);
    let mut total = result.iter().sum::<f64>();
    
    // 處理剩餘
    total += remainder.iter().sum::<f64>();
    total
}

// ============ AVX2：價格 * 數量（批次）============
unsafe fn calculate_values_simd(prices: &[f64], quantities: &[f64], output: &mut [f64]) {
    assert_eq!(prices.len(), quantities.len());
    assert_eq!(prices.len(), output.len());
    
    let len = prices.len();
    let mut i = 0;
    
    while i + 4 <= len {
        let p = _mm256_loadu_pd(prices[i..].as_ptr());
        let q = _mm256_loadu_pd(quantities[i..].as_ptr());
        let result = _mm256_mul_pd(p, q);
        _mm256_storeu_pd(output[i..].as_mut_ptr(), result);
        i += 4;
    }
    
    // 處理剩餘
    while i < len {
        output[i] = prices[i] * quantities[i];
        i += 1;
    }
}

// ============ AVX2：比較（找最大價格）============
unsafe fn find_max_price_simd(prices: &[f64]) -> f64 {
    if prices.is_empty() { return 0.0; }
    
    let mut max_vec = _mm256_set1_pd(f64::MIN);
    
    let chunks = prices.chunks_exact(4);
    for chunk in chunks {
        let v = _mm256_loadu_pd(chunk.as_ptr());
        max_vec = _mm256_max_pd(max_vec, v);
    }
    
    // 提取 4 個值
    let mut result = [0.0; 4];
    _mm256_storeu_pd(result.as_mut_ptr(), max_vec);
    
    let mut max = result.iter().copied().fold(f64::MIN, f64::max);
    
    // 處理剩餘
    for &price in chunks.remainder() {
        max = max.max(price);
    }
    
    max
}

// ============ 使用 packed_simd（更簡單）============
// Cargo.toml: packed_simd = "0.3"
use packed_simd::*;

fn sum_f64_packed_simd(data: &[f64]) -> f64 {
    let mut sum = f64x4::splat(0.0);
    
    for chunk in data.chunks_exact(4) {
        let v = f64x4::from_slice_unaligned(chunk);
        sum += v;
    }
    
    sum.sum()
}

// ============ 自動向量化提示 ============
fn process_prices_auto_vec(prices: &mut [f64]) {
    // 編譯器可能自動向量化
    for price in prices {
        *price *= 1.01;  // 簡單運算容易向量化
    }
}

// 檢查是否向量化：
// RUSTFLAGS="-C opt-level=3 -C target-cpu=native" cargo build --release
// cargo asm your_crate::process_prices_auto_vec
// 看是否有 vmulpd 等 SIMD 指令
}

10. 避免 Context Switch

#![allow(unused)]
fn main() {
use std::sync::atomic::{AtomicBool, Ordering};
use std::time::Duration;

// ============ Busy-Wait（CPU 空轉）============
fn spin_wait_nanos(nanos: u64) {
    let start = std::time::Instant::now();
    while start.elapsed().as_nanos() < nanos as u128 {
        std::hint::spin_loop();  // CPU pause 指令
    }
}

// ============ 混合策略：先 spin 後 sleep ============
fn hybrid_wait(duration: Duration) {
    const SPIN_THRESHOLD: Duration = Duration::from_micros(50);
    
    if duration < SPIN_THRESHOLD {
        // 短時間：busy wait
        spin_wait_nanos(duration.as_nanos() as u64);
    } else {
        // 長時間：先 spin 一小段，再 sleep
        spin_wait_nanos(SPIN_THRESHOLD.as_nanos() as u64);
        std::thread::sleep(duration - SPIN_THRESHOLD);
    }
}

// ============ Lock-Free 輪詢 ============
use crossbeam::queue::ArrayQueue;

static QUEUE: ArrayQueue<Message> = ArrayQueue::new(1024);

fn consumer_loop() {
    loop {
        // 無鎖輪詢，不會被阻塞
        match QUEUE.pop() {
            Some(msg) => process_message(msg),
            None => std::hint::spin_loop(),  // CPU pause
        }
    }
}

// ============ 減少系統呼叫 ============
use std::io::Write;

fn batch_logging() {
    let mut buffer = Vec::with_capacity(4096);
    
    for i in 0..100 {
        // 先寫到 buffer
        writeln!(&mut buffer, "Log {}", i).unwrap();
    }
    
    // 一次性寫入（只有一次系統呼叫）
    std::io::stdout().write_all(&buffer).unwrap();
}

// ============ 禁用搶占（需要 RT priority）============
use libc::{sched_param, sched_setscheduler, SCHED_FIFO};

fn prevent_preemption() {
    unsafe {
        let param = sched_param {
            sched_priority: 99,  // 最高優先權
        };
        sched_setscheduler(0, SCHED_FIFO, &param);
    }
    
    // 現在這個執行緒只會在以下情況被中斷：
    // 1. 主動 yield
    // 2. 更高優先權的執行緒
    // 3. 硬體中斷
}
}

11. Lock-Free 程式設計

#![allow(unused)]
fn main() {
use std::sync::atomic::{AtomicU64, AtomicBool, Ordering};
use crossbeam::queue::{ArrayQueue, SegQueue};
use crossbeam::channel::{bounded, unbounded};

// ============ Atomic 操作 ============
struct LockFreeCounter {
    count: AtomicU64,
}

impl LockFreeCounter {
    fn new() -> Self {
        Self { count: AtomicU64::new(0) }
    }
    
    fn increment(&self) -> u64 {
        self.count.fetch_add(1, Ordering::Relaxed)
    }
    
    fn get(&self) -> u64 {
        self.count.load(Ordering::Relaxed)
    }
}

// ============ Lock-Free Queue ============
struct LockFreeQueue<T> {
    queue: ArrayQueue<T>,
}

impl<T> LockFreeQueue<T> {
    fn new(capacity: usize) -> Self {
        Self {
            queue: ArrayQueue::new(capacity),
        }
    }
    
    fn push(&self, item: T) -> Result<(), T> {
        self.queue.push(item)
    }
    
    fn pop(&self) -> Option<T> {
        self.queue.pop()
    }
}

// ============ MPSC Channel (多生產者單消費者) ============
fn use_mpsc() {
    let (tx, rx) = bounded(1000);
    
    // 多個生產者
    for i in 0..4 {
        let tx = tx.clone();
        std::thread::spawn(move || {
            for msg in 0..100 {
                tx.send((i, msg)).unwrap();
            }
        });
    }
    
    // 單一消費者
    std::thread::spawn(move || {
        while let Ok(msg) = rx.recv() {
            process_message(msg);
        }
    });
}

// ============ SPSC Channel (單生產者單消費者，最快) ============
use crossbeam::channel::unbounded;

fn use_spsc() {
    let (tx, rx) = unbounded();
    
    std::thread::spawn(move || {
        for i in 0..1000000 {
            tx.send(i).unwrap();
        }
    });
    
    std::thread::spawn(move || {
        while let Ok(msg) = rx.recv() {
            // 處理訊息
        }
    });
}

// ============ RwLock 替代方案：SeqLock ============
use seqlock::SeqLock;

struct PriceData {
    price: f64,
    volume: u64,
}

static PRICE: SeqLock<PriceData> = SeqLock::new(PriceData { 
    price: 0.0, 
    volume: 0 
});

// 寫入（單一寫入者）
fn update_price(new_price: f64, new_volume: u64) {
    *PRICE.lock_write() = PriceData {
        price: new_price,
        volume: new_volume,
    };
}

// 讀取（多個讀取者，無鎖）
fn read_price() -> PriceData {
    PRICE.read()
}

// ============ Compare-And-Swap (CAS) ============
use std::sync::atomic::AtomicPtr;

struct Node<T> {
    data: T,
    next: AtomicPtr<Node<T>>,
}

struct LockFreeStack<T> {
    head: AtomicPtr<Node<T>>,
}

impl<T> LockFreeStack<T> {
    fn push(&self, data: T) {
        let new_node = Box::into_raw(Box::new(Node {
            data,
            next: AtomicPtr::new(std::ptr::null_mut()),
        }));
        
        loop {
            let old_head = self.head.load(Ordering::Acquire);
            unsafe {
                (*new_node).next.store(old_head, Ordering::Relaxed);
            }
            
            // CAS: 如果 head 還是 old_head，就換成 new_node
            if self.head
                .compare_exchange(old_head, new_node, Ordering::Release, Ordering::Acquire)
                .is_ok()
            {
                break;
            }
        }
    }
}
}

12. 編譯器優化

Cargo.toml 設定

[profile.release]
opt-level = 3              # 最高優化等級
lto = "fat"               # Link-Time Optimization (跨檔案優化)
codegen-units = 1         # 單一編譯單元（更好的優化，但編譯較慢）
panic = "abort"           # 移除 panic unwinding 程式碼
strip = true              # 移除除錯符號
overflow-checks = false   # 移除整數溢位檢查（小心使用）
debug = false             # 不產生除錯資訊
rpath = false             # 不使用 rpath

# 針對所有相依套件也做最佳化
[profile.release.package."*"]
opt-level = 3

# 針對特定套件
[profile.release.package.serde]
opt-level = 3

# 開發時的 release 版本（較快編譯）
[profile.dev]
opt-level = 1

# 自訂 profile
[profile.production]
inherits = "release"
lto = "fat"
codegen-units = 1

編譯時的 RUSTFLAGS

# ============ 啟用 CPU 特定指令 ============
# 方法 1: 環境變數
RUSTFLAGS="-C target-cpu=native" cargo build --release

# 方法 2: .cargo/config.toml
# [build]
# rustflags = ["-C", "target-cpu=native"]

# ============ 更多優化標記 ============
RUSTFLAGS="-C target-cpu=native \
           -C opt-level=3 \
           -C lto=fat \
           -C embed-bitcode=yes \
           -C codegen-units=1" \
cargo build --release

# ============ PGO (Profile-Guided Optimization) ============
# 步驟 1: 產生 instrumented 版本
RUSTFLAGS="-Cprofile-generate=/tmp/pgo-data" \
cargo build --release

# 步驟 2: 執行程式收集 profile 資料
./target/release/your_app

# 步驟 3: 用 profile 資料重新編譯
RUSTFLAGS="-Cprofile-use=/tmp/pgo-data/merged.profdata" \
cargo build --release

# 步驟 4: 合併 profile 資料（如果有多個）
llvm-profdata merge -o /tmp/pgo-data/merged.profdata /tmp/pgo-data/*.profraw

內聯優化

#![allow(unused)]
fn main() {
// ============ 強制內聯 ============
#[inline(always)]
fn hot_function(x: u64) -> u64 {
    x.wrapping_mul(2).wrapping_add(1)
}

// ============ 禁止內聯（冷路徑）============
#[inline(never)]
fn cold_error_path(msg: &str) {
    eprintln!("Error: {}", msg);
}

// ============ 建議內聯（讓編譯器決定）============
#[inline]
fn might_inline(x: u32) -> u32 {
    x * 2
}

// ============ const fn（編譯期計算）============
const fn calculate_table_size(n: usize) -> usize {
    n * 2 + 1
}

const TABLE_SIZE: usize = calculate_table_size(100);
static TABLE: [u8; TABLE_SIZE] = [0; TABLE_SIZE];

// ============ 編譯期常數 ============
const TICK_SIZE: f64 = 0.01;
const MAX_PRICE: u32 = 1000000;

#[inline(always)]
fn price_to_tick(price: f64) -> u32 {
    (price / TICK_SIZE) as u32
}
}

屬性優化

#![allow(unused)]
fn main() {
// ============ 冷/熱路徑標記 ============
#[cold]
fn handle_error() {
    // 告訴編譯器這個函數很少執行
    panic!("Error occurred");
}

#[inline(always)]
fn fast_path() {
    // 熱路徑
}

// ============ 分支預測提示 ============
fn process(x: i32) {
    if x > 0 {
        // 熱路徑
        fast_path();
    } else {
        // 冷路徑
        handle_error();
    }
}

// ============ 避免 bounds checking ============
fn sum_array(arr: &[i32]) -> i32 {
    let mut sum = 0;
    
    // ❌ 每次都檢查邊界
    for i in 0..arr.len() {
        sum += arr[i];
    }
    
    // ✅ 迭代器，編譯器可能省略檢查
    for &val in arr {
        sum += val;
    }
    
    sum
}

// ✅ 明確告訴編譯器不用檢查
unsafe fn sum_unchecked(arr: &[i32]) -> i32 {
    let mut sum = 0;
    for i in 0..arr.len() {
        sum += arr.get_unchecked(i);  // 無邊界檢查
    }
    sum
}
}

13. 系統層級調校

完整系統設定腳本

#!/bin/bash
# hft_system_setup.sh - 高頻交易系統調校

set -e

echo "========== 高頻交易系統調校開始 =========="

# ============ 1. Huge Pages ============
echo "[1/10] 設定 Huge Pages..."
echo 1024 | sudo tee /proc/sys/vm/nr_hugepages
echo always | sudo tee /sys/kernel/mm/transparent_hugepage/enabled
echo always | sudo tee /sys/kernel/mm/transparent_hugepage/defrag

# ============ 2. CPU 性能模式 ============
echo "[2/10] 設定 CPU 為性能模式..."
for gov in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor; do
    echo performance | sudo tee $gov
done

# 關閉 CPU idle states (減少喚醒延遲)
for state in /sys/devices/system/cpu/cpu*/cpuidle/state*/disable; do
    echo 1 | sudo tee $state 2>/dev/null || true
done

# ============ 3. 關閉超執行緒（可選）============
echo "[3/10] 關閉超執行緒..."
echo off | sudo tee /sys/devices/system/cpu/smt/control

# ============ 4. 隔離 CPU 核心 ============
echo "[4/10] 設定 CPU 隔離（需要重啟）..."
GRUB_FILE="/etc/default/grub"
if grep -q "isolcpus" $GRUB_FILE; then
    echo "CPU 隔離已設定"
else
    sudo sed -i 's/GRUB_CMDLINE_LINUX=""/GRUB_CMDLINE_LINUX="isolcpus=2,3,4,5 nohz_full=2,3,4,5 rcu_nocbs=2,3,4,5"/' $GRUB_FILE
    sudo update-grub
    echo "已更新 GRUB，請重啟生效"
fi

# ============ 5. IRQ Affinity（中斷親和性）============
echo "[5/10] 設定網卡中斷親和性..."
# 把所有網卡中斷綁到 CPU 0,1
for irq in $(grep eth0 /proc/interrupts | cut -d: -f1); do
    echo 3 | sudo tee /proc/irq/$irq/smp_affinity 2>/dev/null || true
done

# ============ 6. 網路優化 ============
echo "[6/10] 網路參數優化..."
sudo sysctl -w net.core.rmem_max=134217728
sudo sysctl -w net.core.wmem_max=134217728
sudo sysctl -w net.ipv4.tcp_rmem="4096 87380 67108864"
sudo sysctl -w net.ipv4.tcp_wmem="4096 65536 67108864"
sudo sysctl -w net.core.netdev_max_backlog=5000
sudo sysctl -w net.ipv4.tcp_no_metrics_save=1
sudo sysctl -w net.ipv4.tcp_timestamps=0

# ============ 7. 記憶體優化 ============
echo "[7/10] 記憶體參數優化..."
sudo sysctl -w vm.swappiness=0              # 不要 swap
sudo sysctl -w vm.dirty_ratio=80            # dirty page 比例
sudo sysctl -w vm.dirty_background_ratio=5
sudo sysctl -w vm.dirty_expire_centisecs=12000

# ============ 8. 檔案系統優化 ============
echo "[8/10] 檔案系統參數..."
sudo sysctl -w fs.file-max=2097152
ulimit -n 1048576  # 最大檔案描述符

# ============ 9. Kernel 參數持久化 ============
echo "[9/10] 寫入 /etc/sysctl.conf..."
sudo tee -a /etc/sysctl.conf > /dev/null <<EOF

# HFT Optimizations
vm.nr_hugepages = 1024
vm.swappiness = 0
vm.dirty_ratio = 80
vm.dirty_background_ratio = 5

net.core.rmem_max = 134217728
net.core.wmem_max = 134217728
net.ipv4.tcp_rmem = 4096 87380 67108864
net.ipv4.tcp_wmem = 4096 65536 67108864
net.core.netdev_max_backlog = 5000
net.ipv4.tcp_no_metrics_save = 1

fs.file-max = 2097152
EOF

sudo sysctl -p

# ============ 10. 設定程式權限 ============
echo "[10/10] 設定程式權限..."
# 替換成你的程式路徑
APP_PATH="./target/release/trading_app"

if [ -f "$APP_PATH" ]; then
    sudo setcap cap_sys_nice,cap_ipc_lock,cap_net_raw+ep $APP_PATH
    echo "已設定 $APP_PATH 權限"
else
    echo "警告：找不到 $APP_PATH"
fi

echo "========== 系統調校完成 =========="
echo ""
echo "建議："
echo "1. 重啟系統讓 CPU 隔離生效"
echo "2. 檢查 Huge Pages: cat /proc/meminfo | grep Huge"
echo "3. 檢查 CPU 模式: cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor"
echo "4. 執行你的程式: $APP_PATH"

檢查與監控腳本

#!/bin/bash
# check_system.sh - 檢查系統狀態

echo "========== 系統狀態檢查 =========="

# Huge Pages
echo "[Huge Pages]"
cat /proc/meminfo | grep -i huge

# CPU 模式
echo -e "\n[CPU Governor]"
cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor

# CPU 隔離
echo -e "\n[CPU Isolation]"
cat /proc/cmdline | grep isolcpus

# 網路緩衝區
echo -e "\n[Network Buffers]"
sysctl net.core.rmem_max
sysctl net.core.wmem_max

# Swap
echo -e "\n[Swappiness]"
sysctl vm.swappiness

# 檔案限制
echo -e "\n[File Limits]"
ulimit -n

14. 完整範例專案

專案結構

hft-trading/
├── Cargo.toml
├── .cargo/
│   └── config.toml
├── src/
│   ├── main.rs
│   ├── trading.rs
│   ├── market_data.rs
│   └── order_book.rs
└── scripts/
    ├── setup_system.sh
    └── run.sh

Cargo.toml

[package]
name = "hft-trading"
version = "0.1.0"
edition = "2021"

[dependencies]
core_affinity = "0.8"
libc = "0.2"
crossbeam = "0.8"
parking_lot = "0.12"
rustc-hash = "1.1"

[profile.release]
opt-level = 3
lto = "fat"
codegen-units = 1
panic = "abort"
strip = true
overflow-checks = false

[profile.release.package."*"]
opt-level = 3

.cargo/config.toml

[build]
rustflags = ["-C", "target-cpu=native", "-C", "opt-level=3"]

[target.x86_64-unknown-linux-gnu]
rustflags = [
    "-C", "target-cpu=native",
    "-C", "link-arg=-fuse-ld=lld",  # 使用更快的連結器
]

src/main.rs

use core_affinity::{self, CoreId};
use std::thread;
use libc::{sched_param, sched_setscheduler, SCHED_FIFO, mlockall, MCL_CURRENT, MCL_FUTURE};

mod trading;
mod market_data;
mod order_book;

fn setup_realtime_thread(cpu: usize, priority: i32) -> Result<(), String> {
    // 1. CPU 綁定
    let core_ids = core_affinity::get_core_ids()
        .ok_or("無法獲取 CPU 核心")?;
    
    if cpu >= core_ids.len() {
        return Err(format!("CPU {} 不存在", cpu));
    }
    
    core_affinity::set_for_current(core_ids[cpu]);
    println!("執行緒綁定到 CPU {}", cpu);
    
    // 2. 設定 Real-Time 優先權
    unsafe {
        let param = sched_param {
            sched_priority: priority,
        };
        
        if sched_setscheduler(0, SCHED_FIFO, &param) != 0 {
            return Err("無法設定 RT 優先權（需要 CAP_SYS_NICE）".to_string());
        }
        println!("設定 RT 優先權: {}", priority);
        
        // 3. 鎖定記憶體
        if mlockall(MCL_CURRENT | MCL_FUTURE) != 0 {
            return Err("無法鎖定記憶體".to_string());
        }
        println!("記憶體已鎖定");
    }
    
    Ok(())
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
    println!("========== 高頻交易系統啟動 ==========");
    
    // 交易執行緒（CPU 2，高優先權）
    let trading_handle = thread::Builder::new()
        .name("trading".to_string())
        .spawn(|| {
            if let Err(e) = setup_realtime_thread(2, 85) {
                eprintln!("交易執行緒設定失敗: {}", e);
                return;
            }
            
            trading::run();
        })?;
    
    // 市場資料執行緒（CPU 3，次高優先權）
    let market_data_handle = thread::Builder::new()
        .name("market_data".to_string())
        .spawn(|| {
            if let Err(e) = setup_realtime_thread(3, 80) {
                eprintln!("市場資料執行緒設定失敗: {}", e);
                return;
            }
            
            market_data::run();
        })?;
    
    // 等待執行緒
    trading_handle.join().unwrap();
    market_data_handle.join().unwrap();
    
    Ok(())
}

src/order_book.rs

#![allow(unused)]
fn main() {
use rustc_hash::FxHashMap;
use std::sync::atomic::{AtomicU64, Ordering};

#[repr(C)]
#[derive(Clone, Copy)]
pub struct Order {
    pub id: u64,
    pub price: u32,      // 價格 * 10000
    pub quantity: u32,
    pub is_buy: bool,
}

// SoA 結構
pub struct OrderBook {
    // 買單
    buy_prices: Vec<u32>,
    buy_quantities: Vec<u32>,
    buy_ids: Vec<u64>,
    
    // 賣單
    sell_prices: Vec<u32>,
    sell_quantities: Vec<u32>,
    sell_ids: Vec<u64>,
    
    // 統計
    total_volume: AtomicU64,
}

impl OrderBook {
    pub fn new() -> Self {
        Self {
            buy_prices: Vec::with_capacity(10000),
            buy_quantities: Vec::with_capacity(10000),
            buy_ids: Vec::with_capacity(10000),
            
            sell_prices: Vec::with_capacity(10000),
            sell_quantities: Vec::with_capacity(10000),
            sell_ids: Vec::with_capacity(10000),
            
            total_volume: AtomicU64::new(0),
        }
    }
    
    #[inline(always)]
    pub fn add_buy_order(&mut self, order: Order) {
        self.buy_prices.push(order.price);
        self.buy_quantities.push(order.quantity);
        self.buy_ids.push(order.id);
        
        self.total_volume.fetch_add(order.quantity as u64, Ordering::Relaxed);
    }
    
    #[inline(always)]
    pub fn get_best_bid(&self) -> Option<u32> {
        self.buy_prices.first().copied()
    }
    
    #[inline(always)]
    pub fn get_best_ask(&self) -> Option<u32> {
        self.sell_prices.first().copied()
    }
}
}

scripts/run.sh

#!/bin/bash
# 編譯並執行

set -e

echo "編譯中..."
RUSTFLAGS="-C target-cpu=native" cargo build --release

echo "執行..."
sudo ./target/release/hft-trading

# 或使用 setcap
# sudo setcap cap_sys_nice,cap_ipc_lock+ep ./target/release/hft-trading
# ./target/release/hft-trading

15. 效能測試與監控

Benchmark 程式碼

use std::time::Instant;

fn benchmark<F: Fn()>(name: &str, iterations: usize, f: F) {
    // 預熱
    for _ in 0..100 {
        f();
    }
    
    let start = Instant::now();
    for _ in 0..iterations {
        f();
    }
    let elapsed = start.elapsed();
    
    let avg_ns = elapsed.as_nanos() / iterations as u128;
    println!("{}: 平均 {} ns/op ({} ops)", name, avg_ns, iterations);
}

fn main() {
    const ITERATIONS: usize = 10_000_000;
    
    // 比較除法 vs 位移
    benchmark("除法", ITERATIONS, || {
        let _ = 12345 / 8;
    });
    
    benchmark("位移", ITERATIONS, || {
        let _ = 12345 >> 3;
    });
    
    // 比較浮點除法 vs 乘法
    benchmark("浮點除法", ITERATIONS, || {
        let _ = 123.45 / 100.0;
    });
    
    benchmark("浮點乘法", ITERATIONS, || {
        let _ = 123.45 * 0.01;
    });
}

監控工具

#!/bin/bash
# monitor.sh - 即時監控

APP_PID=$(pgrep trading_app)

if [ -z "$APP_PID" ]; then
    echo "找不到程式"
    exit 1
fi

echo "監控 PID: $APP_PID"

# Context Switch
watch -n 1 "cat /proc/$APP_PID/status | grep -E 'ctxt|State'"

# CPU 使用率
htop -p $APP_PID

# 效能分析
# sudo perf record -p $APP_PID -g sleep 30
# sudo perf report

# Flamegraph
# cargo install flamegraph
# sudo flamegraph -p $APP_PID

延遲測試

#![allow(unused)]
fn main() {
use std::time::Instant;

fn measure_latency() {
    let mut latencies = Vec::with_capacity(1000000);
    
    for _ in 0..1000000 {
        let start = Instant::now();
        
        // 你的關鍵路徑
        process_order();
        
        latencies.push(start.elapsed().as_nanos());
    }
    
    latencies.sort();
    
    println!("Min: {} ns", latencies[0]);
    println!("P50: {} ns", latencies[latencies.len() / 2]);
    println!("P99: {} ns", latencies[latencies.len() * 99 / 100]);
    println!("P99.9: {} ns", latencies[latencies.len() * 999 / 1000]);
    println!("Max: {} ns", latencies[latencies.len() - 1]);
}
}

總結檢查清單

✅ 必做優化（影響最大）

CPU 綁定 - 避免 cache 失效
Real-Time Priority - 減少搶占
Lock-Free 資料結構 - 避免鎖競爭
Huge Pages - 減少 TLB miss
編譯優化 - target-cpu=native, lto=fat

⚡ 重要優化

位元運算 - 替換除法/模運算
查表法 - 預計算常用值
Cache Line 對齊 - 避免 False Sharing
SoA 資料結構 - 提升 cache 效率
SIMD - 批次處理

🔧 進階優化

PGO - Profile-Guided Optimization
無分支程式設計 - 提升分支預測
預熱 Cache - 啟動時預載資料
IRQ Affinity - 中斷不打擾關鍵 CPU

📊 監控指標

Latency: P50, P99, P99.9
Context Switch: 應接近 0
Cache Miss: 用 perf stat 監控
CPU 使用率: 關鍵核心應 100%

Keyboard shortcuts

Jason's Notes