add build automation and HNSW implementation guide
- Create automated build script with version extraction and Aliyun registry push
- Add comprehensive HNSW implementation guide with step-by-step instructions
- Update Dockerfile to use musl target and enc binary for deployment
- Include performance optimization strategies and debugging tips
This commit is contained in:
parent
e85eb8a9e8
commit
8b47403cc0
@ -12,7 +12,7 @@ COPY run.sh /home/admin/predict/run.sh
|
||||
RUN chmod 777 /home/admin/predict/run.sh
|
||||
|
||||
# Copy the compiled binary as 'test' executable
|
||||
COPY target/release/hfe_knn /home/admin/predict/test
|
||||
COPY target/x86_64-unknown-linux-musl/release/enc /home/admin/predict/test
|
||||
RUN chmod +x /home/admin/predict/test
|
||||
|
||||
# Copy training data
|
||||
|
143
HNSW_IMPLEMENTATION_GUIDE.md
Normal file
143
HNSW_IMPLEMENTATION_GUIDE.md
Normal file
@ -0,0 +1,143 @@
|
||||
# HNSW Search Layer 实现指南
|
||||
|
||||
## 目标
|
||||
实现标准的HNSW贪心搜索算法,但使用密文距离计算;搜索逻辑与明文版本保持一致,并尽量接近其搜索准确率(具体的耗时和准确率目标见“期望性能目标”一节)。
|
||||
|
||||
## 关键数据结构
|
||||
|
||||
### 输入参数
|
||||
- `query: &EncryptedQuery<T>` - 加密的查询点
|
||||
- `entry_points: Vec<usize>` - 入口点的节点索引列表
|
||||
- `ef: usize` - 搜索时的候选集大小
|
||||
- `layer: usize` - 当前搜索的层级
|
||||
- `zero: &T` - 加密的零值(用于距离计算)
|
||||
|
||||
### 内部数据结构建议
|
||||
```rust
|
||||
// 候选队列:存储待探索的节点
|
||||
let mut candidates: Vec<(usize, EncryptedNeighbor<T>)> = Vec::new();
|
||||
// 结果集:维护当前最好的ef个候选点
|
||||
let mut w: Vec<(usize, EncryptedNeighbor<T>)> = Vec::new();
|
||||
// 访问标记
|
||||
let mut visited: HashSet<usize> = HashSet::new();
|
||||
```
|
||||
|
||||
其中 `EncryptedNeighbor<T>` 结构已定义:
|
||||
```rust
|
||||
pub struct EncryptedNeighbor<T> {
|
||||
pub distance: T, // 密文距离
|
||||
pub index: FheUint8, // 密文索引
|
||||
}
|
||||
```
|
||||
|
||||
## 实现步骤
|
||||
|
||||
### Step 1: 初始化候选点
|
||||
```rust
|
||||
for &ep in &entry_points {
|
||||
if ep < self.nodes.len() && self.nodes[ep].level >= layer {
|
||||
visited.insert(ep);
|
||||
let distance = euclidean_distance(query, &self.nodes[ep].encrypted_point, zero);
|
||||
let neighbor = EncryptedNeighbor {
|
||||
distance,
|
||||
index: self.nodes[ep].encrypted_point.index.clone(),
|
||||
};
|
||||
candidates.push((ep, neighbor.clone()));
|
||||
w.push((ep, neighbor));
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Step 2: 主搜索循环
|
||||
```rust
|
||||
while !candidates.is_empty() {
|
||||
// 2.1 找到距离最小的候选点
|
||||
// 提示:需要对candidates中的EncryptedNeighbor按distance排序
|
||||
// 可以使用 encrypted_selection_sort 或其他方法
|
||||
|
||||
// 2.2 移除最小距离的候选点作为当前探索点
|
||||
let current = /* 从candidates中移除最小距离点的节点索引 */;
|
||||
|
||||
// 2.3 剪枝检查(可选:密文比较结果无法直接判断,跳过剪枝会让搜索更彻底但稍慢)
|
||||
// 如果w.len() >= ef 且 current的距离 > w中最远点的距离,则break
|
||||
|
||||
// 2.4 探索当前节点的邻居
|
||||
for &neighbor_idx in &self.nodes[current].neighbors[layer] {
|
||||
if !visited.contains(&neighbor_idx) && neighbor_idx < self.nodes.len() {
|
||||
visited.insert(neighbor_idx);
|
||||
let distance = euclidean_distance(query, &self.nodes[neighbor_idx].encrypted_point, zero);
|
||||
let encrypted_neighbor = EncryptedNeighbor {
|
||||
distance,
|
||||
index: self.nodes[neighbor_idx].encrypted_point.index.clone(),
|
||||
};
|
||||
|
||||
// 加入候选队列
|
||||
candidates.push((neighbor_idx, encrypted_neighbor.clone()));
|
||||
|
||||
// 管理结果集w
|
||||
if w.len() < ef {
|
||||
w.push((neighbor_idx, encrypted_neighbor));
|
||||
} else {
|
||||
// 结果集已满,需要替换最远的点
|
||||
w.push((neighbor_idx, encrypted_neighbor));
|
||||
// 排序w,只保留前ef个最近的点
|
||||
// 提示:可以先转换为Vec<EncryptedNeighbor>,排序后重建w
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Step 3: 返回结果
|
||||
```rust
|
||||
w.into_iter().map(|(node_idx, _)| node_idx).collect()
|
||||
```
|
||||
|
||||
## 性能优化建议
|
||||
|
||||
### 1. 减少密文排序次数
|
||||
- **问题**:每次排序都很昂贵(~2-3分钟)
|
||||
- **策略**:
|
||||
- 只在必要时排序(如候选队列管理、结果集维护)
|
||||
- 考虑批量处理而不是逐个比较
|
||||
- 可以适当牺牲一些算法精确性来换取性能
|
||||
|
||||
### 2. 候选队列管理
|
||||
- **明文版本**:使用BinaryHeap,O(log n)插入和删除
|
||||
- **密文版本**:只能用排序,O(n log n)
|
||||
- **优化**:考虑限制候选队列大小,避免无限增长
|
||||
|
||||
### 3. 剪枝策略
|
||||
- **理想**:`current_distance > farthest_w_distance && w.len() >= ef` 则停止
|
||||
- **现实**:密文比较结果无法直接判断
|
||||
- **权衡**:可以跳过复杂剪枝,让算法更彻底但稍慢
|
||||
|
||||
## 调试提示
|
||||
|
||||
### 1. 验证初始化
|
||||
确保entry_points正确初始化到candidates和w中
|
||||
|
||||
### 2. 验证邻居探索
|
||||
检查是否正确遍历`self.nodes[current].neighbors[layer]`
|
||||
|
||||
### 3. 验证visited逻辑
|
||||
确保不重复访问同一节点
|
||||
|
||||
### 4. 验证结果集管理
|
||||
确保w的大小不超过ef,且包含距离最近的点
|
||||
|
||||
## 期望性能目标
|
||||
|
||||
- **明文版本**:毫秒级
|
||||
- **密文版本目标**:15-20分钟(相比当前的100+分钟)
|
||||
- **准确率目标**:80%+(相比当前的30%)
|
||||
|
||||
## 可用的工具函数
|
||||
|
||||
- `euclidean_distance(query, point, zero)` - 计算密文欧几里得距离
|
||||
- `encrypted_selection_sort(distances, k)` - 密文选择排序,获取前k个最小值
|
||||
- `EncryptedNeighbor` - 包装距离和索引的结构体
|
||||
|
||||
## 明文版本参考
|
||||
|
||||
参考 `src/bin/plain.rs` 中的 `search_layer` 函数实现,理解标准HNSW算法的逻辑流程。
|
26
build.sh
Executable file
26
build.sh
Executable file
@ -0,0 +1,26 @@
|
||||
#!/bin/bash
# Build the statically linked `enc` binary, package it into a Docker image
# tagged with the crate version from Cargo.toml, and push that image to the
# Aliyun container registry.
#
# Must be run from the directory that contains Cargo.toml and the Dockerfile,
# because both are referenced by relative path.

# -e: abort on the first failing command.
# -u: treat use of an unset variable as an error.
# -o pipefail: a pipeline fails if any of its stages fails.
set -euo pipefail

# Read the project version.
# Only the FIRST column-0 `version = "..."` line is used, so that `version`
# keys in later tables (e.g. `[dependencies.foo]`) cannot turn VERSION into a
# multi-line value and corrupt the image tag.
# NOTE(review): assumes the [package] table is the first table in Cargo.toml
# that carries a `version` key - confirm for this repository.
VERSION=$(sed -n '/^version *=/{s/^version *= *"\([^"]*\)".*/\1/p;q;}' Cargo.toml)

# Fail fast instead of discovering an invalid tag after the long cargo build.
if [ -z "$VERSION" ]; then
    echo "❌ Could not determine version from Cargo.toml" >&2
    exit 1
fi
echo "📦 Building version: $VERSION"

# Aliyun container registry configuration
REGISTRY="crpi-2oj2gvvfz737vu2s.cn-guangzhou.personal.cr.aliyuncs.com"
NAMESPACE="sangge"
REPO_NAME="yinyu"  # original repository name
IMAGE_TAG="$REGISTRY/$NAMESPACE/$REPO_NAME:$VERSION"

echo "🏗️ Building Rust project..."
# Build the Rust project (musl target for static linking)
cargo build --release --target x86_64-unknown-linux-musl --bin enc

echo "🐳 Building Docker image: $IMAGE_TAG"
# Build the Docker image
docker build -t "$IMAGE_TAG" .

echo "📤 Pushing Docker image to Aliyun registry..."
# Push the image to Aliyun
docker push "$IMAGE_TAG"

echo "✅ Build and push completed!"
|
Loading…
x
Reference in New Issue
Block a user