# 对付顽固分子 git clone https://github.com/rofl0r/proxychains-ng.git && cd proxychains-ng ./configure --prefix=/usr --sysconfdir=/etc make sudo make install sudo make install-config
sudo vim /etc/proxychains.conf socks5 127.0.0.1 7897
CUDA 版本方面,可以看到上面的镜像默认使用了 CUDA 12.8.1 版本,实际上在 jtop 中可以看到本机为 L4T 36.5.0,刷机的 JetPack 版本为 6.2.2。去 catalog.ngc.nvidia.com 搜索 l4t-cuda 出来的结果是:nvcr.io/nvidia/l4t-cuda:12.6.11-runtime、nvcr.io/nvidia/12.6.11-devel:12.6.11-devel-aarch64-ubuntu22.04 ,需要把构建底座改成 Tegra 专用的,否则构建出来的镜像在运行 MoE 模型时,batch.n_token>32 就会出现 cuBLAS 报错:
1 2 3 4 5 6 7
ARG UBUNTU_VERSION=22.04 # This needs to generally match the container host's environment. ARG CUDA_VERSION=12.6.11 # Target the CUDA build image ARG BASE_CUDA_DEV_CONTAINER=nvcr.io/nvidia/12.6.11-devel:12.6.11-devel-aarch64-ubuntu22.04
为了物尽其用,每个 Orin AGX 64G 都耗尽了显存,我需要在第三个设备上搭一个模型汇聚网关,做成类似 OpenAI API 那样可以选择模型,并把请求分发到各个 endpoint 上的网关,Gemini 推荐使用 LiteLLM 的 docker 部署。就是个转发代理,还需要长期开机,就把这玩意放在 rock 5b 上了。
curl -X POST "http://localhost:4001/v1/chat/completions" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer sk-1234" \ -d '{ "model": "gemma-4-26B-A4B-it-UD-Q8_K_XL", "messages": [ {"role": "user", "content": "你好,请用一句话介绍一下你自己。"} ] }' {"id":"chatcmpl-5RXTGa6ZQ5aW57JGlsDzxOylaWptEVhE","created":1775988908,"model":"gemma-4-26B-A4B-it-UD-Q8_K_XL","object":"chat.completion","system_fingerprint":"b8764-8fe78a7ca","choices":[{"finish_reason":"stop","index":0,"message":{"content":"我是 Gemma 4,是由 Google DeepMind 开发的开源权重大型语言模型。","role":"assistant","reasoning_content":"* Language: Chinese (\"你好,请用一句话介绍一下你自己。\")\n * Content: \"Hello, please introduce yourself in one sentence.\"\n\n * Identity: Gemma 4.\n * Developer: Google DeepMind.\n * Nature: Large Language Model (LLM) with open weights.\n * Capabilities: Text and image processing (audio for 2B/4B), text-only output.\n * Knowledge Cutoff: January 2025.\n\n * *Draft 1 (Literal):* 我是 Gemma 4,由 Google DeepMind 开发的大型语言模型。 (I am Gemma 4, a large language model developed by Google DeepMind.)\n * *Draft 2 (More descriptive):* 我是 Gemma 4,由 Google DeepMind 开发的开源权 重的多模态大语言模型。 (I am Gemma 4, an open-weights multimodal LLM developed by Google DeepMind.)\n * *Refining for \"one sentence\" and \"identity\":* The prompt asks for an introduction in one sentence.\n\n * \"我是 Gemma 4,是由 Google DeepMind 开发的开源权重大型语言模型。\" (I am Gemma 4, an open-weights large language model developed by Google DeepMind.)\n\n * Identify as Gemma 4? Yes.\n * Mention Google DeepMind? Yes.\n * One sentence? 
Yes.\n\n \"我是 Gemma 4,是由 Google DeepMind 开发的开源权重大型语言模型。\"","provider_specific_fields":{"refusal":null,"reasoning_content":"* Language: Chinese (\"你好,请用一句话介绍一下你自己。\")\n * Content: \"Hello, please introduce yourself in one sentence.\"\n\n * Identity: Gemma 4.\n * Developer: Google DeepMind.\n * Nature: Large Language Model (LLM) with open weights.\n * Capabilities: Text and image processing (audio for 2B/4B), text-only output.\n * Knowledge Cutoff: January 2025.\n\n * *Draft 1 (Literal):* 我是 Gemma 4,由 Google DeepMind 开发的大型语言模型。 (I am Gemma 4, a large language model developed by Google DeepMind.)\n * *Draft 2 (More descriptive):* 我是 Gemma 4,由 Google DeepMind 开发的开源权重的多模态大 语言模型。 (I am Gemma 4, an open-weights multimodal LLM developed by Google DeepMind.)\n * *Refining for \"one sentence\" and \"identity\":* The prompt asks for an introduction in one sentence.\n\n * \"我是 Gemma 4,是由 Google DeepMind 开发的开源权重大型语言模型。\" (I am Gemma 4, an open-weights large language model developed by Google DeepMind.)\n\n * Identify as Gemma 4? Yes.\n * Mention Google DeepMind? Yes.\n * One sentence? Yes.\n\n \"我是 Gemma 4,是由 Google DeepMind 开发的开源权重大型语言模型。\""}},"provider_specific_fields":{}}],"usage":{"completion_tokens":353,"prompt_tokens":25,"total_tokens":378,"prompt_tokens_details":{"cached_tokens":0}},"timings":{"cache_n":0,"prompt_n":25,"prompt_ms":254.532,"prompt_per_token_ms":10.181280000000001,"prompt_per_second":98.21947731522953,"predicted_n":353,"predicted_ms":16617.949,"predicted_per_token_ms":47.076342776203965,"predicted_per_second":21.242091909176036}}%
测试qwen3.5反应:
1 2 3 4 5 6 7 8 9 10
curl -X POST "http://localhost:4001/v1/chat/completions" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer sk-1234" \ -d '{ "model": "Qwen3.5-35B-A3B-Q8_K_XL", "messages": [ {"role": "user", "content": "Hello, please write a very short python script to print hello world."} ] }' {"id":"chatcmpl-gCLU58UkI3h51hgucpWwucmTJ1yX7qox","created":1775988956,"model":"Qwen3.5-35B-A3B-Q8_K_XL","object":"chat.completion","system_fingerprint":"b8762-073bb2c20","choices":[{"finish_reason":"stop","index":0,"message":{"content":"```python\nprint(\"hello world\")\n```","role":"assistant","reasoning_content":"Thinking Process:\n\n1. **Analyze the Request:**\n * Task: Write a very short Python script.\n * Functionality: Print \"hello world\".\n * Constraint: \"very short\".\n\n2. **Determine the Solution:**\n * The standard way to print in Python is `print(\"hello world\")`.\n * This is already one line and very short.\n\n3. **Draft the Code:**\n ```python\n print(\"hello world\")\n ```\n\n4. **Review against Constraints:**\n * Is it Python? Yes.\n * Is it very short? Yes (one line).\n * Does it print \"hello world\"? Yes.\n\n5. **Final Output Generation:** (Just the code block).\n","provider_specific_fields":{"refusal":null,"reasoning_content":"Thinking Process:\n\n1. **Analyze the Request:**\n * Task: Write a very short Python script.\n * Functionality: Print \"hello world\".\n * Constraint: \"very short\".\n\n2. **Determine the Solution:**\n * The standard way to print in Python is `print(\"hello world\")`.\n * This is already one line and very short.\n\n3. **Draft the Code:**\n ```python\n print(\"hello world\")\n ```\n\n4. **Review against Constraints:**\n * Is it Python? Yes.\n * Is it very short? Yes (one line).\n * Does it print \"hello world\"? Yes.\n\n5. 
**Final Output Generation:** (Just the code block).\n"}},"provider_specific_fields":{}}],"usage":{"completion_tokens":188,"prompt_tokens":24,"total_tokens":212,"prompt_tokens_details":{"cached_tokens":0}},"timings":{"cache_n":0,"prompt_n":24,"prompt_ms":290.856,"prompt_per_token_ms":12.119,"prompt_per_second":82.5150589982672,"predicted_n":188,"predicted_ms":10157.401,"predicted_per_token_ms":54.02872872340426,"predicted_per_second":18.508671657247756}}%
sed -i 's|# image: ghcr.io/firecrawl/firecrawl|image: ghcr.io/firecrawl/firecrawl|' ./docker-compose.yaml sed -i 's| build: apps/api| # build: apps/api|' docker-compose.yaml sed -i 's|# image: ghcr.io/firecrawl/playwright-service:latest|image: ghcr.io/firecrawl/playwright-service:latest|' docker-compose.yaml sed -i 's| build: apps/playwright-service-ts| # build: apps/playwright-service-ts|' docker-compose.yaml
docker compose up -d
安装 Hermes
目前装在我电脑的 WSL2 里,有时间得弄到 o6n 上,这个服务需要长时间开机,我舍不得电费。
安装过程需要注意的是:
配置模型 API 时,填写上面 LiteLLM 的 URL http://192.168.1.120:4001/v1,接下来选择模型时可以输入 1,2,就像这样:
1 2 3 4 5 6 7 8 9 10
API base URL [e.g. https://api.example.com/v1]: http://192.168.1.120:4001/v1 API key [optional]: Verified endpoint via http://192.168.1.120:4001/v1/models (2 model(s) visible) Available models: 1. gemma-4-26B-A4B-it-UD-Q8_K_XL 2. Qwen3.5-35B-A3B-Q8_K_XL Select model [1-2] or type name: 1,2 Context length in tokens [leave blank for auto-detect]: Default model set to: 1,2 (via http://192.168.1.120:4001/v1) 💾 Saved to custom providers as "192.168.1.120:4001" (edit in config.yaml)
agx-flash@agx-flash:~$ lsusb Bus 002 Device 002: ID 0627:0001 Adomax Technology Co., Ltd QEMU USB Tablet Bus 002 Device 001: ID 1d6b:0002 Linux Foundation 2.0 root hub Bus 008 Device 001: ID 1d6b:0001 Linux Foundation 1.1 root hub Bus 007 Device 001: ID 1d6b:0001 Linux Foundation 1.1 root hub Bus 006 Device 001: ID 1d6b:0001 Linux Foundation 1.1 root hub Bus 009 Device 003: ID 0955:7023 NVIDIA Corp. APX Bus 009 Device 002: ID 2109:3431 VIA Labs, Inc. Hub Bus 009 Device 001: ID 1d6b:0002 Linux Foundation 2.0 root hub Bus 010 Device 001: ID 1d6b:0003 Linux Foundation 3.0 root hub Bus 001 Device 001: ID 1d6b:0002 Linux Foundation 2.0 root hub Bus 005 Device 001: ID 1d6b:0001 Linux Foundation 1.1 root hub Bus 004 Device 001: ID 1d6b:0001 Linux Foundation 1.1 root hub Bus 003 Device 001: ID 1d6b:0001 Linux Foundation 1.1 root hub