Stash gen of LLAMA mlx

Joshua Coles 2025-02-12 17:07:06 +00:00
parent 39ac5a86dd
commit 4cd96b58b5
7 changed files with 593 additions and 8 deletions

Cargo.lock generated

@@ -145,6 +145,29 @@ version = "0.22.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"
[[package]]
name = "bindgen"
version = "0.69.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088"
dependencies = [
"bitflags",
"cexpr",
"clang-sys",
"itertools 0.12.1",
"lazy_static",
"lazycell",
"log",
"prettyplease",
"proc-macro2",
"quote",
"regex",
"rustc-hash",
"shlex",
"syn",
"which",
]
[[package]]
name = "bindgen"
version = "0.70.1"
@@ -195,6 +218,8 @@ version = "1.2.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c7777341816418c02e033934a09f20dc0ccaf65a5201ef8a450ae0105a573fda"
dependencies = [
"jobserver",
"libc",
"shlex",
]
@@ -311,6 +336,26 @@ version = "1.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"
[[package]]
name = "enumflags2"
version = "0.7.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba2f4b465f5318854c6f8dd686ede6c0a9dc67d4b1ac241cf0eb51521a309147"
dependencies = [
"enumflags2_derive",
]
[[package]]
name = "enumflags2_derive"
version = "0.7.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fc4caf64a58d7a6d65ab00639b046ff54399a39f5f2554728895ace4b297cd79"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "equivalent"
version = "1.0.1"
@@ -324,13 +369,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "33d852cb9b869c2a9b3df2f71a3074817f01e1844f839a144f5fcef059a4eb5d"
dependencies = [
"libc",
"windows-sys 0.52.0",
]
[[package]]
name = "exo-rs"
version = "0.1.0"
dependencies = [
"glob",
"llama-cpp-2",
"mlx-rs",
"network-interface",
"phf",
@@ -341,6 +388,7 @@ dependencies = [
"socket2",
"system-configuration",
"thiserror 2.0.11",
"tinygrad",
"tokio",
"tonic",
"tonic-build",
@@ -498,6 +546,15 @@ version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
[[package]]
name = "home"
version = "0.5.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "589533453244b0995c858700322199b2becb13b627df2851f64a2775d024abcf"
dependencies = [
"windows-sys 0.59.0",
]
[[package]]
name = "http"
version = "1.2.0"
@@ -623,6 +680,15 @@ dependencies = [
"hashbrown 0.15.2",
]
[[package]]
name = "itertools"
version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569"
dependencies = [
"either",
]
[[package]]
name = "itertools"
version = "0.13.0"
@@ -647,12 +713,27 @@ version = "1.0.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674"
[[package]]
name = "jobserver"
version = "0.1.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "48d1dbcbbeb6a7fec7e059840aa538bd62aaccf972c7346c4d9d2059312853d0"
dependencies = [
"libc",
]
[[package]]
name = "lazy_static"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
[[package]]
name = "lazycell"
version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
[[package]]
name = "libc"
version = "0.2.169"
@@ -675,6 +756,32 @@ version = "0.4.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab"
[[package]]
name = "llama-cpp-2"
version = "0.1.93"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "44818b6967a77379b8c8e105e2684d2bd2bca999ad24cfd806d8476a80c53255"
dependencies = [
"enumflags2",
"llama-cpp-sys-2",
"thiserror 1.0.69",
"tracing",
"tracing-core",
]
[[package]]
name = "llama-cpp-sys-2"
version = "0.1.93"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "92add8e8dabf941518dd573075721a6e0568db86c6557e176cec99bf56883c0a"
dependencies = [
"bindgen 0.69.5",
"cc",
"cmake",
"glob",
"walkdir",
]
[[package]]
name = "lock_api"
version = "0.4.12"
@@ -706,6 +813,16 @@ version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94"
[[package]]
name = "matrixmultiply"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9380b911e3e96d10c1f415da0876389aaf1b56759054eeb0de7df940c456ba1a"
dependencies = [
"autocfg",
"rawpointer",
]
[[package]]
name = "memchr"
version = "2.7.4"
@@ -741,7 +858,7 @@ checksum = "2886843bf800fba2e3377cff24abf6379b4c4d5c6681eaf9ea5b0d15090450bd"
dependencies = [
"libc",
"wasi 0.11.0+wasi-snapshot-preview1",
"windows-sys 0.52.0",
]
[[package]]
@@ -800,7 +917,7 @@ version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "af33a6b662998e5bb4099b1a191b4352fcb11d97706e82e4c8922fe200bb11f2"
dependencies = [
"bindgen 0.70.1",
"cc",
"cmake",
]
@@ -811,6 +928,19 @@ version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "defc4c55412d89136f966bbb339008b474350e5e6e78d2714439c386b3137a03"
[[package]]
name = "ndarray"
version = "0.15.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "adb12d4e967ec485a5f71c6311fe28158e9d6f4bc4a447b474184d0f91a8fa32"
dependencies = [
"matrixmultiply",
"num-complex",
"num-integer",
"num-traits",
"rawpointer",
]
[[package]]
name = "network-interface"
version = "2.0.0"
@@ -852,6 +982,15 @@ dependencies = [
"num-traits",
]
[[package]]
name = "num-integer"
version = "0.1.46"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f"
dependencies = [
"num-traits",
]
[[package]]
name = "num-traits"
version = "0.2.19"
@@ -1150,6 +1289,12 @@ dependencies = [
"getrandom 0.2.15",
]
[[package]]
name = "rawpointer"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3"
[[package]]
name = "redox_syscall"
version = "0.5.8"
@@ -1210,7 +1355,7 @@ dependencies = [
"errno",
"libc",
"linux-raw-sys",
"windows-sys 0.52.0",
]
[[package]]
@@ -1235,6 +1380,15 @@ dependencies = [
"serde_json",
]
[[package]]
name = "same-file"
version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502"
dependencies = [
"winapi-util",
]
[[package]]
name = "scopeguard"
version = "1.2.0"
@@ -1325,7 +1479,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c970269d99b64e60ec3bd6ad27270092a5394c4e309314b18ae3fe575695fbe8"
dependencies = [
"libc",
"windows-sys 0.52.0",
]
[[package]]
@@ -1405,7 +1559,7 @@ dependencies = [
"getrandom 0.3.1",
"once_cell",
"rustix",
"windows-sys 0.52.0",
]
[[package]]
@@ -1458,6 +1612,15 @@ dependencies = [
"once_cell",
]
[[package]]
name = "tinygrad"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bc8951e9ced45095eb3ad7342c4e74b038bda930833df79b7debb019bb653c18"
dependencies = [
"ndarray",
]
[[package]]
name = "tokio"
version = "1.43.0"
@@ -1473,7 +1636,7 @@ dependencies = [
"signal-hook-registry",
"socket2",
"tokio-macros",
"windows-sys 0.52.0",
]
[[package]]
@@ -1703,6 +1866,16 @@ version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65"
[[package]]
name = "walkdir"
version = "2.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b"
dependencies = [
"same-file",
"winapi-util",
]
[[package]]
name = "want"
version = "0.3.1"
@@ -1727,6 +1900,18 @@ dependencies = [
"wit-bindgen-rt",
]
[[package]]
name = "which"
version = "4.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7"
dependencies = [
"either",
"home",
"once_cell",
"rustix",
]
[[package]]
name = "winapi"
version = "0.3.9"
@@ -1743,6 +1928,15 @@ version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
[[package]]
name = "winapi-util"
version = "0.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb"
dependencies = [
"windows-sys 0.52.0",
]
[[package]]
name = "winapi-x86_64-pc-windows-gnu"
version = "0.4.0"
@@ -1758,6 +1952,15 @@ dependencies = [
"windows-targets",
]
[[package]]
name = "windows-sys"
version = "0.59.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
dependencies = [
"windows-targets",
]
[[package]]
name = "windows-targets"
version = "0.52.6"

Cargo.toml

@@ -19,6 +19,9 @@ uuid = { version = "1.13.1", features = ["v4"] }
regex = "1.11.1"
phf = { version = "0.11.3", features = ["macros"] }
mlx-rs = { version = "0.21.0", features = ["metal", "accelerate", "safetensors"] }
tinygrad = "0.1.0"
llama-cpp-2 = { version = "0.1.93", features = ["metal", "native", "sampler"] }
glob = "0.3.2"
[build-dependencies]
tonic-build = "0.12.3"

src/inference.rs Normal file

@@ -0,0 +1,19 @@
use std::collections::HashMap;
use crate::node_service::{InferenceState, Tensor};
use crate::Shard;
#[derive(Debug)]
pub struct InferenceEngine {
// Per-request inference state, keyed by request id. The stash left the
// value type as `_`, which does not compile; InferenceState is an assumption.
state_cache: HashMap<String, InferenceState>,
}
impl InferenceEngine {
pub(crate) fn infer_tensor(
&self,
request_id: String,
shard: Shard,
tensor: Option<Tensor>,
inference_state: Option<InferenceState>,
) -> Tensor {
todo!("run this shard's forward pass over the input tensor")
}
}
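
Note: since infer_tensor is still a stub, one observation on the cache it will drive — per-request state lookup fits HashMap's entry API, which also means the method will want &mut self rather than &self. A runnable toy sketch (ToyState is a hypothetical stand-in for the real InferenceState):

use std::collections::HashMap;

#[derive(Default, Debug)]
struct ToyState; // hypothetical stand-in for the real InferenceState

fn main() {
    let mut state_cache: HashMap<String, ToyState> = HashMap::new();
    // Look up or lazily create the state for a request id:
    let state = state_cache
        .entry("request-42".to_string())
        .or_insert_with(ToyState::default);
    println!("{state:?}");
}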

src/llama_module.rs Normal file

@@ -0,0 +1,226 @@
use mlx_rs::macros::ModuleParameters;
use mlx_rs::module::Module;
use mlx_rs::module::ModuleParameters;
use mlx_rs::nn;
use mlx_rs::Array;
// Define Shard struct to mirror Python dataclass
#[derive(Debug, Clone)]
pub struct Shard {
pub name: String,
pub start_layer: usize,
pub end_layer: usize,
}
impl Shard {
pub fn new(name: String, start_layer: usize, end_layer: usize) -> Self {
Shard {
name,
start_layer,
end_layer,
}
}
pub fn is_first_layer(&self) -> bool {
self.start_layer == 0
}
pub fn is_last_layer(&self) -> bool {
// Deciding this exactly needs the model's total layer count, which the
// shard does not carry yet (see the sketch after this impl). As a
// stopgap, treat an implausibly large inclusive end_layer as "last".
self.end_layer > 9999
}
}
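
The stopgap above becomes exact once the shard knows the total layer count; module_loading.rs already reads total_layers off crate::Shard. A minimal sketch, assuming that field is added here too (ShardExample and its field are illustrative, not part of the stash):

#[derive(Debug, Clone)]
pub struct ShardExample {
    pub name: String,
    pub start_layer: usize,
    pub end_layer: usize,    // inclusive
    pub total_layers: usize, // assumed new field, mirroring crate::Shard
}

impl ShardExample {
    pub fn is_last_layer(&self) -> bool {
        // Exact check: the shard is last when it contains the final layer index.
        self.end_layer == self.total_layers - 1
    }
}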
// Define ModelArgs struct to mirror Python dataclass ModelArgs
#[derive(Debug, Clone)]
pub struct ModelArgsRs {
pub vocab_size: i32,
pub hidden_size: i32,
pub num_hidden_layers: i32,
pub num_attention_heads: i32,
pub num_key_value_heads: i32,
pub rms_norm_eps: f32,
pub tie_word_embeddings: bool,
pub model_type: String, // Assuming model_type is a String
pub head_dim: Option<i32>, // Using Option to represent optional field
pub shard: Shard, // Using the Shard struct defined above
}
// Placeholder for TransformerBlock - You'll need to implement this in Rust
#[derive(Debug, Clone, ModuleParameters)]
pub struct TransformerBlock {
// Define the layers within TransformerBlock as needed, e.g., attention, norm, etc.
// For now, using a linear layer as a placeholder
pub linear: nn::Linear,
}
impl TransformerBlock {
pub fn new(dims: i32, mlp_dims: i32) -> Self {
Self {
linear: nn::Linear::new(dims, mlp_dims).unwrap(), // Example, adjust params
}
}
}
impl Module<&Array> for TransformerBlock {
type Output = Array;
type Error = mlx_rs::error::Exception;
fn forward(&mut self, input: &Array) -> Result<Self::Output, Self::Error> {
// Implement the forward pass logic for TransformerBlock
// For now, just passing through the linear layer
self.linear.forward(input)
}
fn training_mode(&mut self, _mode: bool) {}
}
// Define LlamaModel struct
#[derive(Debug, Clone, ModuleParameters)]
pub struct LlamaModelRs {
pub args: ModelArgsRs,
pub vocab_size: i32,
pub num_hidden_layers: i32,
pub embed_tokens: Option<nn::Embedding>, // Embedding layer is optional based on sharding
pub layers: Vec<TransformerBlock>, // Using placeholder TransformerBlock
pub norm: Option<nn::RmsNorm>, // RMSNorm layer is optional based on sharding
}
impl LlamaModelRs {
pub fn new(args: ModelArgsRs) -> Result<Self, mlx_rs::error::Exception> {
let vocab_size = args.vocab_size;
let num_hidden_layers = args.num_hidden_layers;
let mut embed_tokens = None;
if args.shard.is_first_layer() || (args.shard.is_last_layer() && args.tie_word_embeddings) {
embed_tokens = Some(nn::Embedding::new(args.vocab_size, args.hidden_size)?);
}
let mut layers = Vec::new();
for i in 0..num_hidden_layers as usize {
// shard bounds are usize while num_hidden_layers is i32, so index as usize
if args.shard.start_layer <= i && i <= args.shard.end_layer {
// Using placeholder dimensions for TransformerBlock, adjust as needed
layers.push(TransformerBlock::new(
args.hidden_size,
args.hidden_size * 4,
));
} else {
// The Python version pushes a no-op IdentityBlock here; until a Rust
// equivalent exists, a default TransformerBlock stands in (note: this
// is not actually a no-op and wastes parameters).
layers.push(TransformerBlock::new(
args.hidden_size,
args.hidden_size * 4,
));
}
}
let mut norm = None;
if args.shard.is_last_layer() {
norm = Some(nn::RmsNorm::new(args.hidden_size, args.rms_norm_eps)?);
}
Ok(Self {
args,
vocab_size,
num_hidden_layers,
embed_tokens,
layers,
norm,
})
}
}
impl Module<&Array> for LlamaModelRs {
type Output = Array;
type Error = mlx_rs::error::Exception;
fn forward(&mut self, inputs: &Array) -> Result<Self::Output, Self::Error> {
// `Module::forward` takes `&mut self` here, so optional submodules must be
// borrowed with `as_mut` rather than `as_ref`.
let mut h = if self.args.shard.is_first_layer() && self.embed_tokens.is_some() {
self.embed_tokens.as_mut().unwrap().forward(inputs)?
} else {
inputs.clone() // assume the input is already embedded past the first shard
};
// Mask creation would go here once create_attention_mask is ported to Rust:
// let mask = (h.ndim() > 1 && h.shape()[1] > 1).then(|| create_attention_mask(&h, &cache));
// Cache handling also still needs a fuller Rust implementation:
// let mut cache = cache.unwrap_or_else(|| vec![None; self.layers.len()]);
for layer in &mut self.layers {
h = layer.forward(&h)?; // pass mask and cache once implemented
}
if self.args.shard.is_last_layer() {
if let Some(norm) = self.norm.as_mut() {
h = norm.forward(&h)?;
}
}
Ok(h)
}
fn training_mode(&mut self, _mode: bool) {}
}
// Define Model struct
#[derive(Debug, Clone, ModuleParameters)]
pub struct ModelRs {
pub args: ModelArgsRs,
pub model_type: String,
pub model: LlamaModelRs,
pub lm_head: Option<nn::Linear>, // Linear layer for language model head, optional based on tie_word_embeddings
}
impl ModelRs {
pub fn new(args: ModelArgsRs) -> Result<Self, mlx_rs::error::Exception> {
let model = LlamaModelRs::new(args.clone())?; // Clone args for LlamaModel
let model_type = args.model_type.clone();
let mut lm_head = None;
if args.shard.is_last_layer() && !args.tie_word_embeddings {
lm_head = Some(nn::Linear::new(args.hidden_size, args.vocab_size)?);
}
Ok(Self {
args,
model_type,
model,
lm_head,
})
}
}
impl Module<&Array> for ModelRs {
type Output = Array;
type Error = mlx_rs::error::Exception;
fn forward(&mut self, inputs: &Array) -> Result<Self::Output, Self::Error> {
let mut out = self.model.forward(inputs)?;
if self.args.shard.is_last_layer() {
if self.args.tie_word_embeddings && self.model.embed_tokens.is_some() {
// The Python implementation projects `out` back through the embedding
// matrix via `embed_tokens.as_linear(out)`. A faithful port needs a
// matmul against the transposed embedding weight, which is not wired
// up yet, so fail loudly instead of returning the raw weight (which
// is what the previous placeholder did, incorrectly).
todo!("project hidden states through the tied embedding weights");
} else if let Some(lm_head) = self.lm_head.as_mut() {
out = lm_head.forward(&out)?;
}
}
Ok(out)
}
fn training_mode(&mut self, _mode: bool) {}
}
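
To make the intended wiring concrete, here is a hypothetical construction of the first shard of a toy model; every dimension below is made up and far smaller than a real LLaMA config, and the function name is illustrative only:

fn build_first_shard_example() -> Result<ModelRs, mlx_rs::error::Exception> {
    let args = ModelArgsRs {
        vocab_size: 32000,
        hidden_size: 64,
        num_hidden_layers: 8,
        num_attention_heads: 8,
        num_key_value_heads: 8,
        rms_norm_eps: 1e-5,
        tie_word_embeddings: true,
        model_type: "llama".to_string(),
        head_dim: None,
        // layers 0..=3 (inclusive): this shard embeds but applies no final norm
        shard: Shard::new("llama-toy".to_string(), 0, 3),
    };
    ModelRs::new(args)
}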

src/llama_test.rs Normal file

@@ -0,0 +1,73 @@
use llama_cpp_2::context::params::LlamaContextParams;
use llama_cpp_2::llama_backend::LlamaBackend;
use llama_cpp_2::llama_batch::LlamaBatch;
use llama_cpp_2::model::params::LlamaModelParams;
use llama_cpp_2::model::{AddBos, LlamaModel, Special};
use llama_cpp_2::sampling::LlamaSampler;
use std::io::Write; // for stdout().flush() in the decode loop
fn test() {
let model_path = std::env::args().nth(1).expect("Please specify model path");
let backend = LlamaBackend::init().unwrap();
let params = LlamaModelParams::default();
let prompt =
"<|im_start|>user\nHello! how are you?<|im_end|>\n<|im_start|>assistant\n".to_string();
let model =
LlamaModel::load_from_file(&backend, model_path, &params).expect("unable to load model");
let ctx_params = LlamaContextParams::default();
let mut ctx = model
.new_context(&backend, ctx_params)
.expect("unable to create the llama_context");
let tokens_list = model
.str_to_token(&prompt, AddBos::Always)
.unwrap_or_else(|_| panic!("failed to tokenize {prompt}"));
let n_len = 64;
// create a llama_batch with size 512
// we use this object to submit token data for decoding
let mut batch = LlamaBatch::new(512, 1);
let last_index = tokens_list.len() as i32 - 1;
for (i, token) in (0_i32..).zip(tokens_list.into_iter()) {
// llama_decode will output logits only for the last token of the prompt
let is_last = i == last_index;
batch.add(token, i, &[0], is_last).unwrap();
}
ctx.decode(&mut batch).expect("llama_decode() failed");
let mut n_cur = batch.n_tokens();
// Streaming UTF-8 decoder for token bytes (relies on the encoding_rs crate,
// which must be in Cargo.toml; it does not appear in the diff above).
let mut decoder = encoding_rs::UTF_8.new_decoder();
let mut sampler = LlamaSampler::greedy();
while n_cur <= n_len {
// sample the next token
{
let token = sampler.sample(&ctx, batch.n_tokens() - 1);
sampler.accept(token);
// is it an end of stream?
if token == model.token_eos() {
eprintln!();
break;
}
let output_bytes = model.token_to_bytes(token, Special::Tokenize).unwrap();
// use `Decoder.decode_to_string()` to avoid the intermediate buffer
let mut output_string = String::with_capacity(32);
let _decode_result = decoder.decode_to_string(&output_bytes, &mut output_string, false);
print!("{output_string}");
std::io::stdout().flush().unwrap();
batch.clear();
batch.add(token, n_cur, &[0], true).unwrap();
}
n_cur += 1;
ctx.decode(&mut batch).expect("failed to eval");
}
}
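
The loop above always samples at index n_tokens - 1, i.e. the logits of the most recently decoded token. If greedy decoding proves too repetitive, llama-cpp-2's sampler chain could be swapped in; a sketch, assuming chain_simple, temp, and dist exist at this crate version (verify against the crate docs):

// Hedged sketch: replace `LlamaSampler::greedy()` with a seeded temperature
// chain (constructor names assumed from llama-cpp-2's examples).
let mut sampler = LlamaSampler::chain_simple([
    LlamaSampler::temp(0.7),  // soften the distribution
    LlamaSampler::dist(1234), // then sample from it with a fixed seed
]);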

src/main.rs

@@ -1,15 +1,20 @@
mod device_capability_data;
mod discovery;
mod inference;
mod network;
mod orchestration;
mod partitioning;
mod topology;
mod llama_test;
mod module_loading;
mod llama_module;
use serde::{Deserialize, Serialize};
use serde_json::Value;
use tonic::{transport::Server, Request, Response, Status};
use crate::discovery::{NodeInfo, UdpDiscovery};
use crate::inference::InferenceEngine;
use crate::node_service::{
CollectTopologyRequest, Empty, ExampleRequest, HealthCheckRequest, HealthCheckResponse,
InferenceState, Loss, PromptRequest, SendOpaqueStatusRequest, SendResultRequest, Tensor,
@@ -30,6 +35,7 @@ struct Node {
node_info: NodeInfo,
current_topology: Topology,
udp_discovery: UdpDiscovery,
inference_engine: InferenceEngine,
}
impl Node {
@@ -45,7 +51,10 @@ impl Node {
.current_topology
.get_shard_for_node(base_shard, &self.node_info.node_id);
let result: Tensor = self
.inference_engine
.infer_tensor(request_id, shard, tensor, inference_state);
let result = self.process_inference_result(shard, result, request_id, inference_state);
result

src/module_loading.rs Normal file

@@ -0,0 +1,52 @@
use std::path::Path;
use crate::Shard;
fn load_config(
model_path: &Path,
) -> serde_json::Map<String, serde_json::Value> {
let config_path = model_path.join("config.json");
let model_index = model_path.join("model_index.json");
if config_path.exists() {
let config = std::fs::read_to_string(config_path).unwrap();
serde_json::from_str(&config).unwrap()
} else {
let model_index = std::fs::read_to_string(model_index).unwrap();
serde_json::from_str(&model_index).unwrap()
}
}
async fn load_model_shard(
model_path: &Path,
shard: Shard,
lazy: bool,
model_config: serde_json::Map<String, serde_json::Value>,
) {
let mut config = load_config(model_path);
config.extend(model_config.into_iter());
let model_name = model_path.file_name().unwrap().to_str().unwrap();
// serde_json::Map's IndexMut panics when the key is absent, so insert the
// shard entry explicitly instead of assigning through `config["shard"]`.
config.insert(
"shard".to_string(),
serde_json::json!({
"model_id": model_name,
"start_layer": shard.start_layer,
"end_layer": shard.end_layer,
"n_layers": shard.total_layers,
}),
);
let weight_files = glob::glob(model_path.join("model*.safetensors").to_str().unwrap())
.unwrap()
.collect::<Result<Vec<_>, _>>()
.unwrap();
let weight_files = weight_files
.iter()
.map(|path| path.file_name().unwrap().to_str().unwrap())
.collect::<Vec<_>>();
// TODO: deserialize each safetensors file into model weights (see the
// sketch after this file); `lazy` should gate eager materialisation.
let _ = lazy;
let _weights = weight_files.iter().map(|_file| {
// per-file tensor loading goes here
});
todo!();
}
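
For the unfinished weights step, a sketch of eager loading using the safetensors crate (an assumed extra dependency; mlx-rs's safetensors feature could serve instead). It returns raw bytes per tensor; converting dtype and shape into mlx Arrays is deliberately left out:

use std::collections::HashMap;
use std::path::Path;

// Hypothetical helper, not part of the stash.
fn load_raw_weights(model_path: &Path, weight_files: &[String]) -> HashMap<String, Vec<u8>> {
    let mut weights = HashMap::new();
    for file in weight_files {
        let bytes = std::fs::read(model_path.join(file)).expect("read safetensors file");
        let tensors = safetensors::SafeTensors::deserialize(&bytes).expect("parse safetensors");
        for (name, view) in tensors.tensors() {
            // Copy out the raw little-endian bytes for each named tensor.
            weights.insert(name, view.data().to_vec());
        }
    }
    weights
}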