diff --git a/Cargo.lock b/Cargo.lock
index 54fd630..55e086b 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -230,7 +230,9 @@ name = "exo-rs"
 version = "0.1.0"
 dependencies = [
  "network-interface",
+ "phf",
  "prost",
+ "regex",
  "serde",
  "serde_json",
  "socket2",
@@ -241,6 +243,7 @@ dependencies = [
  "tonic-build",
  "tracing",
  "tracing-subscriber",
+ "uuid",
 ]
 
 [[package]]
@@ -670,6 +673,48 @@ dependencies = [
  "indexmap 2.7.1",
 ]
 
+[[package]]
+name = "phf"
+version = "0.11.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078"
+dependencies = [
+ "phf_macros",
+ "phf_shared",
+]
+
+[[package]]
+name = "phf_generator"
+version = "0.11.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d"
+dependencies = [
+ "phf_shared",
+ "rand",
+]
+
+[[package]]
+name = "phf_macros"
+version = "0.11.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f84ac04429c13a7ff43785d75ad27569f2951ce0ffd30a3321230db2fc727216"
+dependencies = [
+ "phf_generator",
+ "phf_shared",
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "phf_shared"
+version = "0.11.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5"
+dependencies = [
+ "siphasher",
+]
+
 [[package]]
 name = "pin-project"
 version = "1.1.9"
@@ -952,6 +997,12 @@ dependencies = [
  "libc",
 ]
 
+[[package]]
+name = "siphasher"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d"
+
 [[package]]
 name = "slab"
 version = "0.4.9"
@@ -1292,6 +1343,15 @@ version = "1.0.16"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a210d160f08b701c8721ba1c726c11662f877ea6b7094007e1ca9a1041945034"
 
+[[package]]
+name = "uuid"
+version = "1.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ced87ca4be083373936a67f8de945faa23b6b42384bd5b64434850802c6dccd0"
+dependencies = [
+ "getrandom 0.3.1",
+]
+
 [[package]]
 name = "valuable"
 version = "0.1.1"
diff --git a/Cargo.toml b/Cargo.toml
index d737ec6..6025a36 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -15,6 +15,9 @@ tracing-subscriber = "0.3"
 socket2 = "0.5.8"
 system-configuration = "0.6.1"
 network-interface = "2.0.0"
+uuid = { version = "1.13.1", features = ["v4"] }
+regex = "1.11.1"
+phf = { version = "0.11.3", features = ["macros"] }
 
 [build-dependencies]
 tonic-build = "0.12.3"
diff --git a/src/device_capability_data.rs b/src/device_capability_data.rs
new file mode 100644
index 0000000..098b0a9
--- /dev/null
+++ b/src/device_capability_data.rs
@@ -0,0 +1,119 @@
+//! FLOPS lookup table for known Apple, NVIDIA and AMD chips.
+
+use crate::topology::DeviceFlops;
+
+use phf::phf_map;
+
+/// Multiplier applied to every figure in [`CHIP_FLOPS`] (presumably the values are teraFLOPS).
+const TFLOPS: f64 = 1.00;
+
+/// Compile-time map from an exact device name to its FLOPS figures.
+pub static CHIP_FLOPS: phf::Map<&'static str, DeviceFlops> = phf_map! {
+    // Source: https://www.cpu-monkey.com
+    // Note: currently no distinction between variants of M3 Max and M3 Pro, we pick the lower one to be conservative
+    // M chips
+    "Apple M1" => DeviceFlops { fp32: 2.29*TFLOPS, fp16: 4.58*TFLOPS, int8: 9.16*TFLOPS },
+    "Apple M1 Pro" => DeviceFlops { fp32: 5.30*TFLOPS, fp16: 10.60*TFLOPS, int8: 21.20*TFLOPS },
+    "Apple M1 Max" => DeviceFlops { fp32: 10.60*TFLOPS, fp16: 21.20*TFLOPS, int8: 42.40*TFLOPS },
+    "Apple M1 Ultra" => DeviceFlops { fp32: 21.20*TFLOPS, fp16: 42.40*TFLOPS, int8: 84.80*TFLOPS },
+    "Apple M2" => DeviceFlops { fp32: 3.55*TFLOPS, fp16: 7.10*TFLOPS, int8: 14.20*TFLOPS },
+    "Apple M2 Pro" => DeviceFlops { fp32: 5.68*TFLOPS, fp16: 11.36*TFLOPS, int8: 22.72*TFLOPS },
+    "Apple M2 Max" => DeviceFlops { fp32: 13.49*TFLOPS, fp16: 26.98*TFLOPS, int8: 53.96*TFLOPS },
+    "Apple M2 Ultra" => DeviceFlops { fp32: 26.98*TFLOPS, fp16: 53.96*TFLOPS, int8: 107.92*TFLOPS },
+    "Apple M3" => DeviceFlops { fp32: 3.55*TFLOPS, fp16: 7.10*TFLOPS, int8: 14.20*TFLOPS },
+    "Apple M3 Pro" => DeviceFlops { fp32: 4.97*TFLOPS, fp16: 9.94*TFLOPS, int8: 19.88*TFLOPS },
+    "Apple M3 Max" => DeviceFlops { fp32: 14.20*TFLOPS, fp16: 28.40*TFLOPS, int8: 56.80*TFLOPS },
+    "Apple M4" => DeviceFlops { fp32: 4.26*TFLOPS, fp16: 8.52*TFLOPS, int8: 17.04*TFLOPS },
+    "Apple M4 Pro" => DeviceFlops { fp32: 5.72*TFLOPS, fp16: 11.44*TFLOPS, int8: 22.88*TFLOPS },
+    "Apple M4 Max" => DeviceFlops { fp32: 18.03*TFLOPS, fp16: 36.07*TFLOPS, int8: 72.14*TFLOPS },
+    // A chips
+    "Apple A13 Bionic" => DeviceFlops { fp32: 0.69*TFLOPS, fp16: 1.38*TFLOPS, int8: 2.76*TFLOPS },
+    "Apple A14 Bionic" => DeviceFlops { fp32: 0.75*TFLOPS, fp16: 1.50*TFLOPS, int8: 3.00*TFLOPS },
+    "Apple A15 Bionic" => DeviceFlops { fp32: 1.37*TFLOPS, fp16: 2.74*TFLOPS, int8: 5.48*TFLOPS },
+    "Apple A16 Bionic" => DeviceFlops { fp32: 1.79*TFLOPS, fp16: 3.58*TFLOPS, int8: 7.16*TFLOPS },
+    "Apple A17 Pro" => DeviceFlops { fp32: 2.15*TFLOPS, fp16: 4.30*TFLOPS, int8: 8.60*TFLOPS },
+    // NVIDIA GPUs
+    // RTX 40 series
+    "NVIDIA GEFORCE RTX 4090" => DeviceFlops { fp32: 82.58*TFLOPS, fp16: 165.16*TFLOPS, int8: 330.32*TFLOPS },
+    "NVIDIA GEFORCE RTX 4080" => DeviceFlops { fp32: 48.74*TFLOPS, fp16: 97.48*TFLOPS, int8: 194.96*TFLOPS },
+    "NVIDIA GEFORCE RTX 4080 SUPER" => DeviceFlops { fp32: 52.0*TFLOPS, fp16: 104.0*TFLOPS, int8: 208.0*TFLOPS },
+    "NVIDIA GEFORCE RTX 4070 TI SUPER" => DeviceFlops { fp32: 40.0*TFLOPS, fp16: 80.0*TFLOPS, int8: 160.0*TFLOPS },
+    "NVIDIA GEFORCE RTX 4070 TI" => DeviceFlops { fp32: 39.43*TFLOPS, fp16: 78.86*TFLOPS, int8: 157.72*TFLOPS },
+    "NVIDIA GEFORCE RTX 4070 SUPER" => DeviceFlops { fp32: 30.0*TFLOPS, fp16: 60.0*TFLOPS, int8: 120.0*TFLOPS },
+    "NVIDIA GEFORCE RTX 4070" => DeviceFlops { fp32: 29.0*TFLOPS, fp16: 58.0*TFLOPS, int8: 116.0*TFLOPS },
+    "NVIDIA GEFORCE RTX 4060 TI 16GB" => DeviceFlops { fp32: 22.0*TFLOPS, fp16: 44.0*TFLOPS, int8: 88.0*TFLOPS },
+    "NVIDIA GEFORCE RTX 4060 TI" => DeviceFlops { fp32: 22.0*TFLOPS, fp16: 44.0*TFLOPS, int8: 88.0*TFLOPS },
+    // RTX 30 series
+    "NVIDIA GEFORCE RTX 3050" => DeviceFlops { fp32: 9.11*TFLOPS, fp16: 18.22*TFLOPS, int8: 36.44*TFLOPS },
+    "NVIDIA GEFORCE RTX 3060" => DeviceFlops { fp32: 13.0*TFLOPS, fp16: 26.0*TFLOPS, int8: 52.0*TFLOPS },
+    "NVIDIA GEFORCE RTX 3060 TI" => DeviceFlops { fp32: 16.2*TFLOPS, fp16: 32.4*TFLOPS, int8: 64.8*TFLOPS },
+    "NVIDIA GEFORCE RTX 3070" => DeviceFlops { fp32: 20.3*TFLOPS, fp16: 40.6*TFLOPS, int8: 81.2*TFLOPS },
+    "NVIDIA GEFORCE RTX 3070 TI" => DeviceFlops { fp32: 21.8*TFLOPS, fp16: 43.6*TFLOPS, int8: 87.2*TFLOPS },
+    "NVIDIA GEFORCE RTX 3080 (10 GB)" => DeviceFlops { fp32: 29.8*TFLOPS, fp16: 59.6*TFLOPS, int8: 119.2*TFLOPS },
+    "NVIDIA GEFORCE RTX 3080 (12 GB)" => DeviceFlops { fp32: 30.6*TFLOPS, fp16: 61.2*TFLOPS, int8: 122.4*TFLOPS },
+    "NVIDIA GEFORCE RTX 3080 TI" => DeviceFlops { fp32: 34.1*TFLOPS, fp16: 68.2*TFLOPS, int8: 136.4*TFLOPS },
+    "NVIDIA GEFORCE RTX 3090" => DeviceFlops { fp32: 35.6*TFLOPS, fp16: 71.2*TFLOPS, int8: 142.4*TFLOPS },
+    "NVIDIA GEFORCE RTX 3090 TI" => DeviceFlops { fp32: 40.0*TFLOPS, fp16: 80.0*TFLOPS, int8: 160.0*TFLOPS },
+    // RTX 20 series
+    "NVIDIA GEFORCE RTX 2060" => DeviceFlops { fp32: 6.45*TFLOPS, fp16: 12.9*TFLOPS, int8: 25.8*TFLOPS },
+    "NVIDIA GEFORCE RTX 2060 SUPER" => DeviceFlops { fp32: 7.2*TFLOPS, fp16: 14.4*TFLOPS, int8: 28.8*TFLOPS },
+    "NVIDIA GEFORCE RTX 2070" => DeviceFlops { fp32: 7.46*TFLOPS, fp16: 14.93*TFLOPS, int8: 29.86*TFLOPS },
+    "NVIDIA GEFORCE RTX 2070 SUPER" => DeviceFlops { fp32: 9.06*TFLOPS, fp16: 18.12*TFLOPS, int8: 36.24*TFLOPS },
+    "NVIDIA GEFORCE RTX 2080" => DeviceFlops { fp32: 10.07*TFLOPS, fp16: 20.14*TFLOPS, int8: 40.28*TFLOPS },
+    "NVIDIA GEFORCE RTX 2080 TI" => DeviceFlops { fp32: 13.45*TFLOPS, fp16: 26.9*TFLOPS, int8: 53.8*TFLOPS },
+    "NVIDIA GEFORCE RTX 2080 SUPER" => DeviceFlops { fp32: 11.15*TFLOPS, fp16: 22.30*TFLOPS, int8: 44.60*TFLOPS },
+    "NVIDIA TITAN RTX" => DeviceFlops { fp32: 16.31*TFLOPS, fp16: 32.62*TFLOPS, int8: 65.24*TFLOPS },
+    // GTX 10 series
+    "NVIDIA GEFORCE GTX 1050 TI" => DeviceFlops { fp32: 2.0*TFLOPS, fp16: 4.0*TFLOPS, int8: 8.0*TFLOPS },
+    "NVIDIA GEFORCE GTX 1070" => DeviceFlops { fp32: 6.463*TFLOPS, fp16: 0.101*TFLOPS, int8: 25.852*TFLOPS },
+    "NVIDIA GEFORCE GTX 1080" => DeviceFlops { fp32: 8.873*TFLOPS, fp16: 0.138*TFLOPS, int8: 35.492*TFLOPS },
+    "NVIDIA GEFORCE GTX 1080 TI" => DeviceFlops { fp32: 11.34*TFLOPS, fp16: 0.177*TFLOPS, int8: 45.36*TFLOPS },
+    // GTX 16 series
+    "NVIDIA GEFORCE GTX 1660 TI" => DeviceFlops { fp32: 4.8*TFLOPS, fp16: 9.6*TFLOPS, int8: 19.2*TFLOPS },
+    // QUADRO RTX Ampere series
+    "NVIDIA RTX A2000" => DeviceFlops { fp32: 7.99*TFLOPS, fp16: 7.99*TFLOPS, int8: 31.91*TFLOPS },
+    "NVIDIA RTX A4000" => DeviceFlops { fp32: 19.17*TFLOPS, fp16: 19.17*TFLOPS, int8: 76.68*TFLOPS },
+    "NVIDIA RTX A4500" => DeviceFlops { fp32: 23.65*TFLOPS, fp16: 23.65*TFLOPS, int8: 94.6*TFLOPS },
+    "NVIDIA RTX A5000" => DeviceFlops { fp32: 27.8*TFLOPS, fp16: 27.8*TFLOPS, int8: 111.2*TFLOPS },
+    "NVIDIA RTX A6000" => DeviceFlops { fp32: 38.71*TFLOPS, fp16: 38.71*TFLOPS, int8: 154.84*TFLOPS },
+    // NVIDIA Ada Lovelace Architecture-Based
+    "NVIDIA RTX 4000 ADA GENERATION" => DeviceFlops { fp32: 26.7*TFLOPS, fp16: 26.7*TFLOPS, int8: 258.0*TFLOPS },
+    // Common Server GPUs
+    "NVIDIA A40 48GB PCIE" => DeviceFlops { fp32: 37.4*TFLOPS, fp16: 149.7*TFLOPS, int8: 299.3*TFLOPS },
+    "NVIDIA A100 40GB PCIE" => DeviceFlops { fp32: 19.5*TFLOPS, fp16: 312.0*TFLOPS, int8: 624.0*TFLOPS },
+    "NVIDIA A800 40GB PCIE" => DeviceFlops { fp32: 19.5*TFLOPS, fp16: 312.0*TFLOPS, int8: 624.0*TFLOPS },
+    "NVIDIA A100 80GB PCIE" => DeviceFlops { fp32: 19.5*TFLOPS, fp16: 312.0*TFLOPS, int8: 624.0*TFLOPS },
+    "NVIDIA A800 80GB PCIE" => DeviceFlops { fp32: 19.5*TFLOPS, fp16: 312.0*TFLOPS, int8: 624.0*TFLOPS },
+    "NVIDIA A100 80GB SXM" => DeviceFlops { fp32: 19.5*TFLOPS, fp16: 312.0*TFLOPS, int8: 624.0*TFLOPS },
+    "NVIDIA A800 80GB SXM" => DeviceFlops { fp32: 19.5*TFLOPS, fp16: 312.0*TFLOPS, int8: 624.0*TFLOPS },
+    // AMD GPUs
+    // RX 6000 series
+    "AMD Radeon RX 6900 XT" => DeviceFlops { fp32: 23.04*TFLOPS, fp16: 46.08*TFLOPS, int8: 92.16*TFLOPS },
+    "AMD Radeon RX 6800 XT" => DeviceFlops { fp32: 20.74*TFLOPS, fp16: 41.48*TFLOPS, int8: 82.96*TFLOPS },
+    "AMD Radeon RX 6800" => DeviceFlops { fp32: 16.17*TFLOPS, fp16: 32.34*TFLOPS, int8: 64.68*TFLOPS },
+    "AMD Radeon RX 6700 XT" => DeviceFlops { fp32: 13.21*TFLOPS, fp16: 26.42*TFLOPS, int8: 52.84*TFLOPS },
+    "AMD Radeon RX 6700" => DeviceFlops { fp32: 11.4*TFLOPS, fp16: 22.8*TFLOPS, int8: 45.6*TFLOPS },
+    "AMD Radeon RX 6600 XT" => DeviceFlops { fp32: 10.6*TFLOPS, fp16: 21.2*TFLOPS, int8: 42.4*TFLOPS },
+    "AMD Radeon RX 6600" => DeviceFlops { fp32: 8.93*TFLOPS, fp16: 17.86*TFLOPS, int8: 35.72*TFLOPS },
+    "AMD Radeon RX 6500 XT" => DeviceFlops { fp32: 5.77*TFLOPS, fp16: 11.54*TFLOPS, int8: 23.08*TFLOPS },
+    "AMD Radeon RX 6400" => DeviceFlops { fp32: 3.57*TFLOPS, fp16: 7.14*TFLOPS, int8: 14.28*TFLOPS },
+    // RX 7000 series
+    "AMD Radeon RX 7900 XTX" => DeviceFlops { fp32: 61.4*TFLOPS, fp16: 122.8*TFLOPS, int8: 245.6*TFLOPS },
+    "AMD Radeon RX 7900 XT" => DeviceFlops { fp32: 53.4*TFLOPS, fp16: 106.8*TFLOPS, int8: 213.6*TFLOPS },
+    "AMD Radeon RX 7800 XT" => DeviceFlops { fp32: 42.6*TFLOPS, fp16: 85.2*TFLOPS, int8: 170.4*TFLOPS },
+    "AMD Radeon RX 7700 XT" => DeviceFlops { fp32: 34.2*TFLOPS, fp16: 68.4*TFLOPS, int8: 136.8*TFLOPS },
+    "AMD Radeon RX 7600" => DeviceFlops { fp32: 21.5*TFLOPS, fp16: 43.0*TFLOPS, int8: 86.0*TFLOPS },
+    "AMD Radeon RX 7500" => DeviceFlops { fp32: 16.2*TFLOPS, fp16: 32.4*TFLOPS, int8: 64.8*TFLOPS },
+};
+
+/// Returns the FLOPS entry for `chip`, or `None` when no entry in [`CHIP_FLOPS`] matches.
+///
+/// The exact name is tried first. If that misses, a leading `"Laptop GPU "` or a
+/// trailing `" Laptop GPU"` is stripped from `chip` and the remainder is looked up,
+/// because the table itself has no keys containing "Laptop GPU".
+pub fn look_up(chip: &str) -> Option<DeviceFlops> {
+    CHIP_FLOPS.get(chip)
+        .or_else(|| chip.strip_prefix("Laptop GPU ").and_then(|name| CHIP_FLOPS.get(name)))
+        .or_else(|| chip.strip_suffix(" Laptop GPU").and_then(|name| CHIP_FLOPS.get(name)))
+        .cloned()
+}
diff --git a/src/discovery/mod.rs b/src/discovery/mod.rs
index 12b27c3..7a4c677 100644
--- a/src/discovery/mod.rs
+++ b/src/discovery/mod.rs
@@ -6,6 +6,7 @@ use std::net::SocketAddr;
 use std::time::Duration;
 use tokio::net::UdpSocket;
 use tokio::task::JoinHandle;
+use uuid::Uuid;
 
 mod broadcast;
 mod udp_listen;
@@ -47,6 +48,25 @@ pub struct NodeInfo {
     pub device_capabilities: DeviceCapabilities,
 }
 
+impl Default for NodeInfo {
+    /// Creates a node with a freshly generated v4 UUID as its id, all ports set to 0,
+    /// default-valued interval and timeout, no peer or interface filters, and
+    /// capabilities obtained from `DeviceCapabilities::determine()`.
+    fn default() -> Self {
+        NodeInfo {
+            node_id: Uuid::new_v4().to_string(),
+            discovery_listen_port: 0,
+            broadcast_port: 0,
+            broadcast_interval: Default::default(),
+            grpc_port: 0,
+            allowed_peer_ids: None,
+            allowed_interfaces: None,
+            discovery_timeout: Default::default(),
+            device_capabilities: DeviceCapabilities::determine(),
+        }
+    }
+}
+
 pub struct UdpDiscovery {
     node_info: NodeInfo,
     discovery_handle: JoinHandle<()>,
diff --git a/src/main.rs b/src/main.rs
index 4804b32..f7bb9f0 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -2,6 +2,7 @@ mod topology;
 mod orchestration;
 mod discovery;
 mod network;
+mod device_capability_data;
 
 use serde::{Deserialize, Serialize};
 use serde_json::Value;
@@ -14,6 +15,7 @@ use crate::node_service::{
 use node_service::node_service_server::{NodeService, NodeServiceServer};
 use node_service::TensorRequest;
 use topology::Topology;
+use crate::discovery::{NodeInfo, UdpDiscovery};
 
 pub mod node_service {
     tonic::include_proto!("node_service"); // The string specified here must match the proto package name
@@ -193,6 +195,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
     let grpc_addr = "[::1]:50051".parse()?;
 
     let node = Node::default();
+    let udp_discovery = UdpDiscovery::new(NodeInfo::default());
 
     // TODO: Also implement discovery
 
diff --git a/src/topology.rs b/src/topology.rs
index 2f5fe66..de5e901 100644
--- a/src/topology.rs
+++ b/src/topology.rs
@@ -1,5 +1,7 @@
 use std::collections::HashMap;
+use std::process::Command;
 use serde::{Deserialize, Serialize};
+use crate::device_capability_data;
 
 #[derive(Debug, Deserialize, Serialize, Clone)]
 pub struct Topology {
@@ -27,10 +29,86 @@ pub struct DeviceCapabilities {
 }
 
 #[derive(Debug, Deserialize, Serialize, Clone)]
+/// Deserialized top level of `system_profiler SPHardwareDataType -json` output.
+struct SystemProfilerOutputData {
+    #[serde(rename = "SPHardwareDataType")]
+    hardware: Vec<SPHardwareDataType>
+}
+
+/// One hardware entry; every field is read from the JSON as a string.
+#[derive(Debug, Deserialize, Serialize, Clone)]
+struct SPHardwareDataType {
+    #[serde(rename = "_name")]
+    name: String,
+    activation_lock_status: String,
+    boot_rom_version: String,
+    chip_type: String,
+    machine_model: String,
+    machine_name: String,
+    model_number: String,
+    number_processors: String,
+    os_loader_version: String,
+    physical_memory: String,
+    #[serde(rename = "platform_UUID")]
+    platform_uuid: String,
+    #[serde(rename = "provisioning_UDID")]
+    provisioning_udid: String,
+    serial_number: String
+}
+
+impl DeviceCapabilities {
+    /// Builds the capabilities of this machine from `system_profiler` output.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `system_profiler` cannot be run, if its output is not the expected
+    /// JSON, or if the output contains no hardware entry.
+    pub fn determine() -> DeviceCapabilities {
+        let s = Command::new("system_profiler")
+            .arg("SPHardwareDataType")
+            .arg("-json")
+            .output()
+            .expect("failed to run system_profiler")
+            .stdout;
+
+        let data = serde_json::from_slice::<SystemProfilerOutputData>(&s)
+            .expect("failed to parse system_profiler output");
+        // Take the first entry by value; an empty list is reported instead of an index panic.
+        let hardware = data.hardware.into_iter().next()
+            .expect("system_profiler returned no hardware entries");
+
+        let model = hardware.machine_name;
+        let chip = hardware.chip_type;
+        // `physical_memory` is split into "<number> <unit>"; a "GB" value is multiplied by 1024.
+        let memory = {
+            let parts: Vec<&str> = hardware.physical_memory.split_ascii_whitespace().collect();
+            if parts.len() >= 2 {
+                let value = parts[0].parse::<u64>().unwrap_or(0);
+                if parts[1] == "GB" {
+                    value * 1024
+                } else {
+                    value
+                }
+            } else {
+                0
+            }
+        };
+
+        DeviceCapabilities {
+            // A chip missing from the table gets default (zero) FLOPS instead of aborting startup.
+            flops: device_capability_data::look_up(&chip).unwrap_or_default(),
+            model,
+            chip,
+            memory,
+        }
+    }
+}
+
+#[derive(Debug, Deserialize, Serialize, Clone, Default)]
 pub struct DeviceFlops {
-    fp32: u64,
-    fp16: u64,
-    int8: u64,
+    pub fp32: f64,
+    pub fp16: f64,
+    pub int8: f64,
 }
 
 #[derive(Debug, Deserialize, Serialize, Clone, Hash, Eq, PartialEq)]