Commit b0d3eba1, authored by Graham King, committed by GitHub
Browse files

fix(dynamo-run): Network interface detection is Linux only (#133)

"netlink" doesn't exist on Mac. We only print the primary network interface to help with multi-node setup, which is unlikely to be used on a Mac anyway.
parent 3d9ade88
......@@ -1507,7 +1507,6 @@ dependencies = [
"dynamo-runtime",
"futures",
"futures-util",
"libc",
"netlink-packet-route",
"rtnetlink",
"serde",
......
......@@ -41,9 +41,6 @@ clap = { version = "4.5", features = ["derive", "env"] }
dialoguer = { version = "0.11", default-features = false, features = ["editor", "history"] }
futures = { version = "0.3" }
futures-util = "0.3"
libc = { version = "0.2" }
netlink-packet-route = { version = "0.19", optional = true }
rtnetlink = { version = "0.14", optional = true }
serde = { version = "1", features = ["derive"] }
serde_json = "1"
tokio = { version = "1", features = ["full"] }
......@@ -52,3 +49,7 @@ tracing = { version = "0.1" }
tracing-subscriber = { version = "0.3", features = ["env-filter", "local-time", "json"] }
dynamo-runtime = { path = "../../lib/runtime" }
dynamo-llm = { path = "../../lib/llm" }
# netlink is Linux-only. Gate on `target_os` rather than a single target triple
# so that every Linux target (aarch64, musl, ...) gets the crates that the
# `#[cfg(target_os = "linux")]` code depends on.
[target.'cfg(target_os = "linux")'.dependencies]
netlink-packet-route = { version = "0.19", optional = true }
rtnetlink = { version = "0.14", optional = true }
......@@ -13,38 +13,102 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use futures_util::TryStreamExt;
use netlink_packet_route::address::AddressAttribute;
use netlink_packet_route::link::LinkLayerType;
use netlink_packet_route::link::State as LinkState;
use netlink_packet_route::link::{LinkAttribute, LinkMessage};
use netlink_packet_route::AddressFamily;
use std::collections::HashSet;
use std::collections::VecDeque;
use std::{collections::HashMap, error::Error};
// Mac build uses none of this
#![allow(dead_code)]
/// Returns the name of the primary network interface, if one can be determined.
///
/// Linux implementation: delegates to the netlink-based lookup in the `unix`
/// module.
///
/// # Errors
///
/// Propagates the [`LinkDataError`] returned by `unix::get_primary_interface`.
#[cfg(target_os = "linux")]
pub async fn get_primary_interface() -> Result<Option<String>, LinkDataError> {
    unix::get_primary_interface().await
}

/// Returns the name of the primary network interface, if one can be determined.
///
/// Fallback for every non-Linux target (macOS, Windows, BSDs, ...): interface
/// detection is not implemented there, so this always yields `Ok(None)`.
// Gated on "not Linux" rather than "macOS" so the function exists on every
// target and callers compile everywhere.
#[cfg(not(target_os = "linux"))]
pub async fn get_primary_interface() -> Result<Option<String>, LinkDataError> {
    Ok(None)
}
/// Error returned when network interface link data cannot be retrieved.
///
/// `Display` prints a short message (naming the interface when one is
/// recorded); the underlying cause is available through
/// [`std::error::Error::source`].
#[derive(Debug)]
pub struct LinkDataError {
    /// What went wrong, together with the underlying error.
    kind: LinkDataErrorKind,
    /// Name of the interface the failure relates to, when known.
    interface: Option<String>,
}

impl LinkDataError {
    /// Wraps an I/O error as a `Connection` failure with no interface attached.
    fn connection(connection_error: std::io::Error) -> Self {
        Self {
            kind: LinkDataErrorKind::Connection(connection_error),
            interface: None,
        }
    }

    /// Wraps an rtnetlink error as a `Communication` failure with no interface
    /// attached.
    // rtnetlink is only a dependency on Linux, so this constructor must not be
    // compiled anywhere else.
    #[cfg(target_os = "linux")]
    fn communication(communication_error: rtnetlink::Error) -> Self {
        Self {
            kind: LinkDataErrorKind::Communication(communication_error),
            interface: None,
        }
    }
}

impl std::fmt::Display for LinkDataError {
    /// Writes "could not get interface link data", followed by
    /// " for <interface>" when an interface name is recorded.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let err_message = "could not get interface link data";
        if let Some(interface) = self.interface.as_ref() {
            write!(f, "{err_message} for {interface}")
        } else {
            write!(f, "{err_message}")
        }
    }
}

impl std::error::Error for LinkDataError {
    /// Exposes the wrapped I/O or rtnetlink error as the source of this error.
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match &self.kind {
            LinkDataErrorKind::Connection(e) => Some(e),
            #[cfg(target_os = "linux")]
            LinkDataErrorKind::Communication(e) => Some(e),
        }
    }
}

/// The category of a [`LinkDataError`], carrying the underlying error.
#[derive(Debug)]
pub enum LinkDataErrorKind {
    /// An I/O error, e.g. from opening the netlink connection.
    Connection(std::io::Error),
    /// An error reported by rtnetlink while executing a request.
    // Linux-only: the rtnetlink crate is not available on other targets.
    #[cfg(target_os = "linux")]
    Communication(rtnetlink::Error),
}
#[cfg(target_os = "linux")]
mod unix {
use futures_util::TryStreamExt;
use netlink_packet_route::address::AddressAttribute;
use netlink_packet_route::link::LinkLayerType;
use netlink_packet_route::link::State as LinkState;
use netlink_packet_route::link::{LinkAttribute, LinkMessage};
use netlink_packet_route::AddressFamily;
use std::collections::HashMap;
use std::collections::HashSet;
use std::collections::VecDeque;
/// Picks a primary network interface from the links reported by netlink.
///
/// A link qualifies when it is Ethernet, its state is up, it has a carrier,
/// and its name starts with `e`. Returns `Ok(None)` when nothing qualifies.
///
/// # Errors
///
/// Propagates the error from `get_ipv4_interface_links`.
pub async fn get_primary_interface() -> Result<Option<String>, super::LinkDataError> {
    let mut candidates: VecDeque<String> = get_ipv4_interface_links()
        .await?
        .into_iter()
        // A single pass over the predicate is enough; applying the same
        // filter twice selects exactly the same links.
        .filter(|(k, v)| {
            v.is_ethernet() && v.link_is_up() && v.has_carrier() && k.starts_with('e')
        })
        .map(|(k, _)| k)
        .collect();
    // The links come out of a HashMap, so when several interfaces qualify the
    // one returned is whichever the map happens to iterate first.
    Ok(candidates.pop_front())
}
}
#[derive(Clone, Debug)]
// Most of the fields are Option<T> because the netlink protocol allows them
// to be absent (even though we have no reason to believe they'd ever actually
// be missing).
struct InterfaceLinkData {
#[derive(Clone, Debug)]
// Most of the fields are Option<T> because the netlink protocol allows them
// to be absent (even though we have no reason to believe they'd ever actually
// be missing).
struct InterfaceLinkData {
link_type: LinkLayerType,
state: Option<LinkState>,
has_carrier: bool,
}
}
impl InterfaceLinkData {
impl InterfaceLinkData {
pub fn link_is_up(&self) -> bool {
self.state
.map(|state| matches!(state, LinkState::Up))
......@@ -58,9 +122,9 @@ impl InterfaceLinkData {
pub fn has_carrier(&self) -> bool {
self.has_carrier
}
}
}
impl From<LinkMessage> for InterfaceLinkData {
impl From<LinkMessage> for InterfaceLinkData {
fn from(link_message: LinkMessage) -> Self {
let link_type = link_message.header.link_layer_type;
let state = link_message
......@@ -84,61 +148,16 @@ impl From<LinkMessage> for InterfaceLinkData {
has_carrier,
}
}
}
/// Error returned when network interface link data cannot be retrieved.
#[derive(Debug)]
pub struct LinkDataError {
    // What went wrong, together with the underlying error.
    kind: LinkDataErrorKind,
    // Name of the interface the failure relates to, when known.
    interface: Option<String>,
}
impl LinkDataError {
    /// Builds a `Connection` error from an I/O failure; no interface is recorded.
    fn connection(connection_error: std::io::Error) -> Self {
        Self {
            kind: LinkDataErrorKind::Connection(connection_error),
            interface: None,
        }
    }

    /// Builds a `Communication` error from an rtnetlink failure; no interface
    /// is recorded.
    fn communication(communication_error: rtnetlink::Error) -> Self {
        Self {
            kind: LinkDataErrorKind::Communication(communication_error),
            interface: None,
        }
    }
}
impl std::fmt::Display for LinkDataError {
    /// Prints the generic failure message, suffixed with the interface name
    /// when one is recorded.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let err_message = "could not get interface link data";
        match self.interface.as_deref() {
            Some(interface) => write!(f, "{err_message} for {interface}"),
            None => write!(f, "{err_message}"),
        }
    }
}
impl Error for LinkDataError {
    /// Returns the wrapped I/O or rtnetlink error that caused this failure.
    fn source(&self) -> Option<&(dyn Error + 'static)> {
        let cause: &(dyn Error + 'static) = match &self.kind {
            LinkDataErrorKind::Connection(io_err) => io_err,
            LinkDataErrorKind::Communication(netlink_err) => netlink_err,
        };
        Some(cause)
    }
}
/// The category of a `LinkDataError`, carrying the underlying error.
#[derive(Debug)]
pub enum LinkDataErrorKind {
    // An I/O error, e.g. from opening the netlink connection.
    Connection(std::io::Error),
    // An error reported by rtnetlink while executing a request.
    Communication(rtnetlink::Error),
}
// Retrieve the link data (state, MTU, etc.) for all interfaces, and return
// them as a HashMap keyed by interface name. This is roughly equivalent to `ip
// link show` since we're using the same netlink interface under the hood as
// that command.
async fn get_ipv4_interface_links() -> Result<HashMap<String, InterfaceLinkData>, LinkDataError> {
// Retrieve the link data (state, MTU, etc.) for all interfaces, and return
// them as a HashMap keyed by interface name. This is roughly equivalent to `ip
// link show` since we're using the same netlink interface under the hood as
// that command.
async fn get_ipv4_interface_links(
) -> Result<HashMap<String, InterfaceLinkData>, super::LinkDataError> {
let (netlink_connection, rtnetlink_handle, _receiver) =
rtnetlink::new_connection().map_err(LinkDataError::connection)?;
rtnetlink::new_connection().map_err(super::LinkDataError::connection)?;
// We have to spawn off the netlink connection because of the architecture
// of `netlink_proto::Connection`, which runs in the background and owns
......@@ -164,7 +183,7 @@ async fn get_ipv4_interface_links() -> Result<HashMap<String, InterfaceLinkData>
})
.try_collect()
.await
.map_err(LinkDataError::communication)?;
.map_err(super::LinkDataError::communication)?;
let link_handle = rtnetlink_handle.link().get().execute();
link_handle
......@@ -189,10 +208,10 @@ async fn get_ipv4_interface_links() -> Result<HashMap<String, InterfaceLinkData>
})
.try_collect()
.await
.map_err(LinkDataError::communication)
}
.map_err(super::LinkDataError::communication)
}
fn extract_interface_name(link_message: &LinkMessage) -> Option<String> {
fn extract_interface_name(link_message: &LinkMessage) -> Option<String> {
link_message
.attributes
.iter()
......@@ -200,4 +219,5 @@ fn extract_interface_name(link_message: &LinkMessage) -> Option<String> {
LinkAttribute::IfName(name) => Some(name.clone()),
_ => None,
})
}
}
......@@ -339,7 +339,7 @@ async fn start_vllm(
let mut log_level = line_parts.next().unwrap_or_default();
// Skip date (0) and time (1). Print last (2) which is everything else.
let line = line_parts.nth(2).unwrap_or_default();
if line.starts_with("custom_op.py:68") {
if line.starts_with("custom_op.py:68") || line.trim().len() == 0 {
// Skip a noisy line
// custom_op.py:68] custom op <the op> enabled
continue;
......@@ -349,7 +349,7 @@ async fn start_vllm(
}
match log_level {
"DEBUG" => tracing::debug!("VLLM: {line}"),
"INFO" => tracing::debug!("VLLM: {line}"), // VLLM is noisy
"INFO" => tracing::debug!("VLLM: {line}"), // VLLM is noisy in debug mode
"WARNING" => tracing::warn!("VLLM: {line}"),
"ERROR" => tracing::error!("VLLM: {line}"),
level => tracing::info!("VLLM: {level} {line}"),
......@@ -359,6 +359,9 @@ async fn start_vllm(
tokio::spawn(async move {
let mut lines = stderr.lines();
while let Ok(Some(line)) = lines.next_line().await {
if line.trim().len() == 0 {
continue;
}
tracing::warn!("VLLM: {line}");
}
});
......@@ -399,7 +402,7 @@ async fn start_vllm(
.extract(py)
.unwrap()
});
tracing::info!("vllm zmq backend is ready: {resp:?}");
tracing::debug!("vllm zmq backend is ready: {resp:?}");
Ok(proc)
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment