Unverified commit 794c0a44, authored by Graham King, committed by GitHub
Browse files

feat(keyvalue): Filesystem backed KeyValueStore (#4138)


Signed-off-by: Graham King <grahamk@nvidia.com>
parent 3fd0ab3d
......@@ -54,6 +54,10 @@ impl KeyValueStore for EtcdStore {
/// Returns the lease id of the underlying etcd client as this store's connection id.
fn connection_id(&self) -> u64 {
self.client.lease_id()
}
/// Shutdown hook; currently does nothing. The lease is not revoked here
/// (see the note in the body).
fn shutdown(&self) {
// Revoke the lease? etcd will do it for us on disconnect.
}
}
pub struct EtcdBucket {
......@@ -132,13 +136,13 @@ impl KeyValueBucket for EtcdBucket {
continue;
}
};
let item = KeyValue::new(key, v_bytes.into());
match e.event_type() {
EventType::Put => {
let item = KeyValue::new(key, v_bytes.into());
yield WatchEvent::Put(item);
}
EventType::Delete => {
yield WatchEvent::Delete(item);
yield WatchEvent::Delete(Key::from_raw(key));
}
}
}
......
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
use std::collections::HashSet;
use std::ffi::OsString;
use std::fmt;
use std::fs;
use std::os::unix::ffi::OsStrExt;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::time::Duration;
use std::{collections::HashMap, pin::Pin};
use anyhow::Context as _;
use async_trait::async_trait;
use futures::StreamExt;
use inotify::{Event, EventMask, EventStream, Inotify, WatchMask};
use parking_lot::Mutex;
use crate::storage::key_value_store::KeyValue;
use super::{Key, KeyValueBucket, KeyValueStore, StoreError, StoreOutcome, WatchEvent};
/// Key-value store backed by the filesystem: each bucket is a directory
/// located at `root.join(bucket_name)`.
///
/// Treat as a singleton
#[derive(Clone)]
pub struct FileStore {
/// Base directory that bucket names are joined onto.
root: PathBuf,
/// Random id chosen when the store is constructed; returned by `connection_id()`.
connection_id: u64,
/// Directories we may have created files in, for shutdown cleanup
/// Arc so that we only ever have one map here after clone
active_dirs: Arc<Mutex<HashMap<PathBuf, Directory>>>,
}
impl FileStore {
    /// Build a store rooted at `root_dir`.
    ///
    /// The store starts with no tracked directories and a freshly drawn random
    /// connection id. Nothing is touched on disk here.
    pub(super) fn new<P: Into<PathBuf>>(root_dir: P) -> Self {
        let root: PathBuf = root_dir.into();
        let connection_id = rand::random::<u64>();
        let active_dirs = Arc::new(Mutex::new(HashMap::new()));
        Self {
            root,
            connection_id,
            active_dirs,
        }
    }
}
#[async_trait]
impl KeyValueStore for FileStore {
type Bucket = Directory;
/// A "bucket" is a directory
async fn get_or_create_bucket(
&self,
bucket_name: &str,
_ttl: Option<Duration>, // TODO ttl not used yet
) -> Result<Self::Bucket, StoreError> {
let p = self.root.join(bucket_name);
if let Some(dir) = self.active_dirs.lock().get(&p) {
return Ok(dir.clone());
};
if p.exists() {
// Get
if !p.is_dir() {
return Err(StoreError::FilesystemError(
"Bucket name is not a directory".to_string(),
));
}
} else {
// Create
fs::create_dir_all(&p).map_err(to_fs_err)?;
}
let dir = Directory::new(self.root.clone(), p.clone());
self.active_dirs.lock().insert(p, dir.clone());
Ok(dir)
}
/// A "bucket" is a directory
async fn get_bucket(&self, bucket_name: &str) -> Result<Option<Self::Bucket>, StoreError> {
let p = self.root.join(bucket_name);
if let Some(dir) = self.active_dirs.lock().get(&p) {
return Ok(Some(dir.clone()));
};
if !p.exists() {
return Ok(None);
}
if !p.is_dir() {
return Err(StoreError::FilesystemError(
"Bucket name is not a directory".to_string(),
));
}
let dir = Directory::new(self.root.clone(), p.clone());
self.active_dirs.lock().insert(p, dir.clone());
Ok(Some(dir))
}
fn connection_id(&self) -> u64 {
self.connection_id
}
// This cannot be a Drop imp because DistributedRuntime is cloned various places including
// Python. Drop doesn't get called.
fn shutdown(&self) {
for (_, mut dir) in self.active_dirs.lock().drain() {
if let Err(err) = dir.delete_owned_files() {
tracing::error!(error = %err, %dir, "Failed shutdown delete of owned files");
}
}
}
}
/// Handle to one bucket directory of the file store.
/// Clones share the same `owned_files` set through the `Arc`.
#[derive(Clone)]
pub struct Directory {
/// Root of the file store; stripped from entry paths when deriving keys.
root: PathBuf,
/// Path of this bucket's directory; key file names are joined onto it.
p: PathBuf,
/// These are the files we created and hence must delete on shutdown
owned_files: Arc<Mutex<HashSet<PathBuf>>>,
}
impl Directory {
    /// Create a handle for the directory `p` inside the store rooted at `root`,
    /// starting with an empty set of owned files.
    fn new(root: PathBuf, p: PathBuf) -> Self {
        let owned_files = Arc::new(Mutex::new(HashSet::new()));
        Self {
            root,
            p,
            owned_files,
        }
    }

    /// Remove every file recorded in `owned_files`, emptying the set.
    ///
    /// Every file is attempted even if earlier removals fail. All failures are
    /// reported together as a single error of the form `path: cause, path: cause`.
    fn delete_owned_files(&mut self) -> anyhow::Result<()> {
        let failures: Vec<String> = self
            .owned_files
            .lock()
            .drain()
            .filter_map(|path| {
                fs::remove_file(&path)
                    .err()
                    .map(|err| format!("{}: {err}", path.display()))
            })
            .collect();
        if failures.is_empty() {
            return Ok(());
        }
        anyhow::bail!(failures.join(", "));
    }
}
/// A `Directory` displays as the path of the directory it wraps.
impl fmt::Display for Directory {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let shown = self.p.display();
        f.write_fmt(format_args!("{}", shown))
    }
}
#[async_trait]
impl KeyValueBucket for Directory {
/// Write a file to the directory
async fn insert(
&self,
key: &Key,
value: bytes::Bytes,
_revision: u64, // Not used. Maybe put in file name?
) -> Result<StoreOutcome, StoreError> {
let safe_key = Key::new(key.as_ref()); // because of from_raw
let full_path = self.p.join(safe_key.as_ref());
self.owned_files.lock().insert(full_path.clone());
let str_path = full_path.display().to_string();
fs::write(&full_path, &value)
.context(str_path)
.map_err(a_to_fs_err)?;
Ok(StoreOutcome::Created(0))
}
/// Read a file from the directory
async fn get(&self, key: &Key) -> Result<Option<bytes::Bytes>, StoreError> {
let safe_key = Key::new(key.as_ref()); // because of from_raw
let full_path = self.p.join(safe_key.as_ref());
if !full_path.exists() {
return Ok(None);
}
let str_path = full_path.display().to_string();
let data: bytes::Bytes = fs::read(&full_path)
.context(str_path)
.map_err(a_to_fs_err)?
.into();
Ok(Some(data))
}
/// Delete a file from the directory
async fn delete(&self, key: &Key) -> Result<(), StoreError> {
let safe_key = Key::new(key.as_ref()); // because of from_raw
let full_path = self.p.join(safe_key.as_ref());
let str_path = full_path.display().to_string();
if !full_path.exists() {
return Err(StoreError::MissingKey(str_path));
}
self.owned_files.lock().remove(&full_path);
fs::remove_file(&full_path)
.context(str_path)
.map_err(a_to_fs_err)
}
async fn watch(
&self,
) -> Result<Pin<Box<dyn futures::Stream<Item = WatchEvent> + Send + 'life0>>, StoreError> {
let inotify = Inotify::init().map_err(to_fs_err)?;
inotify
.watches()
.add(
&self.p,
WatchMask::MODIFY | WatchMask::CREATE | WatchMask::DELETE,
)
.map_err(to_fs_err)?;
let dir = self.p.clone();
Ok(Box::pin(async_stream::stream! {
let mut buffer = [0; 1024];
let mut events = match inotify.into_event_stream(&mut buffer) {
Ok(events) => events,
Err(err) => {
tracing::error!(error = %err, "Failed getting event stream from inotify");
return;
}
};
while let Some(Ok(event)) = events.next().await {
let Some(name) = event.name else {
tracing::warn!("Unexpected event on the directory itself");
continue;
};
let item_path = dir.join(name);
let key = match item_path.strip_prefix(&self.root) {
Ok(stripped) => stripped.display().to_string().replace("_", "/"),
Err(err) => {
// Possibly this should be a panic.
// A key cannot be outside the file store root.
tracing::error!(
error = %err,
item_path = %item_path.display(),
root = %self.root.display(),
"Item in file store is not prefixed with file store root. Should be impossible. Ignoring invalid key.");
continue;
}
};
match event.mask {
EventMask::MODIFY | EventMask::CREATE => {
let data: bytes::Bytes = match fs::read(&item_path) {
Ok(data) => data.into(),
Err(err) => {
tracing::warn!(error = %err, item = %item_path.display(), "Failed reading event item. Skipping.");
continue;
}
};
let item = KeyValue::new(key, data);
yield WatchEvent::Put(item);
}
EventMask::DELETE => {
yield WatchEvent::Delete(Key::from_raw(key));
}
event_type => {
tracing::warn!(?event_type, dir = %dir.display(), "Unexpected event type");
continue;
}
}
}
}))
}
async fn entries(&self) -> Result<HashMap<String, bytes::Bytes>, StoreError> {
let contents = fs::read_dir(&self.p)
.with_context(|| self.p.display().to_string())
.map_err(a_to_fs_err)?;
let mut out = HashMap::new();
for entry in contents {
let entry = entry.map_err(to_fs_err)?;
if !entry.path().is_file() {
tracing::warn!(
path = %entry.path().display(),
"Unexpected entry, directory should only contain files."
);
continue;
}
let key = match entry.path().strip_prefix(&self.root) {
Ok(p) => p.to_string_lossy().to_string().replace("_", "/"),
Err(err) => {
tracing::error!(
error = %err,
path = %entry.path().display(),
root = %self.root.display(),
"FileStore path not in root. Should be impossible. Skipping entry."
);
continue;
}
};
let data: bytes::Bytes = fs::read(entry.path())
.with_context(|| self.p.display().to_string())
.map_err(a_to_fs_err)?
.into();
out.insert(key, data);
}
Ok(out)
}
}
/// Convert an `anyhow::Error` into `StoreError::FilesystemError`.
///
/// The alternate format (`{:#}`) is used so the attached context is part of
/// the message and not only the outermost error.
fn a_to_fs_err(err: anyhow::Error) -> StoreError {
    let rendered = format!("{:#}", err);
    StoreError::FilesystemError(rendered)
}
/// Convert any standard error into `StoreError::FilesystemError`, carrying the
/// error's `Display` text as the message.
fn to_fs_err<E: std::error::Error>(err: E) -> StoreError {
    let message = ToString::to_string(&err);
    StoreError::FilesystemError(message)
}
......@@ -57,7 +57,7 @@ impl MemoryBucket {
}
impl MemoryStore {
pub fn new() -> Self {
pub(super) fn new() -> Self {
let (tx, rx) = tokio::sync::mpsc::unbounded_channel();
MemoryStore {
inner: Arc::new(MemoryStoreInner {
......@@ -107,6 +107,8 @@ impl KeyValueStore for MemoryStore {
/// Returns the connection id stored on this store.
fn connection_id(&self) -> u64 {
self.connection_id
}
/// Shutdown hook; does nothing for this store.
fn shutdown(&self) {}
}
#[async_trait]
......@@ -205,8 +207,7 @@ impl KeyValueBucket for MemoryBucketRef {
yield WatchEvent::Put(item);
},
Some(MemoryEvent::Delete { key }) => {
let item = KeyValue::new(key, bytes::Bytes::new());
yield WatchEvent::Delete(item);
yield WatchEvent::Delete(Key::from_raw(key));
}
}
}
......
......@@ -52,6 +52,11 @@ impl KeyValueStore for NATSStore {
/// Returns the `client_id` from the server info of the underlying client,
/// used as this store's connection id.
fn connection_id(&self) -> u64 {
self.client.client().server_info().client_id
}
/// Shutdown hook; currently does nothing (see the TODO in the body).
fn shutdown(&self) {
// TODO: Track and delete any owned keys
// The TTL should ensure NATS does it, but best we do it immediately
}
}
impl NATSStore {
......@@ -160,12 +165,14 @@ impl KeyValueBucket for NATSBucket {
>| async move {
match maybe_entry {
Ok(entry) => {
let item = KeyValue::new(entry.key, entry.value);
Some(match entry.operation {
Operation::Put => WatchEvent::Put(item),
Operation::Delete => WatchEvent::Delete(item),
Operation::Put => {
let item = KeyValue::new(entry.key, entry.value);
WatchEvent::Put(item)
}
Operation::Delete => WatchEvent::Delete(Key::from_raw(entry.key)),
// TODO: What is Purge? Not urgent, NATS impl not used
Operation::Purge => WatchEvent::Delete(item),
Operation::Purge => WatchEvent::Delete(Key::from_raw(entry.key)),
})
}
Err(e) => {
......
......@@ -31,7 +31,7 @@ def get_runtime():
except Exception:
# If no existing runtime, create a new one
loop = asyncio.get_running_loop()
_runtime_instance = DistributedRuntime(loop, False)
_runtime_instance = DistributedRuntime(loop, "etcd", False)
return _runtime_instance
......
......@@ -226,7 +226,7 @@ def get_runtime():
# No running loop, create a new one (sync context)
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
_runtime_instance = DistributedRuntime(loop, False)
_runtime_instance = DistributedRuntime(loop, "etcd", False)
return _runtime_instance
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment