Unverified Commit 99d8079a authored by one's avatar one Committed by GitHub
Browse files

[hytop] Tolerate noisy hy-smi output and return parse error reasons (#7)

- Add `_decode_json_object_with_noise` to best-effort decode a JSON object from noisy text: it tries `json.loads` on the whole string first and, if that fails, scans for the first `{` that starts a decodable object using `JSONDecoder.raw_decode`.
- Change `parse_hy_smi_output` to strip ANSI, use the new decoder, and return a `(samples, error_reason)` pair where `error_reason` is one of `None`, `"empty output"`, `"invalid json output"`, or `"no card rows in payload"`.
- Update `collect_node` to unpack the new return shape and include the parse reason in the `NodeResult.error` message when no GPUs were parsed.
- Update and extend tests in `projects/hytop/tests/test_parser.py` and `projects/hytop/tests/test_service.py` to cover noise-tolerant parsing, specific error reasons, and the service behavior with noisy/invalid output.
parent 91022255
......@@ -10,6 +10,31 @@ ANSI_RE = re.compile(r"\x1B\[[0-?]*[ -/]*[@-~]")
CARD_KEY_RE = re.compile(r"^card(\d+)$")
def _decode_json_object_with_noise(cleaned: str) -> dict[str, object] | None:
"""Best-effort decode of a JSON object from noisy text.
Tries strict whole-string JSON first, then scans for the first '{' that
starts a decodable JSON object (using raw_decode).
"""
try:
payload = json.loads(cleaned)
return payload if isinstance(payload, dict) else None
except json.JSONDecodeError:
pass
decoder = json.JSONDecoder()
for idx, char in enumerate(cleaned):
if char != "{":
continue
try:
payload, _end = decoder.raw_decode(cleaned, idx)
except json.JSONDecodeError:
continue
return payload if isinstance(payload, dict) else None
return None
def strip_ansi(text: str) -> str:
"""Strip ANSI escape sequences from text.
......@@ -42,7 +67,7 @@ def parse_number(text: str) -> float:
return float(match.group(0))
def parse_hy_smi_output(raw: str, sample_ts: float) -> dict[int, Sample]:
def parse_hy_smi_output(raw: str, sample_ts: float) -> tuple[dict[int, Sample], str | None]:
"""Parse hy-smi JSON output into GPU keyed samples.
Args:
......@@ -50,18 +75,15 @@ def parse_hy_smi_output(raw: str, sample_ts: float) -> dict[int, Sample]:
sample_ts: Monotonic timestamp assigned to parsed rows.
Returns:
Mapping from GPU id to parsed sample.
Pair of (samples, error_reason). error_reason is None on success.
"""
cleaned = strip_ansi(raw).strip()
if not cleaned:
return {}
try:
payload = json.loads(cleaned)
except json.JSONDecodeError:
return {}
if not isinstance(payload, dict):
return {}
return {}, "empty output"
payload = _decode_json_object_with_noise(cleaned)
if payload is None:
return {}, "invalid json output"
result: dict[int, Sample] = {}
for card_key, card_data in payload.items():
......@@ -82,4 +104,6 @@ def parse_hy_smi_output(raw: str, sample_ts: float) -> dict[int, Sample]:
continue
setattr(sample, metric_name, parsed_value)
result[gpu_id] = sample
return result
if not result:
return {}, "no card rows in payload"
return result, None
......@@ -35,9 +35,10 @@ def collect_node(
if raw.error:
return NodeResult(host=host, samples={}, error=raw.error)
sample_ts = time.monotonic()
samples = parse_hy_smi_output(raw.stdout, sample_ts=sample_ts)
samples, parse_error = parse_hy_smi_output(raw.stdout, sample_ts=sample_ts)
if not samples:
return NodeResult(host=host, samples={}, error="no gpu rows parsed")
reason = parse_error or "unknown parse error"
return NodeResult(host=host, samples={}, error=f"no gpu rows parsed ({reason})")
return NodeResult(host=host, samples=samples)
......
......@@ -6,7 +6,11 @@ import json
import pytest
from hytop.gpu.parser import parse_hy_smi_output, parse_number, strip_ansi
from hytop.gpu.parser import (
parse_hy_smi_output,
parse_number,
strip_ansi,
)
# ---------------------------------------------------------------------------
# Real hy-smi JSON fixture (from actual 8-card Hygon DCU node)
......@@ -101,12 +105,14 @@ class TestParseNumber:
class TestParseHySmiOutput:
def test_full_output_card_count(self):
raw = json.dumps(HY_SMI_FULL)
result = parse_hy_smi_output(raw, sample_ts=1.0)
result, reason = parse_hy_smi_output(raw, sample_ts=1.0)
assert reason is None
assert set(result.keys()) == {0, 7}
def test_full_output_card0_metrics(self):
raw = json.dumps(HY_SMI_FULL)
result = parse_hy_smi_output(raw, sample_ts=1.0)
result, reason = parse_hy_smi_output(raw, sample_ts=1.0)
assert reason is None
s = result[0]
assert s.temp_c == pytest.approx(30.0)
assert s.avg_pwr_w == pytest.approx(157.0)
......@@ -116,12 +122,14 @@ class TestParseHySmiOutput:
def test_full_output_card7_hcu_load(self):
raw = json.dumps(HY_SMI_FULL)
result = parse_hy_smi_output(raw, sample_ts=1.0)
result, reason = parse_hy_smi_output(raw, sample_ts=1.0)
assert reason is None
assert result[7].gpu_pct == pytest.approx(100.0)
def test_temp_only_output(self):
raw = json.dumps(HY_SMI_TEMP_ONLY)
result = parse_hy_smi_output(raw, sample_ts=1.0)
result, reason = parse_hy_smi_output(raw, sample_ts=1.0)
assert reason is None
s = result[0]
assert s.temp_c == pytest.approx(26.0)
# Unrelated sensor keys must not populate fields
......@@ -130,22 +138,59 @@ class TestParseHySmiOutput:
def test_sample_ts_propagated(self):
raw = json.dumps(HY_SMI_FULL)
result = parse_hy_smi_output(raw, sample_ts=42.5)
result, reason = parse_hy_smi_output(raw, sample_ts=42.5)
assert reason is None
assert result[0].ts == pytest.approx(42.5)
def test_unknown_card_keys_ignored(self):
payload = {"sys_info": {"foo": "bar"}, "card0": HY_SMI_FULL["card0"]}
result = parse_hy_smi_output(json.dumps(payload), sample_ts=1.0)
result, reason = parse_hy_smi_output(json.dumps(payload), sample_ts=1.0)
assert reason is None
assert list(result.keys()) == [0]
def test_empty_string_returns_empty(self):
assert parse_hy_smi_output("", sample_ts=1.0) == {}
samples, reason = parse_hy_smi_output("", sample_ts=1.0)
assert samples == {}
assert reason == "empty output"
def test_invalid_json_returns_empty(self):
assert parse_hy_smi_output("not json", sample_ts=1.0) == {}
samples, reason = parse_hy_smi_output("not json", sample_ts=1.0)
assert samples == {}
assert reason == "invalid json output"
def test_ansi_stripped_before_parse(self):
# Some hy-smi versions emit ANSI colors; parser must strip them first
raw_with_ansi = "\x1b[0m" + json.dumps(HY_SMI_TEMP_ONLY) + "\x1b[0m"
result = parse_hy_smi_output(raw_with_ansi, sample_ts=1.0)
result, reason = parse_hy_smi_output(raw_with_ansi, sample_ts=1.0)
assert reason is None
assert 0 in result
def test_noise_before_json_is_tolerated(self):
raw_with_noise = (
"Authorized users only.\\n"
"RTNETLINK answers: File exists\\n"
f"{json.dumps(HY_SMI_TEMP_ONLY)}"
)
result, reason = parse_hy_smi_output(raw_with_noise, sample_ts=1.0)
assert reason is None
assert 0 in result
def test_noise_with_broken_json_still_returns_empty(self):
raw_with_noise_and_invalid_json = (
"Authorized users only.\\n"
'{"card0": {"Average Graphics Package Power (W)": "133.0""broken": "1"}}'
)
samples, reason = parse_hy_smi_output(raw_with_noise_and_invalid_json, sample_ts=1.0)
assert samples == {}
assert reason == "invalid json output"
def test_invalid_json_reason(self):
samples, reason = parse_hy_smi_output("not json", sample_ts=1.0)
assert samples == {}
assert reason == "invalid json output"
def test_no_card_rows_reason(self):
payload = {"meta": {"version": "1"}}
samples, reason = parse_hy_smi_output(json.dumps(payload), sample_ts=1.0)
assert samples == {}
assert reason == "no card rows in payload"
......@@ -77,7 +77,25 @@ class TestCollectNode:
def test_empty_output_yields_error(self, mock_collect):
mock_collect.return_value = CollectResult(host="localhost", stdout="", stderr="")
result = collect_node("localhost", ssh_timeout=5, cmd_timeout=10, hy_smi_args=["--json"])
assert result.error is not None
assert result.error == "no gpu rows parsed (empty output)"
@patch("hytop.gpu.service.collect_from_host")
def test_noise_before_valid_json_is_parsed(self, mock_collect):
noisy_stdout = (
"Authorized users only.\\n"
"RTNETLINK answers: File exists\\n"
+ HY_SMI_FULL_JSON
)
mock_collect.return_value = CollectResult(host="localhost", stdout=noisy_stdout, stderr="")
result = collect_node("localhost", ssh_timeout=5, cmd_timeout=10, hy_smi_args=["--json"])
assert result.error is None
assert set(result.samples.keys()) == {0, 7}
@patch("hytop.gpu.service.collect_from_host")
def test_invalid_json_yields_specific_error(self, mock_collect):
mock_collect.return_value = CollectResult(host="localhost", stdout="not json", stderr="")
result = collect_node("localhost", ssh_timeout=5, cmd_timeout=10, hy_smi_args=["--json"])
assert result.error == "no gpu rows parsed (invalid json output)"
# ---------------------------------------------------------------------------
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment