first

876a36a4 · raojy · eda2afb8 · 876a36a4 · 876a36a4 · 876a36a4
Commit 876a36a4 authored May 27, 2026 by raojy
15 changed files
--- a/SenseNova-SI-main/training/intern_vl/requirements.txt
+++ b/SenseNova-SI-main/training/intern_vl/requirements.txt
+accelerate<1
+bitsandbytes==0.42.0
+decord
+deepspeed>=0.13.5
+einops==0.6.1
+einops-exts==0.0.4
+huggingface_hub
+imageio
+numpy==1.26.4
+opencv-python
+orjson
+peft==0.10.0
+pycocoevalcap
+pyyaml
+scikit-learn>=1.2.2
+scipy
+sentencepiece==0.1.99
+shortuuid
+tensorboardX
+termcolor
+timm==0.9.12
+tokenizers==0.15.1
+torch>=2
+torchvision>=0.15
+tqdm
+transformers==4.37.2
+yacs
--- a/SenseNova-SI-main/training/qwen3_vl/data_800K.yaml
+++ b/SenseNova-SI-main/training/qwen3_vl/data_800K.yaml
+# Dataset list for the SenseNova-SI 800K preset.
+# The `/path/to/...` entries are placeholders; replace them with real locations.
+datasets:
+  # The `_qwen3vl_format` suffix matches the default output name produced by
+  # preprocess_sensenova_si_dataset.py in this directory.
+  - path: /path/to/SenseNova-SI-800K/SenseNova-SI-800K_qwen3vl_format.jsonl
+    # NOTE(review): presumably the root that media paths inside the JSONL are
+    # resolved against - confirm against the dataset loader.
+    data_folder: /path/to/SenseNova-SI-800K/
+    data_type: jsonl
--- a/SenseNova-SI-main/training/qwen3_vl/data_8M.yaml
+++ b/SenseNova-SI-main/training/qwen3_vl/data_8M.yaml
+# Dataset list for the SenseNova-SI 8M preset.
+# The `/path/to/...` entries are placeholders; replace them with real locations.
+datasets:
+  # The `_qwen3vl_format` suffix matches the default output name produced by
+  # preprocess_sensenova_si_dataset.py in this directory.
+  - path: /path/to/SenseNova-SI-8M/SenseNova-SI-8M_qwen3vl_format.jsonl
+    # NOTE(review): presumably the root that media paths inside the JSONL are
+    # resolved against - confirm against the dataset loader.
+    data_folder: /path/to/SenseNova-SI-8M/
+    data_type: jsonl
--- a/SenseNova-SI-main/training/qwen3_vl/preprocess_sensenova_si_dataset.py
+++ b/SenseNova-SI-main/training/qwen3_vl/preprocess_sensenova_si_dataset.py
+"""Preprocess SenseNova-SI dataset JSONL into lmms-engine compatible format.
+This script fixes two schema incompatibilities:
+1. `image` mixed types (`str` and `list[str]`) -> normalized to `list[str]`.
+2. `conversations` format -> converted to `messages` with structured `content`.
+"""
+from __future__ import annotations
+import argparse
+import json
+from pathlib import Path
+from typing import Any
+def normalize_image_field(sample: dict[str, Any]) -> bool:
+    """Coerce ``sample["image"]`` into a list, modifying ``sample`` in place.
+
+    A bare string is wrapped in a one-element list so that every record
+    exposes the same column type. A list, a ``None`` value, or a missing
+    key is left untouched.
+
+    Returns:
+        True if the sample was modified, False otherwise.
+
+    Raises:
+        ValueError: If ``image`` is neither ``str``, ``list`` nor ``None``.
+    """
+    value = sample.get("image")
+    if value is None or isinstance(value, list):
+        # Nothing to fix: no image at all, or already list-shaped.
+        return False
+    if not isinstance(value, str):
+        raise ValueError(f"Unsupported image type: {type(value).__name__}")
+    sample["image"] = [value]
+    return True
+def map_conversations_to_messages(sample: dict[str, Any]) -> bool:
+    """Replace ``sample["conversations"]`` with an equivalent ``messages`` list.
+
+    Each turn ``{"from": ..., "value": ...}`` becomes
+    ``{"role": ..., "content": [{"type": "text", "text": ...}]}``. The
+    sender ``"human"`` maps to ``"user"`` and ``"gpt"`` to ``"assistant"``.
+    A missing sender defaults to ``"user"``; any other sender is kept as
+    its string form. A missing ``value`` becomes an empty string. The
+    sample is modified in place and the ``conversations`` key is removed.
+
+    Returns:
+        True if a conversion took place, False when the sample has no
+        ``conversations`` entry (or it is ``None``).
+
+    Raises:
+        ValueError: If ``conversations`` is not a list or one of its
+            items is not a dict.
+    """
+    turns = sample.get("conversations")
+    if turns is None:
+        return False
+    if not isinstance(turns, list):
+        raise ValueError("`conversations` must be a list.")
+    messages: list[dict[str, Any]] = []
+    for turn in turns:
+        if not isinstance(turn, dict):
+            raise ValueError("Each `conversations` item must be an object.")
+        sender = turn.get("from")
+        if sender is None or sender == "human":
+            role = "user"
+        elif sender == "gpt":
+            role = "assistant"
+        else:
+            # Unknown speaker tags are passed through as their string form.
+            role = str(sender)
+        text_part = {"type": "text", "text": turn.get("value", "")}
+        messages.append({"role": role, "content": [text_part]})
+    sample["messages"] = messages
+    sample.pop("conversations")
+    return True
+def default_output_path(src_path: Path) -> Path:
+    """Return a sibling of ``src_path`` whose stem ends in ``_qwen3vl_format``.
+
+    The original extension is kept; a source without one gets ``.jsonl``.
+    """
+    extension = src_path.suffix if src_path.suffix else ".jsonl"
+    file_name = f"{src_path.stem}_qwen3vl_format{extension}"
+    return src_path.with_name(file_name)
+def preprocess_jsonl(src_path: Path, dst_path: Path) -> None:
+    """Convert a JSONL file sample by sample and write the result to ``dst_path``.
+
+    Blank lines are skipped. Every other line must hold one JSON object; it
+    is passed through ``normalize_image_field`` and
+    ``map_conversations_to_messages`` and then written back as a single
+    line (non-ASCII characters are kept unescaped). A one-line summary with
+    the counters is printed at the end.
+
+    Args:
+        src_path: Input JSONL file, read as UTF-8.
+        dst_path: Output JSONL file, written as UTF-8. Missing parent
+            directories are created and an existing file is overwritten.
+
+    Raises:
+        ValueError: If ``dst_path`` refers to the same file as ``src_path``,
+            if a line is not valid JSON, if a line is not a JSON object, or
+            if one of the normalizers rejects the sample.
+    """
+    # Opening the destination in "w" mode truncates it immediately, so
+    # writing onto the source would wipe the input before it is read.
+    if dst_path.resolve() == src_path.resolve():
+        raise ValueError(
+            f"Output path must differ from the source path: '{src_path}'"
+        )
+    image_fixed_count = 0
+    conversation_fixed_count = 0
+    total_count = 0
+    dst_path.parent.mkdir(parents=True, exist_ok=True)
+    with (
+        src_path.open("r", encoding="utf-8") as source,
+        dst_path.open("w", encoding="utf-8") as target,
+    ):
+        # Line numbers are 1-based and count blank lines too, so error
+        # messages point at the real position in the source file.
+        for line_number, line in enumerate(source, start=1):
+            stripped = line.strip()
+            if not stripped:
+                continue
+            try:
+                sample = json.loads(stripped)
+            except json.JSONDecodeError as error:
+                raise ValueError(
+                    f"Invalid JSON at line {line_number}: {error}"
+                ) from error
+            if not isinstance(sample, dict):
+                raise ValueError(f"Line {line_number} is not a JSON object.")
+            if normalize_image_field(sample):
+                image_fixed_count += 1
+            if map_conversations_to_messages(sample):
+                conversation_fixed_count += 1
+            target.write(json.dumps(sample, ensure_ascii=False) + "\n")
+            total_count += 1
+    print(
+        "Done."
+        f" total={total_count},"
+        f" image_fixed={image_fixed_count},"
+        f" conversations_mapped={conversation_fixed_count},"
+        f" output='{dst_path}'"
+    )
+def build_args() -> argparse.Namespace:
+    """Parse the command line and return the resulting namespace.
+
+    Options:
+        --src: required path of the input JSONL file.
+        --dst: optional output path; ``None`` when not given.
+    """
+    cli = argparse.ArgumentParser(
+        description="Preprocess SenseNova-SI dataset JSONL for lmms-engine training."
+    )
+    cli.add_argument(
+        "--src",
+        type=Path,
+        required=True,
+        help="Path to original SenseNova-SI dataset JSONL.",
+    )
+    cli.add_argument(
+        "--dst",
+        type=Path,
+        default=None,
+        help="Output JSONL path. Default: <src_stem>_qwen3vl_format.jsonl",
+    )
+    parsed = cli.parse_args()
+    return parsed
+def main() -> None:
+    """Parse CLI arguments and run the preprocessing step.
+
+    When ``--dst`` is omitted, the output path is derived from ``--src``
+    via ``default_output_path``.
+    """
+    cli_args = build_args()
+    output_path = cli_args.dst
+    if output_path is None:
+        output_path = default_output_path(cli_args.src)
+    preprocess_jsonl(src_path=cli_args.src, dst_path=output_path)
+if __name__ == "__main__":
+    main()
--- a/SenseNova-SI-main/training/qwen3_vl/run.sh
+++ b/SenseNova-SI-main/training/qwen3_vl/run.sh
+#!/bin/bash
+################################################################################
+# Qwen3-VL 8B Training with FSDP2 + Ulysses Sequence Parallel
+################################################################################
+#
+# DESCRIPTION:
+#   Train Qwen3-VL vision-language model with support for long sequences
+#   using Ulysses Sequence Parallel and FSDP2 distributed training.
+#
+# KEY FEATURES:
+#   - Multi-resolution visual understanding
+#   - Ulysses SP for 10K+ visual tokens
+#   - Flash Attention 2 + unpadding (use_rmpad)
+#   - Sequence packing (35-40% MFU)
+#   - Liger Kernel fused operations
+#   - FSDP2 distributed training
+#
+# REQUIREMENTS:
+#   - 8x GPUs (A100/H100 recommended, 80GB VRAM)
+#   - flash-attn: pip install flash-attn --no-build-isolation
+#   - liger-kernel: pip install liger-kernel
+#
+# DATASET:
+#   Prepare your dataset in OpenAI chat format (JSONL/Arrow):
+#   See: docs/user_guide/data_prep.md
+#
+#   Example dataset YAML (data/video/debug.yaml):
+#   ```yaml
+#   datasets:
+#     - path: /path/to/your/dataset
+#       data_folder: ""
+#       data_type: arrow
+#   ```
+#
+# CONFIGURATION:
+#   Edit train_config_800K.yaml or train_config_8M.yaml (next to this script) to customize:
+#   - Model size (2B/8B/72B): change load_from_pretrained_path
+#   - Sequence length: adjust packing_length
+#   - SP degree: set sp_ulysses_degree (1/2/4/8)
+#   - Batch size: per_device_train_batch_size
+#   - Max frames: video_max_frames
+#
+# PERFORMANCE TIPS:
+#   - Adjust sp_ulysses_degree based on sequence length:
+#     * Degree 1: < 10K tokens
+#     * Degree 2: 10K-20K tokens
+#     * Degree 4: 20K-40K tokens
+#     * Degree 8: 40K+ tokens
+#   - Enable packing for better MFU: set packing: true
+#   - Use gradient_checkpointing for larger models (already enabled)
+#   - Monitor memory with: watch -n 1 nvidia-smi
+#
+################################################################################
+# Number of GPUs on this node. Defaults to 8; override through the
+# environment on smaller machines, e.g. `NGPUS=4 bash training/qwen3_vl/run.sh`.
+NGPUS="${NGPUS:-8}"
+# Dataset scale: first argument 800K or 8M (default: 800K).
+# Example: bash training/qwen3_vl/run.sh 8M
+DATA_SCALE="${1:-800K}"
+# Config paths are relative, so launch from the directory that contains
+# `training/` (as in the example above).
+case "${DATA_SCALE}" in
+  800K)
+    TRAIN_CONFIG="training/qwen3_vl/train_config_800K.yaml"
+    ;;
+  8M)
+    TRAIN_CONFIG="training/qwen3_vl/train_config_8M.yaml"
+    ;;
+  *)
+    echo "Usage: $0 [800K|8M]" >&2
+    echo "  800K  SenseNova-SI 800K preset (train_config_800K.yaml + data_800K.yaml)" >&2
+    echo "  8M    SenseNova-SI 8M preset (train_config_8M.yaml + data_8M.yaml)" >&2
+    exit 1
+    ;;
+esac
+# Training command (single node; see the multi-node notes below)
+torchrun --nproc_per_node="${NGPUS}" \
+  --nnodes=1 \
+  --node_rank=0 \
+  --master_addr=127.0.0.1 \
+  --master_port=12355 \
+  -m lmms_engine.launch.cli \
+  config_yaml="${TRAIN_CONFIG}"
+################################################################################
+# MULTI-NODE TRAINING:
+#
+# Run the same command on every node, changing only --node_rank.
+# For the 8M preset, pass train_config_8M.yaml instead.
+#
+# On rank 0 node:
+# torchrun --nproc_per_node=8 \
+#   --nnodes=2 \
+#   --node_rank=0 \
+#   --master_addr=<RANK_0_IP> \
+#   --master_port=12355 \
+#   -m lmms_engine.launch.cli \
+#   config_yaml=training/qwen3_vl/train_config_800K.yaml
+#
+# On rank 1 node:
+# torchrun --nproc_per_node=8 \
+#   --nnodes=2 \
+#   --node_rank=1 \
+#   --master_addr=<RANK_0_IP> \
+#   --master_port=12355 \
+#   -m lmms_engine.launch.cli \
+#   config_yaml=training/qwen3_vl/train_config_800K.yaml
+#
+################################################################################
--- a/SenseNova-SI-main/training/qwen3_vl/train_config_800K.yaml
+++ b/SenseNova-SI-main/training/qwen3_vl/train_config_800K.yaml
+trainer_type: fsdp2_trainer
+dataset_config:
+  extra_kwargs: {}
+  dataset_type: qwen3_vl_iterable
+  dataset_format: yaml
+  processor_config:
+    processor_name: Qwen/Qwen3-VL-8B-Instruct
+    processor_type: qwen3_vl
+  dataset_path: training/qwen3_vl/data_800K.yaml
+  datasets: null
+  shuffle: true
+  eval_dataset_path: null
+  object_storage: none
+  bucket_name: null
+  packing: false
+  packing_strategy: first_fit
+  packing_length: 40000
+  filter_overlong: true
+  filter_overlong_workers: 8
+  max_length: null
+  video_sampling_strategy: fps
+  video_max_pixels: 50176
+  video_max_frames: 512
+  frame_num: 64
+  fps: 1
+  video_backend: qwen_vl_utils
+trainer_args:
+  output_dir: ./results/qwen3_vl/sensenova_si_800K
+  overwrite_output_dir: false
+  do_train: false
+  do_eval: false
+  do_predict: false
+  eval_strategy: 'no'
+  prediction_loss_only: false
+  per_device_train_batch_size: 1
+  per_device_eval_batch_size: 1
+  per_gpu_train_batch_size: null
+  per_gpu_eval_batch_size: null
+  gradient_accumulation_steps: 1
+  eval_accumulation_steps: null
+  eval_delay: 0
+  torch_empty_cache_steps: null
+  learning_rate: 1.0e-05
+  weight_decay: 0.0
+  adam_beta1: 0.9
+  adam_beta2: 0.999
+  adam_epsilon: 1.0e-08
+  max_grad_norm: 1.0
+  num_train_epochs: 1
+  max_steps: 1000
+  lr_scheduler_type: cosine
+  lr_scheduler_kwargs: {}
+  warmup_ratio: 0.01
+  warmup_steps: 0
+  log_level: passive
+  log_level_replica: warning
+  log_on_each_node: true
+  logging_dir: ./output/qwen3_vl_training/runs
+  logging_strategy: steps
+  logging_first_step: false
+  logging_steps: 1
+  logging_nan_inf_filter: true
+  save_strategy: steps
+  save_steps: 200
+  save_total_limit: 1
+  save_safetensors: true
+  save_on_each_node: false
+  save_only_model: false
+  restore_callback_states_from_checkpoint: false
+  no_cuda: false
+  use_cpu: false
+  use_mps_device: false
+  seed: 42
+  data_seed: null
+  jit_mode_eval: false
+  bf16: true
+  fp16: false
+  fp16_opt_level: O1
+  half_precision_backend: auto
+  bf16_full_eval: false
+  fp16_full_eval: false
+  tf32: null
+  local_rank: 0
+  ddp_backend: null
+  tpu_num_cores: null
+  tpu_metrics_debug: false
+  debug: []
+  dataloader_drop_last: false
+  eval_steps: null
+  dataloader_num_workers: 0
+  dataloader_prefetch_factor: null
+  past_index: -1
+  run_name: video_debug
+  disable_tqdm: false
+  remove_unused_columns: true
+  label_names: null
+  load_best_model_at_end: false
+  metric_for_best_model: null
+  greater_is_better: null
+  ignore_data_skip: false
+  fsdp: []
+  fsdp_min_num_params: 0
+  fsdp_config:
+    transformer_layer_cls_to_wrap:
+    - Qwen3VLTextDecoderLayer
+    reshard_after_forward: false
+    min_num_params: 0
+    xla: false
+    xla_fsdp_v2: false
+    xla_fsdp_grad_ckpt: false
+  fsdp_transformer_layer_cls_to_wrap: null
+  accelerator_config:
+    split_batches: false
+    dispatch_batches: null
+    even_batches: true
+    use_seedable_sampler: true
+    non_blocking: false
+    gradient_accumulation_kwargs: null
+  parallelism_config: null
+  deepspeed: null
+  label_smoothing_factor: 0.0
+  optim: adamw_torch_fused
+  optim_args: null
+  adafactor: false
+  group_by_length: false
+  length_column_name: length
+  report_to: []
+  project: huggingface
+  trackio_space_id: trackio
+  ddp_find_unused_parameters: null
+  ddp_bucket_cap_mb: null
+  ddp_broadcast_buffers: null
+  dataloader_pin_memory: true
+  dataloader_persistent_workers: false
+  skip_memory_metrics: true
+  use_legacy_prediction_loop: false
+  push_to_hub: false
+  resume_from_checkpoint: null
+  hub_model_id: null
+  hub_strategy: every_save
+  hub_token: <HUB_TOKEN>
+  hub_private_repo: null
+  hub_always_push: false
+  hub_revision: null
+  gradient_checkpointing: true
+  gradient_checkpointing_kwargs: null
+  include_inputs_for_metrics: false
+  include_for_metrics: []
+  eval_do_concat_batches: true
+  fp16_backend: auto
+  push_to_hub_model_id: null
+  push_to_hub_organization: null
+  mp_parameters: ''
+  auto_find_batch_size: false
+  full_determinism: false
+  torchdynamo: null
+  ray_scope: last
+  ddp_timeout: 1800
+  torch_compile: false
+  torch_compile_backend: null
+  torch_compile_mode: null
+  include_tokens_per_second: false
+  include_num_input_tokens_seen: 'no'
+  neftune_noise_alpha: null
+  optim_target_modules: null
+  batch_eval_metrics: false
+  eval_on_start: false
+  use_liger_kernel: true
+  liger_kernel_config: null
+  eval_use_gather_object: false
+  average_tokens_across_devices: true
+  use_muon: false
+  freeze_modules: null
+  use_rmpad: true
+  fsdp2: true
+  sp_ulysses_degree: 1
+  reduce_dtype: bfloat16
+  output_dtype: bfloat16
+  print_batch_input_steps: 5
+  enable_profiler: false
+  profiler_config:
+    start_step: 1
+    end_step: 3
+model_config:
+  extra_kwargs: {}
+  load_from_pretrained_path: training/pretrained_models/Qwen/Qwen3-VL-8B-Instruct
+  load_from_config: null
+  attn_implementation: flash_attention_2
+  overwrite_config: null
+  monkey_patch_kwargs: null
+extra_kwargs: null
\ No newline at end of file
--- a/SenseNova-SI-main/training/qwen3_vl/train_config_8M.yaml
+++ b/SenseNova-SI-main/training/qwen3_vl/train_config_8M.yaml
+trainer_type: fsdp2_trainer
+dataset_config:
+  extra_kwargs: {}
+  dataset_type: qwen3_vl_iterable
+  dataset_format: yaml
+  processor_config:
+    processor_name: Qwen/Qwen3-VL-8B-Instruct
+    processor_type: qwen3_vl
+  dataset_path: training/qwen3_vl/data_8M.yaml
+  datasets: null
+  shuffle: true
+  eval_dataset_path: null
+  object_storage: none
+  bucket_name: null
+  packing: false
+  packing_strategy: first_fit
+  packing_length: 40000
+  filter_overlong: true
+  filter_overlong_workers: 8
+  max_length: null
+  video_sampling_strategy: fps
+  video_max_pixels: 50176
+  video_max_frames: 512
+  frame_num: 64
+  fps: 1
+  video_backend: qwen_vl_utils
+trainer_args:
+  output_dir: ./results/qwen3_vl/sensenova_si_8M
+  overwrite_output_dir: false
+  do_train: false
+  do_eval: false
+  do_predict: false
+  eval_strategy: 'no'
+  prediction_loss_only: false
+  per_device_train_batch_size: 1
+  per_device_eval_batch_size: 1
+  per_gpu_train_batch_size: null
+  per_gpu_eval_batch_size: null
+  gradient_accumulation_steps: 1
+  eval_accumulation_steps: null
+  eval_delay: 0
+  torch_empty_cache_steps: null
+  learning_rate: 1.0e-05
+  weight_decay: 0.0
+  adam_beta1: 0.9
+  adam_beta2: 0.999
+  adam_epsilon: 1.0e-08
+  max_grad_norm: 1.0
+  num_train_epochs: 1
+  max_steps: 1000
+  lr_scheduler_type: cosine
+  lr_scheduler_kwargs: {}
+  warmup_ratio: 0.01
+  warmup_steps: 0
+  log_level: passive
+  log_level_replica: warning
+  log_on_each_node: true
+  logging_dir: ./output/qwen3_vl_training/runs
+  logging_strategy: steps
+  logging_first_step: false
+  logging_steps: 1
+  logging_nan_inf_filter: true
+  save_strategy: steps
+  save_steps: 200
+  save_total_limit: 1
+  save_safetensors: true
+  save_on_each_node: false
+  save_only_model: false
+  restore_callback_states_from_checkpoint: false
+  no_cuda: false
+  use_cpu: false
+  use_mps_device: false
+  seed: 42
+  data_seed: null
+  jit_mode_eval: false
+  bf16: true
+  fp16: false
+  fp16_opt_level: O1
+  half_precision_backend: auto
+  bf16_full_eval: false
+  fp16_full_eval: false
+  tf32: null
+  local_rank: 0
+  ddp_backend: null
+  tpu_num_cores: null
+  tpu_metrics_debug: false
+  debug: []
+  dataloader_drop_last: false
+  eval_steps: null
+  dataloader_num_workers: 0
+  dataloader_prefetch_factor: null
+  past_index: -1
+  run_name: video_debug
+  disable_tqdm: false
+  remove_unused_columns: true
+  label_names: null
+  load_best_model_at_end: false
+  metric_for_best_model: null
+  greater_is_better: null
+  ignore_data_skip: false
+  fsdp: []
+  fsdp_min_num_params: 0
+  fsdp_config:
+    transformer_layer_cls_to_wrap:
+    - Qwen3VLTextDecoderLayer
+    reshard_after_forward: false
+    min_num_params: 0
+    xla: false
+    xla_fsdp_v2: false
+    xla_fsdp_grad_ckpt: false
+  fsdp_transformer_layer_cls_to_wrap: null
+  accelerator_config:
+    split_batches: false
+    dispatch_batches: null
+    even_batches: true
+    use_seedable_sampler: true
+    non_blocking: false
+    gradient_accumulation_kwargs: null
+  parallelism_config: null
+  deepspeed: null
+  label_smoothing_factor: 0.0
+  optim: adamw_torch_fused
+  optim_args: null
+  adafactor: false
+  group_by_length: false
+  length_column_name: length
+  report_to: []
+  project: huggingface
+  trackio_space_id: trackio
+  ddp_find_unused_parameters: null
+  ddp_bucket_cap_mb: null
+  ddp_broadcast_buffers: null
+  dataloader_pin_memory: true
+  dataloader_persistent_workers: false
+  skip_memory_metrics: true
+  use_legacy_prediction_loop: false
+  push_to_hub: false
+  resume_from_checkpoint: null
+  hub_model_id: null
+  hub_strategy: every_save
+  hub_token: <HUB_TOKEN>
+  hub_private_repo: null
+  hub_always_push: false
+  hub_revision: null
+  gradient_checkpointing: true
+  gradient_checkpointing_kwargs: null
+  include_inputs_for_metrics: false
+  include_for_metrics: []
+  eval_do_concat_batches: true
+  fp16_backend: auto
+  push_to_hub_model_id: null
+  push_to_hub_organization: null
+  mp_parameters: ''
+  auto_find_batch_size: false
+  full_determinism: false
+  torchdynamo: null
+  ray_scope: last
+  ddp_timeout: 1800
+  torch_compile: false
+  torch_compile_backend: null
+  torch_compile_mode: null
+  include_tokens_per_second: false
+  include_num_input_tokens_seen: 'no'
+  neftune_noise_alpha: null
+  optim_target_modules: null
+  batch_eval_metrics: false
+  eval_on_start: false
+  use_liger_kernel: true
+  liger_kernel_config: null
+  eval_use_gather_object: false
+  average_tokens_across_devices: true
+  use_muon: false
+  freeze_modules: null
+  use_rmpad: true
+  fsdp2: true
+  sp_ulysses_degree: 1
+  reduce_dtype: bfloat16
+  output_dtype: bfloat16
+  print_batch_input_steps: 5
+  enable_profiler: false
+  profiler_config:
+    start_step: 1
+    end_step: 3
+model_config:
+  extra_kwargs: {}
+  load_from_pretrained_path: training/pretrained_models/Qwen/Qwen3-VL-8B-Instruct
+  load_from_config: null
+  attn_implementation: flash_attention_2
+  overwrite_config: null
+  monkey_patch_kwargs: null
+extra_kwargs: null
\ No newline at end of file
--- a/SenseNova-SI-main/uv.lock
+++ b/SenseNova-SI-main/uv.lock
--- a/doc/1.png
+++ b/doc/1.png
--- a/doc/2.jpg
+++ b/doc/2.jpg
--- a/doc/3.png
+++ b/doc/3.png
--- a/doc/4.png
+++ b/doc/4.png
--- a/doc/5.png
+++ b/doc/5.png
--- a/icon.png
+++ b/icon.png
--- a/model.properties
+++ b/model.properties
+# 模型唯一标识
+modelCode=15311
+# 模型名称
+modelName=SenseNova-U1
+# 模型描述
+modelDescription=SenseNova-U1 是一款基于 NEO-unify 架构的原生统一多模态大模型，摒弃传统视觉编码器与 VAE 结构，端到端融合图文信息。模型拥有 8B 密集型与 A3B 混合专家两种规格，可实现图文理解、图像生成、图片编辑及图文交错创作，多项任务性能达开源顶尖水平，支持量化压缩与低显存部署，适配多样应用场景。
+# 运行过程
+processType=推理
+# 算法类别
+appCategory=多模态
+# 框架类型
+frameType=pytorch
+# 加速卡类型
+accelerateType=BW1000