""" A command-line interface for chatting with GLM-4.1V-9B-Thinkng model supporting images and videos. Examples: # Text-only chat python trans_infer_cli.py # Chat with single image python trans_infer_cli.py --image_paths /path/to/image.jpg # Chat with multiple images python trans_infer_cli.py --image_paths /path/to/img1.jpg /path/to/img2.png /path/to/img3.png # Chat with single video python trans_infer_cli.py --video_path /path/to/video.mp4 # Custom generation parameters python trans_infer_cli.py --temperature 0.8 --top_k 5 --max_tokens 4096 Notes: - Media files are loaded once at startup and persist throughout the conversation - Type 'exit' to quit the chat - Chat with images and video is NOT allowed - The model will remember the conversation history and can reference uploaded media in subsequent turns """ import argparse import re import torch from transformers import AutoProcessor, Glm4vForConditionalGeneration def build_content(image_paths, video_path, text): content = [] if image_paths: for img_path in image_paths: content.append({"type": "image", "url": img_path}) if video_path: content.append({"type": "video", "url": video_path}) content.append({"type": "text", "text": text}) return content def main(): parser = argparse.ArgumentParser() parser.add_argument("--model_path", type=str, default="THUDM/GLM-4.1V-9B-Thinking") parser.add_argument("--image_paths", type=str, nargs="*", default=None) parser.add_argument("--video_path", type=str, default=None) parser.add_argument("--max_tokens", type=int, default=8192) parser.add_argument("--temperature", type=float, default=1.0) parser.add_argument("--repetition_penalty", type=float, default=1.0) parser.add_argument("--top_k", type=int, default=2) args = parser.parse_args() processor = AutoProcessor.from_pretrained(args.model_path, use_fast=True) model = Glm4vForConditionalGeneration.from_pretrained( args.model_path, torch_dtype=torch.bfloat16, device_map="cuda:0" ) messages = [] first_turn = True if args.image_paths is not None and args.video_path is not None: raise ValueError( "Chat with images and video is NOT allowed. Please use either --image_paths or --video_path, not both." ) while True: question = input("\nUser: ").strip() if question.lower() == "exit": break if first_turn: content = build_content(args.image_paths, args.video_path, question) first_turn = False else: content = [{"type": "text", "text": question}] messages.append({"role": "user", "content": content}) inputs = processor.apply_chat_template( messages, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt", padding=True, ).to(model.device) output = model.generate( **inputs, max_new_tokens=args.max_tokens, repetition_penalty=args.repetition_penalty, do_sample=args.temperature > 0, top_k=args.top_k, temperature=args.temperature if args.temperature > 0 else None, ) raw = processor.decode( output[0][inputs["input_ids"].shape[1] : -1], skip_special_tokens=False ) match = re.search(r"(.*?)", raw, re.DOTALL) answer = match.group(1).strip() if match else "" messages.append( {"role": "assistant", "content": [{"type": "text", "text": answer}]} ) print(f"Assistant: {raw}") if __name__ == "__main__": main()