# -*- coding: utf-8 -*-
"""ZhongJingGPT-1.B.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1DCPomUsfTxqkqxKpK-AIGvBSPbkOm7R3

# ZhongJingGPT-2-1.8b

A Traditional Chinese Medicine large language model, inspired by the wisdom of
Zhang Zhongjing, an eminent representative of ancient Chinese medical scholars.
This model aims to illuminate the profound knowledge of Traditional Chinese
Medicine, bridging the gap between ancient wisdom and modern technology, and
providing a reliable and professional tool for the Traditional Chinese Medicine
field. However, all generated results are for reference only; diagnosis and
treatment decisions should be made by experienced professionals.
"""

import torch

# Confirm that a CUDA-capable GPU is available.
print(torch.cuda.is_available())

!pip install transformers huggingface_hub accelerate peft

"""# You should restart the Colab runtime and then run the following code."""

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Set the device
device = "cuda"  # replace with your device: "cpu", "cuda", "mps"

# Initialize the base model and load the LoRA adapter, then the tokenizer.
peft_model_id = "CMLL/ZhongJing-2-1_8b"
base_model_id = "Qwen/Qwen1.5-1.8B-Chat"
model = AutoModelForCausalLM.from_pretrained(base_model_id, device_map="auto")
model.load_adapter(peft_model_id)

tokenizer = AutoTokenizer.from_pretrained(
    "CMLL/ZhongJing-2-1_8b",
    padding_side="right",
    trust_remote_code=True,
    pad_token=''
)


def get_model_response(question, context):
    # Create the prompt
    prompt = f"Question: {question}\nContext: {context}"
    messages = [
        {"role": "system", "content": "You are a helpful TCM assistant named 仲景中医大语言模型."},
        {"role": "user", "content": prompt}
    ]

    # Prepare the input using the model's chat template
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(device)

    # Generate the response
    generated_ids = model.generate(
        model_inputs.input_ids,
        max_new_tokens=512
    )
    # Strip the prompt tokens so only the newly generated tokens remain
    generated_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    # Decode the response
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return response


# Loop to get user input and print the model's response
while True:
    user_question = input("Enter your question (or type 'exit' to stop): ")
    if user_question.lower() == 'exit':
        break

    user_context = input("Enter context (or type 'none' if no context): ")
    if user_context.lower() == 'none':
        user_context = ""

    print("Model is generating a response, please wait...")
    model_response = get_model_response(user_question, user_context)
    print("Model's response:", model_response)
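
"""# Optional: tune the generation settings.

The sketch below shows one way to pass sampling parameters and an explicit
attention mask to `model.generate`. The specific values (`temperature=0.7`,
`top_p=0.9`) and the name `get_model_response_sampled` are illustrative
assumptions, not settings or APIs provided by the model authors; omit the
sampling arguments to keep the greedy decoding used above."""

def get_model_response_sampled(question, context, temperature=0.7, top_p=0.9):
    prompt = f"Question: {question}\nContext: {context}"
    messages = [
        {"role": "system", "content": "You are a helpful TCM assistant named 仲景中医大语言模型."},
        {"role": "user", "content": prompt},
    ]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    model_inputs = tokenizer([text], return_tensors="pt").to(device)

    generated_ids = model.generate(
        model_inputs.input_ids,
        attention_mask=model_inputs.attention_mask,  # pass the mask explicitly
        max_new_tokens=512,
        do_sample=True,           # enable sampling (assumed setting, not an author recommendation)
        temperature=temperature,
        top_p=top_p,
    )
    # Keep only the newly generated tokens, as in get_model_response above
    generated_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]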