import os
import shutil
import pandas as pd
from datasets import Dataset
import torch

# Getting Started with FinGPT

"""
Welcome to this comprehensive guide aimed at beginners diving into the realm of
Financial Large Language Models (FinLLMs) with FinGPT. This blog post demystifies the
process of training FinGPT using Low-Rank Adaptation (LoRA) with the robust base model ChatGLM2-6B.
"""

## Part 1: Preparing the Data
"""
Data preparation is a crucial step when it comes to training
Financial Large Language Models. 
Here, we’ll guide you on how to get your dataset ready for FinGPT using Python.
In this section, you will initialize your working directory and load a financial sentiment dataset. Let’s break down the steps:
"""

# Locations of the intermediate JSONL file and of the final dataset that is
# saved to disk at the end of this script.
jsonl_path = '../FinGPT/data/dataset_new.jsonl'
save_path = '../FinGPT/data/dataset_new'


# Remove the outputs of any previous run before regenerating them.
if os.path.exists(jsonl_path):
    os.remove(jsonl_path)

if os.path.exists(save_path):
    shutil.rmtree(save_path)

# exist_ok=True creates the data directory only when it is missing, without a
# separate existence check (and the race window that comes with one).
directory = "../FinGPT/data"
os.makedirs(directory, exist_ok=True)

### 1.2 Load and Prepare Dataset:

"""
* Import necessary libraries from the datasets package: https://huggingface.co/docs/datasets/index
* Load the training split of the Twitter Financial News Sentiment (TFNS) dataset into a Pandas dataframe (this script reads a local CSV copy). https://huggingface.co/datasets/zeroshot/twitter-financial-news-sentiment
* Map numerical labels to their corresponding sentiments (negative, positive, neutral).
* Add an instruction to each data entry, which is crucial for Instruction Tuning.
* Convert the Pandas dataframe back to a Hugging Face Dataset object.
"""
from datasets import load_dataset, load_from_disk
import datasets

# Numeric class id found in the CSV -> sentiment word used as the answer text.
dic = {0: "negative", 1: 'positive', 2: 'neutral'}

df = pd.read_csv('../FinGPT/data/twitter-financial-news-sentiment/sent_train.csv')
# Swap every numeric label for its sentiment word (unknown ids raise KeyError).
df['label'] = df['label'].apply(lambda label_id: dic[label_id])
# Every row gets the same instruction prompt.
df['instruction'] = 'What is the sentiment of this tweet? Please choose an answer from {negative/neutral/positive}.'
# Positional rename: the columns become input / output / instruction, in order.
df.columns = ['input', 'output', 'instruction']
tfns = Dataset.from_pandas(df)

print(tfns)

# Concatenate two copies of the dataset back to back.
tmp_dataset = datasets.concatenate_datasets([tfns, tfns])
train_dataset = tmp_dataset
print(tmp_dataset.num_rows)

# Fixed seed so the shuffled order is reproducible between runs.
all_dataset = train_dataset.shuffle(seed=42)
print(all_dataset.shape)

import json
from tqdm.notebook import tqdm

def format_example(example: dict) -> dict:
    """Turn one instruction record into a prompt/answer pair.

    Args:
        example: Mapping with an ``"instruction"`` and an ``"output"`` entry,
            plus an optional ``"input"`` entry. A missing or falsy ``"input"``
            is left out of the prompt.

    Returns:
        A dict with ``"context"`` (the prompt text, ending in ``"Answer: "``)
        and ``"target"`` (the value of ``example["output"]``).

    Raises:
        KeyError: If ``"instruction"`` or ``"output"`` is missing.
    """
    prompt_parts = [f"Instruction: {example['instruction']}\n"]
    if example.get("input"):
        prompt_parts.append(f"Input: {example['input']}\n")
    prompt_parts.append("Answer: ")
    return {"context": "".join(prompt_parts), "target": example["output"]}

# Flatten the shuffled dataset into plain dicts holding the three fields that
# format_example reads.
data_list = [
    {"instruction": row.instruction, "input": row.input, "output": row.output}
    for row in all_dataset.to_pandas().itertuples()
]




# Save to a JSONL file: one formatted {"context", "target"} record per line.
# The file is opened through jsonl_path (instead of repeating the literal) so
# the writer always targets the same file that is cleaned up earlier and read
# back later; the encoding is spelled out so it does not depend on the locale.
with open(jsonl_path, 'w', encoding='utf-8') as f:
    for example in tqdm(data_list, desc="formatting.."):
        f.write(json.dumps(format_example(example)) + '\n')


import datasets
from transformers import AutoTokenizer, AutoConfig

# Path handed to AutoTokenizer / AutoConfig.from_pretrained in read_jsonl.
model_name = "../FinGPT/chatglm2-6b"
# Upper bound on the number of token ids kept per example.
max_seq_length = 512
# When True, examples longer than max_seq_length are dropped instead of trimmed.
skip_overlength = True

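# The preprocess function tokenizes the prompt and the target (each truncated to the maximum
# sequence length on its own) and joins them, followed by the EOS token, into one list of input IDs.
# It does not pad; trimming or skipping of the combined sequence happens later in read_jsonl.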
def preprocess(tokenizer, config, example, max_seq_length):
    """Tokenize one prompt/answer record into a single id sequence.

    Args:
        tokenizer: Object whose ``encode`` method accepts ``max_length``,
            ``truncation`` and ``add_special_tokens`` keyword arguments.
        config: Object exposing ``eos_token_id``.
        example: Mapping with the prompt under ``"context"`` and the answer
            under ``"target"``.
        max_seq_length: Truncation limit applied to the prompt and to the
            answer separately, so the combined result may exceed it.

    Returns:
        A dict with ``"input_ids"`` (prompt ids, then answer ids, then the EOS
        id) and ``"seq_len"`` (the number of prompt ids).
    """
    context_text, answer_text = example["context"], example["target"]
    truncation_opts = {"max_length": max_seq_length, "truncation": True}

    context_ids = tokenizer.encode(context_text, **truncation_opts)
    # The answer is encoded without special tokens, unlike the prompt.
    answer_ids = tokenizer.encode(
        answer_text, add_special_tokens=False, **truncation_opts
    )

    return {
        "input_ids": context_ids + answer_ids + [config.eos_token_id],
        "seq_len": len(context_ids),
    }

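# The read_jsonl function reads each line from the JSONL file, preprocesses it using the preprocess function,
# and then yields each preprocessed example; sequences longer than max_seq_length are either skipped
# (skip_overlength=True) or trimmed to that length.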
def read_jsonl(path, max_seq_length, skip_overlength=False):
    """Yield tokenized features for the records of a JSONL file.

    The tokenizer and config are loaded from the module-level ``model_name``
    when iteration begins. Every non-blank line is parsed as JSON and passed
    to ``preprocess``.

    Args:
        path: Path of the JSONL file; each record needs the ``"context"`` and
            ``"target"`` keys that ``preprocess`` reads.
        max_seq_length: Maximum number of ids kept in ``"input_ids"``.
        skip_overlength: When true, records whose combined sequence is longer
            than ``max_seq_length`` are dropped; otherwise they are trimmed.

    Yields:
        The dict returned by ``preprocess``, with ``"input_ids"`` cut to at
        most ``max_seq_length`` entries.
    """
    tokenizer = AutoTokenizer.from_pretrained(
        model_name, trust_remote_code=True)
    # NOTE(review): device_map is passed through unchanged from the original
    # call; confirm whether it has any effect when loading only a config.
    config = AutoConfig.from_pretrained(
        model_name, device_map='auto', trust_remote_code=True)
    # Explicit encoding so reading does not depend on the platform locale.
    with open(path, "r", encoding="utf-8") as f:
        # readlines() hands tqdm a sized list so it can size its progress bar.
        for line in tqdm(f.readlines()):
            # Tolerate blank lines instead of failing in json.loads.
            if not line.strip():
                continue
            example = json.loads(line)
            feature = preprocess(tokenizer, config, example, max_seq_length)
            if skip_overlength and len(feature["input_ids"]) > max_seq_length:
                continue
            feature["input_ids"] = feature["input_ids"][:max_seq_length]
            yield feature

### 2.3 Save the dataset
# The script then creates a Hugging Face Dataset object from the generator and saves it to disk.


def _tokenized_examples():
    """Return a fresh generator over the tokenized JSONL records."""
    return read_jsonl(jsonl_path, max_seq_length, skip_overlength)


# Build the dataset from the generator factory, then save it under save_path.
dataset = datasets.Dataset.from_generator(_tokenized_examples)
dataset.save_to_disk(save_path)



