Commit 4f4ba442 authored by mashun1

omnisql

import os
import re
import json
import random
import sqlite3
import traceback
def merge_foreign_keys_to_create_table(create_stmts, fk_stmts):
# Extract foreign key constraint information
#ALTER TABLE "performance_metrics" ADD CONSTRAINT fk_performance_metrics_app_id FOREIGN KEY ("app_id") REFERENCES applications ("app_id");
fk_constraints = {}
for alter_statement in fk_stmts:
match = re.search(r'ALTER TABLE "(\w+)" ADD CONSTRAINT (\w+) FOREIGN KEY \("(\w+)"\) REFERENCES (\w+) \("(\w+)"\)', alter_statement)
if match:
table_name = match.group(1)
constraint_name = match.group(2)
column_name = match.group(3)
ref_table_name = match.group(4)
ref_column_name = match.group(5)
if table_name in fk_constraints:
fk_constraints[table_name].append(f'CONSTRAINT {constraint_name} FOREIGN KEY ("{column_name}") REFERENCES {ref_table_name} ("{ref_column_name}")')
else:
fk_constraints[table_name] = [f'CONSTRAINT {constraint_name} FOREIGN KEY ("{column_name}") REFERENCES {ref_table_name} ("{ref_column_name}")']
# Merge foreign key constraints into the CREATE TABLE statement
modified_create_table_statements = []
for create_statement in create_stmts:
match = re.search(r'CREATE TABLE "(\w+)"', create_statement)
if match:
table_name = match.group(1)
if table_name in fk_constraints:
for fk in fk_constraints[table_name]:
create_statement = create_statement.rstrip('\n);') + '), \n ' + fk + '\n);'
modified_create_table_statements.append(create_statement)
return modified_create_table_statements
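# Illustrative example (hypothetical statements). SQLite does not support
# "ALTER TABLE ... ADD CONSTRAINT", so the FK clauses are folded back into the CREATE TABLE:
#   create: CREATE TABLE "gas_samples" ( ... PRIMARY KEY ("sample_id")\n);
#   fk:     ALTER TABLE "gas_samples" ADD CONSTRAINT fk_gas_samples_dataset_id FOREIGN KEY ("dataset_id") REFERENCES datasets ("dataset_id");
# merge_foreign_keys_to_create_table([create], [fk]) returns the CREATE TABLE ending with:
#   ... PRIMARY KEY ("sample_id"),
#   CONSTRAINT fk_gas_samples_dataset_id FOREIGN KEY ("dataset_id") REFERENCES datasets ("dataset_id")
#   );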
def verify_ddl_in_transaction(ddl_stmts, db_id):
create_stmts = ddl_stmts['create_stmts']
insert_stmts = ddl_stmts['insert_stmts']
alter_stmts = ddl_stmts['alter_stmts']
fk_stmts = ddl_stmts['fk_stmts']
stmts = merge_foreign_keys_to_create_table(create_stmts, fk_stmts)
os.makedirs(f'synthetic_sqlite_databases/{db_id}', exist_ok=True)
try:
# connect db
conn = sqlite3.connect(f'synthetic_sqlite_databases/{db_id}/{db_id}.sqlite')
cursor = conn.cursor()
# PRAGMA foreign_keys is a no-op inside an open transaction, so disable it before BEGIN
cursor.execute('PRAGMA foreign_keys = OFF;')
# begin transaction
conn.execute('BEGIN TRANSACTION')
# CREATE TABLE
for stmt in stmts:
# print(stmt)
try:
cursor.execute(stmt)
except Exception as e:
# print("Exception: ", str(e))
continue
# INSERT INTO
for stmt in insert_stmts:
# print(stmt)
try:
cursor.execute(stmt)
except Exception as e:
# print("Exception: ", str(e))
continue
cursor.execute('PRAGMA foreign_keys = ON;')
# update values in foreign key columns
for alter_stmt in alter_stmts:
stmt = alter_stmt['alter_stmt']
values = alter_stmt['values']
# create an empty dict to fill placeholder
filled_values = {}
for i, value in enumerate(values):
tp = value['type']
rg = value['range']
v = random.randint(0, rg)
if tp == "TEXT":
v = str(v)
elif tp == "INTEGER":
v = int(v)
filled_values[f'id_{i}'] = v
stmt = stmt.format(**filled_values)
try:
cursor.execute(stmt)
except Exception as e:
# print("Exception: ", str(e))
continue
# commit transaction
conn.commit()
print("Transaction committed successfully.")
except Exception as e:
# if any error occurs, roll back the transaction
conn.rollback()
print("Transaction failed and rolled back. Error:", str(e))
raise  # re-raise the original exception so the caller (verify_schema) can report it
finally:
# close the connection
conn.close()
def convert_complex_type(sql_type):
"""Converts complex types such as Array and Struct to SQLite-compatible types."""
if "Array" in sql_type:
return "TEXT" # Convert Array to TEXT (as JSON-encoded strings)
elif "Struct" in sql_type:
return "TEXT" # Convert Struct to TEXT (as JSON-encoded strings)
else:
# Mapping for standard types
type_mapping = {
"INTEGER": "INTEGER",
"VARCHAR": "TEXT", # SQLite treats all VARCHAR as TEXT
"TEXT": "TEXT",
"REAL": "REAL",
"FLOAT": "REAL",
"DATE": "TEXT",
"TIME": "TEXT",
"BOOLEAN": "INTEGER" # SQLite uses INTEGER for boolean
}
return type_mapping.get(sql_type, "TEXT") # Default to TEXT if unknown type
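# Examples (derived from the mapping above): convert_complex_type("Array(INTEGER)") -> "TEXT",
# convert_complex_type("FLOAT") -> "REAL", and an unrecognized type such as "GEOMETRY" falls back to "TEXT".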
def format_value_for_sqlite(value, column_type):
"""Formats values for SQLite, including handling Array and Struct types."""
if "Array" in column_type or "Struct" in column_type:
# Convert complex types (Array, Struct) to JSON strings
return f"'{json.dumps(value)}'"
elif isinstance(value, str):
# Escape single quotes in strings using replace before f-string
value = value.replace("'", "''")
return f"'{value}'"
elif value is None:
return "NULL"
return str(value)
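# Examples: format_value_for_sqlite("O'Brien", "VARCHAR") -> "'O''Brien'" (single quotes escaped),
# format_value_for_sqlite(None, "INTEGER") -> "NULL",
# format_value_for_sqlite(["a", "b"], "Array(VARCHAR)") -> the JSON string ["a", "b"] wrapped in single quotes.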
def generate_sqlite_ddl(json_schema):
"""Generates SQLite DDL statements including primary and foreign keys, table descriptions, and sample row insertion."""
result = {}
ddl_statements = []
insert_stmts = []
foreign_key_statements = set()
foreign_keys_alter = {}
foreign_keys_alter_stmts = []
rows_cnt = {}
table_pk = {}
table_cols = {}
table_types = {}
for table in json_schema['tables']:
table_name = table['table_name']
table_description = table.get('table_description', '')
column_names = table['column_names']
column_types = table['column_types']
descriptions = table['column_descriptions']
primary_key = table.get('primary_key', [])
sample_rows = table.get('sample_rows', [])
# Step 1: Create table comment (table description as a comment)
# if table_description:
# ddl_statements.append(f'-- {table_description}')
# Step 2: Create table without foreign key constraints
columns_ddl = []
table_cols[table_name] = column_names
table_types[table_name] = column_types
for i, column_name in enumerate(column_names):
column_type = convert_complex_type(column_types[i])
description = descriptions[i]
columns_ddl.append(f'"{column_name}" {column_type} /* {description} */')
# Add primary key constraint
if primary_key:
table_pk[table_name] = primary_key
pk_columns = ', '.join(f'"{col}"' for col in primary_key)
columns_ddl.append(f'PRIMARY KEY ({pk_columns})')
ddl = f'CREATE TABLE "{table_name}" (\n ' + ',\n '.join(columns_ddl) + '\n);'
ddl_statements.append(ddl)
rows_cnt[table_name] = len(sample_rows)
# Insert sample rows
if sample_rows:
for idx, row in enumerate(sample_rows):
# if idx > 2: #
# break
# Find the index of the primary key column
pk_indices = [column_names.index(key) for key in primary_key]
values = [format_value_for_sqlite(value, column_types[i]) for i, value in enumerate(row)]
for pk_idx in pk_indices:
type_str = convert_complex_type(column_types[pk_idx])
if type_str == 'TEXT':
values[pk_idx] = str(idx)
elif type_str == 'INTEGER':
values[pk_idx] = idx
elif type_str == "REAL":
values[pk_idx] = float(idx)
if len(column_names) != len(values):
continue
values = ", ".join([str(value) for value in values])
# print(values)
insert_stmt = f'INSERT INTO "{table_name}" ({", ".join(column_names)}) VALUES ({values});'
insert_stmts.append(insert_stmt)
table_sets = {}
for table_name, pks in table_pk.items():
table_sets[table_name] = set(pks)
for fk in json_schema['foreign_keys']:
table_name = fk['source_table']
src_cols = fk['column_in_source_table'] if type(fk['column_in_source_table']) == list else [fk['column_in_source_table']]
ref_cols = fk['column_in_referenced_table'] if type(fk['column_in_referenced_table']) == list else [fk['column_in_referenced_table']]
real_src_cols = []
real_ref_cols = []
for src_col, ref_col in zip(src_cols, ref_cols):
if src_col in table_sets[table_name]:
continue
real_ref_cols.append(ref_col)
real_src_cols.append(src_col)
if len(real_src_cols) == 0:
continue
fk_source_cols = ', '.join(f'"{col}"' for col in real_src_cols)
fk_ref_table = fk['referenced_table']
fk_ref_cols = ', '.join(f'"{col}"' for col in real_ref_cols)
column_names = table_cols[table_name]
column_types = table_types[table_name]
fk_stmt = (f'ALTER TABLE "{table_name}" '
f'ADD CONSTRAINT fk_{table_name}_{"_".join(real_src_cols)} '
f'FOREIGN KEY ({fk_source_cols}) REFERENCES {fk_ref_table} ({fk_ref_cols});')
if fk_stmt in foreign_key_statements:
continue
foreign_key_statements.add(fk_stmt)
if table_name in foreign_keys_alter:
for i in range(len(real_src_cols)):
foreign_keys_alter[table_name]['ref_table'].append(fk_ref_table)
foreign_keys_alter[table_name]['fk_cols'].extend(real_src_cols)
foreign_keys_alter[table_name]['fk_types'].extend([convert_complex_type(column_types[column_names.index(fk)]) for fk in real_src_cols])
else:
foreign_keys_alter[table_name] = {
"src_table": table_name,
"ref_table": [fk_ref_table],
"fk_cols": real_src_cols,
"fk_types": [convert_complex_type(column_types[column_names.index(fk)]) for fk in real_src_cols],
"pk_cols": table_pk[table_name],
"pk_types": [convert_complex_type(column_types[column_names.index(pk)]) for pk in table_pk[table_name]]
}
# for stmt in ddl_statements:
# pass
# Alter table for foreign key constraint DDL
for table_name, fk_alter in foreign_keys_alter.items():
source_table = fk_alter["src_table"]
ref_table = fk_alter["ref_table"]
src_row_num = rows_cnt[source_table]
ref_row_num = [rows_cnt[ref] for ref in ref_table]
pk_cols = fk_alter["pk_cols"]
pk_types = fk_alter["pk_types"]
cols = fk_alter["fk_cols"]
types = fk_alter["fk_types"]
for i in range(src_row_num):
ddl_stmt = f"UPDATE {source_table} SET "
fk_des = []
for j, col, tp in zip(range(len(cols)), cols, types):
id = random.randint(0, ref_row_num[j]-1)
fk_des.append({"type": tp, "range": ref_row_num[j]-1})
if tp == "TEXT":
id = str(id)
elif tp == "REAL":
id = float(id)
ddl_stmt += (f"{col}"+" = {id_"+str(j)+"}, ")
ddl_stmt = ddl_stmt.strip()[:-1] + " WHERE "
for j, pk, ptp in zip(range(len(pk_cols)) , pk_cols, pk_types):
i_v = i
if ptp == "TEXT":
i_v = str(i_v)
elif ptp == "REAL":
i_v = float(i_v)
if j == 0:
ddl_stmt += f"{pk} = {i_v}"
else:
ddl_stmt += f" and {pk} = {i_v}"
ddl_stmt += ";"
foreign_keys_alter_stmts.append({"alter_stmt": ddl_stmt, "values": fk_des})
# execute update
# for stmt in foreign_key_statements:
# pass
result["create_stmts"] = ddl_statements
result["insert_stmts"] = insert_stmts
result["alter_stmts"] = foreign_keys_alter_stmts
result["fk_stmts"] = list(foreign_key_statements)
return result
# Example usage:
json_schema_str = '''{
"tables": [
{
"table_name": "datasets",
"table_description": "Stores details of all greenhouse gas datasets collected from global sites.",
"column_names": ["dataset_id", "dataset_number", "site_id", "category", "gas_name", "sampling_method", "frequency", "year", "download_link", "readme_link"],
"column_types": ["INTEGER", "INTEGER", "INTEGER", "VARCHAR", "VARCHAR", "VARCHAR", "VARCHAR", "INTEGER", "VARCHAR", "VARCHAR"],
"column_descriptions": [
"Unique identifier for each dataset",
"Number assigned to the dataset",
"Reference to the site where the data was collected",
"Category of the data (e.g., Greenhouse Gases)",
"Name of the gas being monitored",
"Method of sampling (e.g., Surface PFP, Aircraft PFP, Flask)",
"Sampling frequency (e.g., Discrete, Continuous)",
"Year when the data was collected",
"Link to download the dataset",
"Link to the readme or metadata of the dataset"
],
"primary_key": ["dataset_id"],
"sample_rows": [
[151, 151, 1, "Greenhouse Gases", "Carbon Dioxide(CO2)", "Surface PFP", "Discrete", 2023, "download_link_151", "readme_link_151"],
[152, 152, 2, "Greenhouse Gases", "Carbon Dioxide(CO2)", "Aircraft PFP", "Discrete", 2023, "download_link_152", "readme_link_152"]
]
},
{
"table_name": "sites",
"table_description": "Details of the sites where greenhouse gas samples are collected.",
"column_names": ["site_id", "site_name", "location", "country", "contact_email"],
"column_types": ["INTEGER", "VARCHAR", "VARCHAR", "VARCHAR", "VARCHAR"],
"column_descriptions": [
"Unique identifier for each site",
"Name of the site",
"Geographical location of the site",
"Country where the site is located",
"Contact email for the site or environmental team"
],
"primary_key": ["site_id"],
"sample_rows": [
[1, "West Branch, Iowa", "West Branch, Iowa, United States", "USA", "contact@westbranch.us"],
[2, "Walnut Grove, California", "Walnut Grove, California, United States", "USA", "contact@walnutgrove.us"]
]
},
{
"table_name": "sampling_methods",
"table_description": "Details of various sampling methods used for collecting air samples.",
"column_names": ["method_id", "method_name", "description"],
"column_types": ["INTEGER", "VARCHAR", "TEXT"],
"column_descriptions": [
"Unique identifier for each sampling method",
"Name of the sampling method (e.g., Surface PFP, Aircraft PFP)",
"Detailed description of the sampling method"
],
"primary_key": ["method_id"],
"sample_rows": [
[1, "Surface PFP", "Surface flask sampling for air composition"],
[2, "Aircraft PFP", "Aircraft-based flask sampling for higher altitude air"]
]
},
{
"table_name": "gas_samples",
"table_description": "Raw data of the gas concentrations measured at each site.",
"column_names": ["sample_id", "dataset_id", "gas_name", "concentration", "measurement_date", "measurement_time"],
"column_types": ["INTEGER", "INTEGER", "VARCHAR", "FLOAT", "DATE", "TIME"],
"column_descriptions": [
"Unique identifier for each gas sample",
"Reference to the dataset from which the sample is drawn",
"Name of the gas measured (e.g., CO2, CH4)",
"Concentration of the gas in ppm (parts per million)",
"Date of the measurement",
"Time of the measurement"
],
"primary_key": ["sample_id"],
"sample_rows": [
[1, 151, "Carbon Dioxide(CO2)", 405.2, "2023-05-01", "12:00:00"],
[2, 152, "Carbon Dioxide(CO2)", 407.8, "2023-05-02", "12:30:00"]
]
},
{
"table_name": "users",
"table_description": "Details of users accessing the datasets and samples.",
"column_names": ["user_id", "user_name", "email", "organization", "role"],
"column_types": ["INTEGER", "VARCHAR", "VARCHAR", "VARCHAR", "VARCHAR"],
"column_descriptions": [
"Unique identifier for each user",
"Full name of the user",
"Email address of the user",
"Organization the user belongs to",
"Role of the user (e.g., researcher, admin, viewer)"
],
"primary_key": ["user_id"],
"sample_rows": [
[101, "Dr. Alice Green", "alice.green@enviroresearch.org", "EnviroResearch", "researcher"],
[102, "John Doe", "john.doe@climatelabs.org", "Climate Labs", "admin"]
]
}
],
"foreign_keys": [
{
"source_table": "datasets",
"column_in_source_table": "site_id",
"referenced_table": "sites",
"column_in_referenced_table": "site_id"
},
{
"source_table": "gas_samples",
"column_in_source_table": "dataset_id",
"referenced_table": "datasets",
"column_in_referenced_table": "dataset_id"
}
]
}'''
def verify_schema(json_schema, db_id):
# Convert the schema into DDL statements
try:
ddl_stmts = generate_sqlite_ddl(json_schema)
verify_ddl_in_transaction(ddl_stmts, db_id)
return True
except Exception as e:
print("Exception type:", type(e))
print("Exception message:", e)
# traceback.print_exc()
return False
# Print the DDL output
# print(ddl_output["create_stmts"])
# print(ddl_output["insert_stmts"])
# print(ddl_output["alter_stmts"])
# print(ddl_output["fk_stmts"])
if __name__ == "__main__":
verify_schema(json.loads(json_schema_str), "test_db")
\ No newline at end of file
import argparse
import json
import os
import re
import time
import json_repair
import openai
def parse_response(response):
domain_pattern = r'(?<=\[START_DOMAIN\])(.*?)(?=\[END_DOMAIN\])'
scenario_pattern = r'(?<=\[START_SCENARIO\])(.*?)(?=\[END_SCENARIO\])'
schema_pattern = r'(?<=\[START_DATABASE_SCHEMA\])(.*?)(?=\[END_DATABASE_SCHEMA\])'
try:
domain_match = re.search(domain_pattern, response, re.DOTALL)
domain = domain_match.group(0).strip() if domain_match else None
scenario_match = re.search(scenario_pattern, response, re.DOTALL)
scenario = scenario_match.group(0).strip() if scenario_match else None
schema_match = re.search(schema_pattern, response, re.DOTALL)
schema = schema_match.group(0).strip() if schema_match else None
schema_dict = json_repair.loads(schema)
schema = json.dumps(schema_dict, indent=2, ensure_ascii=False)
return domain, scenario, schema
except Exception as e:
print(response)
print("Parsing Exception:", str(e))
return None, None, None
def llm_inference(model, base_url, prompts):
'''
This function leverages a large language model (LLM) to generate responses for a given list of prompts.
You can integrate your preferred LLM within this function.
Args:
model: The LLM to be used for inference.
prompts: A list of prompts for which the LLM will generate responses.
Returns:
A list of dictionaries containing the prompt, the generated response, and extracted components
(domain, scenario, schema) from the response. Invalid responses are filtered out.
'''
client = openai.OpenAI(
base_url=base_url,
api_key="EMPTY"
)
# Generate responses using the LLM (each prompt corresponds to one response)
# responses = None # Replace this with the actual LLM call, e.g., model.generate(prompts, temperature=0, n=1)
responses = []
for prompt in prompts:
response = client.chat.completions.create(
model=model,
messages=[{"role":"user", "content": prompt}],
max_tokens=4196,
temperature=0.8
)
responses.append(response.choices[0].message.content.strip())
# Initialize a list to store the processed results
results = []
# Iterate over prompts and their corresponding responses
for prompt, response in zip(prompts, responses):
# Parse the response to extract domain, scenario, and schema
domain, scenario, schema = parse_response(response)
# Filter out invalid responses where any component is missing
if domain is None or scenario is None or schema is None:
continue
# Append valid results to the list
results.append({
"prompt": prompt,
"generated_content": {
"response": response,
"domain": domain,
"scenario": scenario,
"schema": schema
}
})
return results
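# Minimal usage sketch (the model name and vLLM endpoint below are placeholders):
#   results = llm_inference("your-model-name", "http://127.0.0.1:8000/v1", prompts[:2])
#   for r in results:
#       print(r["generated_content"]["domain"], r["generated_content"]["scenario"])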
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--model", type = str)
parser.add_argument("--nums", type=int, default=None)
parser.add_argument("--base_url", type=str)
args = parser.parse_args()
print(args)
prompts = json.load(open("./prompts/prompts_schema_synthesis.json"))[:args.nums]
output_file = "./results/schema_synthesis.json"
results = llm_inference(args.model, args.base_url, prompts)
with open(output_file, "w", encoding = "utf-8") as f:
f.write(json.dumps(results, indent = 2, ensure_ascii = False))
# Stylized Natural Language Question Synthesis
This is the third step of our data synthesis framework, dedicated to generating stylized natural language questions for the synthesized SQL queries.
## Step 1: Question Generation
Generate stylized natural language questions.
```bash
# Create the prompts used for question generation
mkdir prompts
python3 generate_question_synthesis_prompts.py
```
```bash
# Generate questions for the synthesized SQL queries
mkdir results
python3 synthesize_question.py --model model_name --base_url vllm_serve_url(http://x.x.x.x:8000/v1)
```
## Step 2: Post-Processing
```bash
# Run semantic consistency selection to ensure the generated questions align closely with their corresponding SQL queries
export HF_ENDPOINT=https://hf-mirror.com
python3 post_process_questions.py
```
\ No newline at end of file
# Stylized Natural Language Question Synthesis
This is the third step in our data synthesis framework, dedicated to generating stylized natural language questions for synthetic SQL queries.
## Step 1: Question Generation
Generate stylized natural language questions.
1. Run `python3 generate_question_synthesis_prompts.py` to create prompts for question generation.
2. Execute `python3 synthesize_question.py` to generate questions for the synthesized SQL queries. Note: Ensure the `llm_inference()` function is implemented to integrate your preferred LLM. For each prompt (SQL query), we sample multiple responses (questions) with a temperature of `0.8`.
## Step 2: Post-Processing
1. Execute `python3 post_process_questions.py` to perform semantic consistency selection, ensuring the generated questions align closely with their corresponding SQL queries.
2. The final synthetic `<question, SQL>` pairs will be saved to `./results/question_and_sql_pairs.json`.
\ No newline at end of file
import json
import os
import random
import sqlite3
import numpy as np
import re
from tqdm import tqdm
style2desc = {
"Formal": '''**Formal Style**
- Uses standard grammar and vocabulary.
- Example: Find all students older than 18 years and return their home addresses.''',
"Colloquial": '''**Colloquial Style**
- Employs informal vocabulary and expressions.
- Example: Hey! Could you help me find all the students who are over 18? I'd love to know their names and where they live.''',
"Imperative": '''**Imperative Style**
- Uses command or directive sentences.
- Example: Could you please gather all the students who are older than 18? I really need to know their names and where they live!''',
"Interrogative": '''**Interrogative Style**
- Uses question forms.
- Example: Could you tell me which students are older than 18 and what their home addresses are?''',
"Descriptive": '''**Descriptive Style**
- Uses detailed descriptions with contextual information.
- Example: I want to know the names and home addresses of all students older than 18.''',
"Concise": '''**Concise Style**
- Uses short sentences.
- Example: Students older than 18, return their names and addresses.''',
"Vague": '''**Vague Style**
- Includes ambiguous vocabulary requiring inference.
- Example: What are the names and addresses of those older students? (External Knowledge: 'older students' refers to age >= 18.)''',
"Metaphorical": '''**Metaphorical Style**
- Uses metaphors or metaphorical expressions.
- Example: Find the names and addresses of those who have reached adulthood. (External Knowledge: 'reached adulthood' refers to age >= 18.)''',
"Multi-turn Dialogue": '''**Multi-turn Dialogue Style**
- This involves a dialogue to clarify the user's query needs.
- Example: [{"User": "I want to query some student information."}, {"Assistant": "Which students' information would you like to query?"}, {"User": "Students older than 18."}, {"Assistant": "What other information would you like to know about them?"}, {"User": "Names and addresses."}, {"Assistant": "Is there anything else you need?"}, {"User": "No."}, {"Assistant": "OK, I will help you translate your request into an SQL query."}]'''
}
steps_wo_ek = '''1. **Explain the SQL Query:** Provide a detailed explanation of what the query does.
2. **Generate a Question:** Formulate a natural language question based on the SQL query and explanation.'''
steps_w_ek = '''1. **Explain the SQL Query:** Provide a detailed explanation of what the query does.
2. **Generate a Question:** Formulate a natural language question based on the SQL query and explanation.
3. **External Knowledge:** For Vague or Metaphorical styles, include external knowledge to enhance clarity.'''
steps_multi_round = '''1. **Explain the SQL Query:** Provide a detailed explanation of what the query does.
2. **Generate a Dialogue:** Create a conversation between the User and the Assistant based on the SQL query and its explanation.'''
guidelines_wo_ek = '''1. Clearly describe the columns being selected by the SQL query. For example:
- "SELECT * ... FROM ..." means "Find all ...";
- "SELECT f.check_date, f.status, f.remarks, c.year, c.year_min, c.year_max, c.year_average, c.data_quality_score FROM ..." means "Return the check dates, statuses, remarks, years, minimum years, maximum years, average years, and quality scores for ...".
2. Ensure the natural language question accurately captures the semantics of the SQL query, including conditions such as predicates, `ORDER BY`, and `LIMIT` clauses.'''
guidelines_w_ek = '''1. Clearly describe the columns being selected by the SQL query. For example:
- "SELECT * ... FROM ..." means "Find all ...";
- "SELECT f.check_date, f.status, f.remarks, c.year, c.year_min, c.year_max, c.year_average, c.data_quality_score FROM ..." means "Return the check dates, statuses, remarks, years, minimum years, maximum years, average years, and quality scores for ...".
2. Ensure the natural language question accurately captures the semantics of the SQL query, including conditions such as predicates, `ORDER BY`, and `LIMIT` clauses.
3. If necessary, incorporate external knowledge using multiple entries separated by semicolons (";"). These can include formulas, common sense, domain-specific knowledge, or extended context, such as information from long documents. Each entry should be concise.'''
guidelines_multi_round = '''1. Clearly describe the columns being selected by the SQL query. For example:
- "SELECT * ... FROM ..." means "Find all ...";
- "SELECT f.check_date, f.status, f.remarks, c.year, c.year_min, c.year_max, c.year_average, c.data_quality_score FROM ..." means "Return the check dates, statuses, remarks, years, minimum years, maximum years, average years, and quality scores for ...".
2. Ensure the conversation accurately captures the semantics of the SQL query, including conditions such as predicates, `ORDER BY`, and `LIMIT` clauses.'''
output_format_wo_ek = '''Please structure your response as follows:
[EXPLANATION-START]
(SQL Explanation)
[EXPLANATION-END]
[QUESTION-START]
(Natural Language Question)
[QUESTION-END]
- **SQL Explanation**: Provide a clear and detailed explanation of the SQL query, enclosed within [EXPLANATION-START] and [EXPLANATION-END].
- **Natural Language Question**: Translate the SQL query into a natural language question, enclosed within [QUESTION-START] and [QUESTION-END].'''
output_format_w_ek = '''Please structure your response as follows:
[EXPLANATION-START]
(SQL Explanation)
[EXPLANATION-END]
[QUESTION-START]
(Natural Language Question)
[QUESTION-END]
[EXTERNAL-KNOWLEDGE-START]
(External Knowledge)
[EXTERNAL-KNOWLEDGE-END]
- **SQL Explanation**: Provide a clear and detailed explanation of the SQL query, enclosed within [EXPLANATION-START] and [EXPLANATION-END].
- **Natural Language Question**: Translate the SQL query into a natural language question, enclosed within [QUESTION-START] and [QUESTION-END].
- **External Knowledge**: Include any relevant external knowledge if applicable, enclosed within [EXTERNAL-KNOWLEDGE-START] and [EXTERNAL-KNOWLEDGE-END]. Leave this section blank if not needed.'''
output_format_multi_round = '''Please structure your response as follows:
[EXPLANATION-START]
(SQL Explanation)
[EXPLANATION-END]
[QUESTION-START]
(Natural Language Question, in the format of [{"User": ...}, {"Assistant": ...}, {"User": ...}, ....])
[QUESTION-END]
- **SQL Explanation**: Provide a clear and detailed explanation of the SQL query, enclosed within [EXPLANATION-START] and [EXPLANATION-END].
- **Natural Language Question**: Convert the SQL query into a multi-round dialogue, enclosed within [QUESTION-START] and [QUESTION-END]. Represent this as a list that captures multiple rounds of conversation between the User and the Assistant.'''
instruction_wo_ek = "Based on the above information, follow the reasoning steps to generate the explanation and the question corresponding to the SQL query."
instruction_w_ek = "Based on the above information, follow the reasoning steps to generate the explanation, the question, and the external knowledge corresponding to the SQL query."
instruction_multi_round = "Based on the above information, follow the reasoning steps to generate the explanation and the dialogue corresponding to the SQL query."
def obtain_db_schema(db_file_dir):
conn = sqlite3.connect(db_file_dir)
cursor = conn.cursor()
cursor.execute("SELECT name, sql FROM sqlite_master WHERE type='table';")
tables = cursor.fetchall()
table_names = []
create_statements = []
for table in tables:
table_name, create_statement = table
table_names.append(table_name)
create_statements.append(create_statement)
cursor.close()
conn.close()
return table_names, create_statements
# NOTE: When columns with the same names exist in different tables, more detailed design considerations are necessary
def extract_column_descriptions(create_statements):
column_name2column_desc = dict()
# Regular expression to match column definitions
pattern = r'"(\w+)"\s+\w+\s*/\*\s*(.*?)\s*\*/'
for create_statement in create_statements:
# Find all matches in the string
matches = re.findall(pattern, create_statement)
# Print the results
for column_name, description in matches:
column_name = column_name.lower()
if column_name not in column_name2column_desc:
column_name2column_desc[column_name] = description
return column_name2column_desc
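# Example: a column definition produced by the database synthesis step, e.g.
#   "site_id" INTEGER /* Unique identifier for each site */
# is parsed into {"site_id": "Unique identifier for each site"} (column names are lower-cased).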
if __name__ == "__main__":
random.seed(42)
db_path = "../database_synthesis/synthetic_sqlite_databases"
sql_infos = json.load(open("../sql_synthesis/results/synthetic_sqls.json"))
question_synthesis_template = open("./prompt_templates/question_synthesis_prompt.txt").read()
styles = ["Formal", "Colloquial", "Imperative", "Interrogative", "Descriptive", "Concise", "Vague", "Metaphorical", "Multi-turn Dialogue"]
print(sql_infos[0])
db_ids = list(set([sql["db_id"] for sql in sql_infos]))
print(len(db_ids))
db_id2column_info = dict()
for db_id in tqdm(db_ids):
table_names, create_statements = obtain_db_schema(os.path.join(db_path, db_id, db_id + ".sqlite"))
db_id2column_info[db_id] = extract_column_descriptions(create_statements)
prompts = []
for sql_info in tqdm(sql_infos):
style_name = random.sample(styles, 1)[0]
column_name2column_desc = db_id2column_info[sql_info["db_id"]]
used_column_name2column_desc = dict()
for column_name, column_desc in column_name2column_desc.items():
if column_name.lower() in sql_info["sql"].lower():
used_column_name2column_desc[column_name] = column_desc
if style_name in ["Vague", "Metaphorical"]: # "Vague" and "Metaphorical" styles require external knowledge
steps = steps_w_ek
guidelines = guidelines_w_ek
instruction = instruction_w_ek
output_format = output_format_w_ek
elif style_name == "Multi-turn Dialogue": # the "Multi-turn Dialogue" style uses a special multi-round format
steps = steps_multi_round
guidelines = guidelines_multi_round
instruction = instruction_multi_round
output_format = output_format_multi_round
else:
steps = steps_wo_ek
guidelines = guidelines_wo_ek
instruction = instruction_wo_ek
output_format = output_format_wo_ek
prompt = question_synthesis_template.format(
style_desc = style2desc[style_name].strip(),
engine = "SQLite",
column_info = json.dumps(used_column_name2column_desc, indent = 2, ensure_ascii = False).strip(),
sql = sql_info["sql"].strip(),
steps = steps.strip(),
guidelines = guidelines.strip(),
output_format = output_format.strip(),
instruction = instruction.strip()
)
sql_info["style"] = style_name
sql_info["prompt"] = prompt
with open("prompts/question_synthesis_prompts.json", "w", encoding="utf-8") as f:
f.write(json.dumps(sql_infos, indent=2, ensure_ascii=False))
\ No newline at end of file
import json
import re
import time
import random
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import numpy as np
import math
from scipy.spatial.distance import cdist
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
def visualize_embeddings(embeddings, min_index):
pca = PCA(n_components=2)
embeddings_2d = pca.fit_transform(embeddings)
plt.figure(figsize=(8, 6))
plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], color='red', label='Other Points')
plt.scatter(embeddings_2d[min_index, 0], embeddings_2d[min_index, 1], color='blue', label='Central Point', s=100)
plt.legend()
plt.title('2D PCA of Embeddings')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.savefig(f"embeddings/figure-{random.randint(0,10000000000)}")
def parse_llm_response(response, style):
explanation_pattern = re.compile(r'\[EXPLANATION-START\](.*?)\[EXPLANATION-END\]', re.DOTALL)
question_pattern = re.compile(r'\[QUESTION-START\](.*?)\[QUESTION-END\]', re.DOTALL)
external_knowledge_pattern = re.compile(r'\[EXTERNAL-KNOWLEDGE-START\](.*?)\[EXTERNAL-KNOWLEDGE-END\]', re.DOTALL)
explanation_match = explanation_pattern.search(response)
question_match = question_pattern.search(response)
external_knowledge_match = external_knowledge_pattern.search(response)
explanation_content = explanation_match.group(1).strip() if explanation_match else ""
question_content = question_match.group(1).strip() if question_match else ""
external_knowledge_content = external_knowledge_match.group(1).strip() if external_knowledge_match else ""
if style == "Multi-turn Dialogue":
# parse dialogue
try:
dialog = ""
for turn in json.loads(question_content):
dialog += "**" + list(turn.keys())[0] + "**: " + list(turn.values())[0] + "\n"
question_content = dialog
except Exception as e:
print(e)
return None
if explanation_content == "" or question_content == "":
return None
else:
return {
"question": question_content.strip(),
"explanation": explanation_content.strip(),
"external_knowledge": external_knowledge_content.strip()
}
def integrate_info(sql2question_prompt_info, question_info):
if sql2question_prompt_info["db_id"].endswith(".db"):
db_id = sql2question_prompt_info["db_id"][:-3]
else:
db_id = sql2question_prompt_info["db_id"]
return {
"db_id": db_id,
"sql": sql2question_prompt_info["sql"],
"sql_result_column_count": sql2question_prompt_info["column_count"],
"sql_result_rows_count": sql2question_prompt_info["rows"],
"sql_complexity": sql2question_prompt_info["complexity"],
"question_style": sql2question_prompt_info["style"],
"sql_explanation": question_info["explanation"],
"question": question_info["question"],
"external_knowledge": question_info["external_knowledge"]
}
def edu_distance(vector1, vector2):
distance = 0
for num1, num2 in zip(vector1, vector2):
distance += (num1-num2) ** 2
return math.sqrt(distance)
if __name__ == "__main__":
input_dataset = json.load(open("./results/question_synthesis.json"))
output_file = "./results/question_and_sql_pairs.json"
print("loading SentenceTransformer....")
embedding_model = SentenceTransformer(model_name_or_path = "sentence-transformers/all-mpnet-base-v2", device = "cuda:0")
valid_questions_num = []
result_dataset = []
for data in tqdm(input_dataset):
question_infos = []
for response in data["responses"]:
question_info = parse_llm_response(response, data["style"])
if question_info is not None:
question_infos.append(question_info)
valid_questions_num.append(len(question_infos))
if len(question_infos) == 0: # no valid question
continue
elif len(question_infos) == 1: # only one valid question
result_dataset.append(integrate_info(data, question_infos[0]))
elif len(question_infos) == 2: # two valid questions
# we randomly select one of them
result_dataset.append(integrate_info(data, random.sample(question_infos, 1)[0]))
else: # more than two valid questions
# we vote the final question according to the EK+question embeddings
texts = [question_info["external_knowledge"] + " " + question_info["question"] for question_info in question_infos]
texts = [text.strip() for text in texts]
# we vote the final question according to the question embeddings
# texts = [question_info["question"] for question_info in question_infos]
embeddings = embedding_model.encode(texts)
# find the index of the question at the central point
distance_matrix = cdist(embeddings, embeddings, metric = 'cosine') # metric='cityblock' or metric='euclidean'
distance_sums = distance_matrix.sum(axis = 1)
min_index = np.argmin(distance_sums)
result_dataset.append(integrate_info(data, question_infos[min_index]))
# print("EK:\n", integrate_info(data, question_infos[min_index])["external_knowledge"])
# print("Question:\n", integrate_info(data, question_infos[min_index])["question"])
# print("SQL:\n", integrate_info(data, question_infos[min_index])["sql"])
# print("---------------------------------------")
# visualize_embeddings(embeddings, min_index)
with open(output_file, "w", encoding="utf-8") as f:
f.write(json.dumps(result_dataset, indent=2, ensure_ascii=False))
question_num2count = dict()
for num in valid_questions_num:
if num in question_num2count:
question_num2count[num] += 1
else:
question_num2count[num] = 1
print(question_num2count)
\ No newline at end of file
**Task Overview**
Your task is to create a high-quality natural language question based on a given SQL query and other information.
**Style**
The natural language question should follow this style:
{style_desc}
**Database Engine**
{engine}
**Column Information**
Below are column names and their corresponding descriptions:
{column_info}
**SQL Query**
Given SQL query:
```sql
{sql}
```
**Reasoning Steps**
{steps}
**Guidelines**
{guidelines}
**Output Format**
{output_format}
**Instruction**
{instruction}
\ No newline at end of file
import argparse
import json
from tqdm import tqdm
import openai
def llm_inference(model, base_url, dataset):
"""
Perform LLM inference to generate multiple responses for each prompt in the dataset.
Args:
model: The LLM used for inference.
dataset: A list of dictionaries.
Returns:
A list of dictionaries, where each dictionary includes the original data and the corresponding generated responses.
"""
client = openai.OpenAI(
base_url=base_url,
api_key="EMPTY"
)
prompts = [data["prompt"] for data in dataset]
# Placeholder for storing generated responses for each prompt
# Each element in `responses_list` is a list of responses (strings) corresponding to a prompt.
responses_list = [] # Replace this with your actual response generation logic.
for prompt in prompts:
# sample several candidate questions per prompt; the sample count below is illustrative, adjust as needed
response = client.chat.completions.create(
model=model,
messages=[{"role":"user", "content": prompt}],
max_tokens=4196,
temperature=0.8,
n=8
)
responses_list.append([choice.message.content.strip() for choice in response.choices])
# Initialize an empty list to store the results
results = []
# Iterate through the dataset and the corresponding responses
for data, responses in zip(dataset, responses_list):
# Add the generated responses to the current data entry
data["responses"] = responses
# Append the updated data entry to the results
results.append(data)
return results
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--model", type = str)
parser.add_argument("--base_url", type=str)
opt = parser.parse_args()
print(opt)
input_dataset = json.load(open("./prompts/question_synthesis_prompts.json"))
output_file = "./results/question_synthesis.json"
results = llm_inference(opt.model, opt.base_url, input_dataset)
with open(output_file, "w", encoding = "utf-8") as f:
f.write(json.dumps(results, indent = 2, ensure_ascii = False))
\ No newline at end of file
# Complexity-Aware SQL Query Generation
## Step 1: SQL Query Generation
Generate SQL queries by leveraging database schemas, database values, query complexity, and SQLite-supported functions.
```bash
# Create the prompts used for SQL query generation
mkdir prompts
python3 generate_sql_synthesis_prompts.py
```
```bash
# Generate SQL queries with an LLM
python3 synthesize_sql.py --model model_name --base_url vllm_serve_url(http://x.x.x.x:8000/v1)
```
## Step 2: Post-Processing
Refine the generated SQL queries to ensure quality and remove invalid or redundant queries.
```bash
python3 post_process_sqls.py
```
\ No newline at end of file
# Complexity-Aware SQL Query Generation
This is the second step in our data synthesis framework, focused on generating complexity-aware SQL queries based on synthetic databases.
## Step 1: SQL Query Generation
Generate SQL queries by leveraging database schemas, database values, query complexity, and SQLite-supported functions.
1. Execute `python3 generate_sql_synthesis_prompts.py` to create prompts for SQL query generation.
2. Run `python3 synthesize_sql.py` to generate SQL queries using LLMs. (Note: Implement the `llm_inference()` function to integrate your preferred LLM.)
## Step 2: Post-Processing
Refine the generated SQL queries to ensure quality and remove invalid or redundant queries:
1. Run `python3 post_process_sqls.py` to:
- Discard non-SELECT queries.
- Remove queries with syntax errors or execution timeouts.
- Deduplicate queries based on their templates.
2. The final synthetic SQL queries will be saved in `./results/synthetic_sqls.json`.
import json
import os
import random
import sqlite3
import numpy as np
from tqdm import tqdm
sql_func_template = '''
### SQL Functions
You may consider one or more of the following SQL functions while generating the query:
{sql_funcs}
Important tips:
Except for the functions listed above, you may use any other functions as long as they conform to the syntax of the database engine.
'''
insert_stmts_template = '''
### INSERT INTO Statements
Below are several `INSERT INTO` statements. Use these to help generate predicates (i.e., `WHERE` clauses) in your SQL query:
{insert_statements}
'''
simple_criterion = '''**Criteria:**
Simple SQL queries may satisfy one or more of the following criteria:
- Simple queries should select data from a single table only.
- Basic aggregate functions are permitted, such as `COUNT`, `SUM`, `AVG`, `MIN`, `MAX`.
- No joins are allowed; the query must operate on a single table.
**Example of Simple SQL Query:**
```sql
SELECT name, department_name
FROM employees
WHERE level > 5
ORDER BY age DESC;
```'''
moderate_criterion = '''**Criteria:**
Moderate SQL queries may satisfy one or more of the following criteria:
- Involves table joins, such as `JOIN`, `INNER JOIN`, `LEFT JOIN`, `CROSS JOIN`, etc.
- Includes subqueries within the `SELECT` or `WHERE` clauses.
- Utilizes aggregate functions alongside a `GROUP BY` clause.
- Contains complex `WHERE` conditions, including `IN`, `BETWEEN`, `LIKE`.
- Incorporates a `HAVING` clause to filter aggregated results.
- Uses aggregate functions like `COUNT`, `SUM`, `AVG`, `MIN`, `MAX`, etc.
**Example of Moderate SQL Query:**
```sql
SELECT e.name, d.department_name, AVG(s.salary) AS average_salary
FROM employees e
INNER JOIN departments d ON e.department_id = d.department_id
LEFT JOIN salaries s ON e.employee_id = s.employee_id
WHERE e.age > 30 AND e.status = 'active'
GROUP BY e.name, d.department_name
HAVING AVG(s.salary) > 50000;
```'''
complex_criterion = '''**Criteria:**
Complex SQL queries may satisfy one or more of the following criteria:
- Contains complex nested subqueries.
- Utilizes multiple types of joins, including self-joins.
- Includes window functions, such as `ROW_NUMBER`, `RANK`, etc.
- Uses Common Table Expressions (CTEs) for improved readability.
- Combines multiple aggregate functions.
- Involves complex `WHERE` and `HAVING` clauses with multiple conditions.
- Utilizes advanced functions and operators.
**Example of Complex SQL Query:**
```sql
WITH EmployeeCTE AS (
SELECT employee_id, name, department_id, ROW_NUMBER() OVER (PARTITION BY department_id ORDER BY salary DESC) AS rank
FROM employees
)
SELECT e.name, d.department_name
FROM EmployeeCTE e
INNER JOIN departments d ON e.department_id = d.department_id
WHERE e.rank <= 3;
```'''
highly_complex_criterion = '''**Criteria:**
Highly complex SQL queries may satisfy one or more of the following criteria:
- Includes multiple Common Table Expressions (CTEs) for readability.
- Combines nested subqueries and various joins.
- Utilizes recursive CTEs for hierarchical or recursive queries.
- Extensively uses advanced window functions.
- May involve `UNION` or `UNION ALL` to combine result sets.
- Implements complex logic with advanced analytical functions.
- Employs a wide range of SQL clauses and conditions.
- Utilizes a broad spectrum of SQL functions and advanced features.
**Example of Highly Complex SQL Query:**
```sql
WITH RECURSIVE EmployeeHierarchy AS (
SELECT employee_id, name, manager_id, department_id, 1 as level
FROM employees
WHERE manager_id IS NULL
UNION ALL
SELECT e.employee_id, e.name, e.manager_id, e.department_id, eh.level + 1
FROM employees e
JOIN EmployeeHierarchy eh ON e.manager_id = eh.employee_id
),
DepartmentSalaries AS (
SELECT eh.employee_id, eh.name, eh.level, d.department_name, s.salary, d.department_id
FROM EmployeeHierarchy eh
INNER JOIN departments d ON eh.department_id = d.department_id
INNER JOIN salaries s ON eh.employee_id = s.employee_id
),
DepartmentStats AS (
SELECT
d.department_id,
COUNT(e.employee_id) AS employee_count,
AVG(s.salary) AS average_salary
FROM employees e
INNER JOIN salaries s ON e.employee_id = s.employee_id
INNER JOIN departments d ON e.department_id = d.department_id
GROUP BY d.department_id
)
SELECT ds.name, ds.level,
SUM(ds.salary) OVER (PARTITION BY ds.department_id ORDER BY ds.level, ds.name) AS cumulative_salary
FROM DepartmentSalaries ds
INNER JOIN DepartmentStats dstat ON ds.department_id = dstat.department_id
ORDER BY ds.level, ds.name;
```'''
def obtain_db_schema(db_file_dir):
conn = sqlite3.connect(db_file_dir)
cursor = conn.cursor()
cursor.execute("SELECT name, sql FROM sqlite_master WHERE type='table';")
tables = cursor.fetchall()
table_names = []
create_statements = []
for table in tables:
table_name, create_statement = table
table_names.append(table_name)
create_statements.append(create_statement)
cursor.close()
conn.close()
return table_names, create_statements
def obtain_insert_statements(db_file_dir, table_names):
table_name2insert_statements = dict()
conn = sqlite3.connect(db_file_dir)
cursor = conn.cursor()
for table_name in table_names:
try:
cursor.execute(f'SELECT * FROM "{table_name}" LIMIT 2')
rows = cursor.fetchall()
column_names = [description[0] for description in cursor.description]
insert_statements = []
for row in rows:
values = ', '.join([f"'{str(value)}'" if isinstance(value, str) else str(value) for value in row])
insert_statement = f"INSERT INTO {table_name} ({', '.join(column_names)}) VALUES ({values});"
insert_statements.append(insert_statement)
# for statement in insert_statements:
# print(statement)
table_name2insert_statements[table_name] = insert_statements
except Exception as e:
print(e)
cursor.close()
conn.close()
return table_name2insert_statements
if __name__ == "__main__":
random.seed(42)
db_path = "../database_synthesis/synthetic_sqlite_databases"
prompt_template = open("./prompt_templates/sql_synthesis_prompt.txt", "r", encoding = "utf-8").read()
functions = json.load(open("./prompt_templates/sqlite_funcs.json"))
complexity2criterion = {
"Simple": simple_criterion,
"Moderate": moderate_criterion,
"Complex": complex_criterion,
"Highly Complex": highly_complex_criterion
}
db_names = os.listdir(db_path)
prompts = []
for db_name in tqdm(db_names):
try:
db_file_dir = os.path.join(db_path, db_name, db_name + ".sqlite")
table_names, create_statements = obtain_db_schema(db_file_dir)
table_name2insert_statements = obtain_insert_statements(db_file_dir, table_names)
for _ in range(0, 300):
complexity = random.sample(["Simple", "Moderate", "Complex", "Highly Complex"], 1)[0]
insert_statements = []
for table_name in table_names:
insert_statements += table_name2insert_statements.get(table_name, [])
if len(insert_statements) == 0:
db_value_prompt = ""
else:
if len(insert_statements) > 4:
insert_statements = random.sample(insert_statements, 4)
db_value_prompt = insert_stmts_template.format(insert_statements = "\n\n".join(insert_statements))
function_num = random.randint(0, 2)
if function_num == 0:
sql_function_prompt = "### SQL Functions\nYou can use any function supported by the database engine."
else:
sql_funcs = ""
sampled_functions = random.sample(functions, function_num)
for idx, func in enumerate(sampled_functions):
sql_funcs += f"Function {idx + 1}:\n" + func.strip() + "\n"
sql_function_prompt = sql_func_template.format(sql_funcs = sql_funcs)
column_count = np.random.geometric(0.6, 1)[0]
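# note: np.random.geometric(0.6) returns k >= 1 with P(k) = 0.4**(k-1) * 0.6, so most prompts
# request only a small number of selected columns (1 with prob. 0.6, 2 with prob. 0.24, ...)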
prompt = prompt_template.format(
schema_str = "\n\n".join(create_statements),
sql_function_prompt = sql_function_prompt.strip(),
db_value_prompt = db_value_prompt.strip(),
complexity = complexity,
criterion = complexity2criterion[complexity].strip(),
db_engine = "SQLite",
column_count = column_count
)
prompts.append({"prompt": prompt, "db_id": db_name})
except Exception as e:
print(e)
with open("./prompts/sql_synthesis_prompts.json", "w", encoding="utf-8") as f:
f.write(json.dumps(prompts, indent=2, ensure_ascii=False))
\ No newline at end of file
import json
import sqlite3
import os
import sys
import re
import time
from tqdm import tqdm
from func_timeout import func_timeout, FunctionTimedOut
import multiprocessing as mp
import ijson
def execute_sql(sql, db_path):
if sql.strip() == "":
return None, None
execution_result = None
column_count = None
conn = None
try:
conn = sqlite3.connect(db_path)
cursor = conn.cursor()
# start a transaction
cursor.execute("BEGIN")
# execute the SQL query
cursor.execute(sql)
execution_result = cursor.fetchall()
column_count = len(cursor.description)
# roll back the transaction to ensure that the database state is not changed
cursor.execute("ROLLBACK")
except Exception as e:
# print(f"An error occurred: {e}")
pass
finally:
if conn is not None:
conn.close()
return execution_result, column_count
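# Example: execute_sql("SELECT 1 AS a, 2 AS b", db_path) -> ([(1, 2)], 2);
# if the query is empty or execution fails, (None, None) is returned.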
def execute_wrapper(sample_idx, db_id, sql, complexity, timeout, db_dir):
try:
execution_result, column_count = func_timeout(timeout, execute_sql, args = (sql, os.path.join(db_dir, db_id, db_id + ".sqlite")))
if execution_result is None or column_count is None:
return [sample_idx, db_id, sql, complexity, 0, 0, 0]
else:
return [sample_idx, db_id, sql, complexity, 1, column_count, len(execution_result)]
except KeyboardInterrupt:
sys.exit(0)
except FunctionTimedOut:
return [sample_idx, db_id, sql, complexity, 0, 0, 0]
except Exception as e:
return [sample_idx, db_id, sql, complexity, 0, 0, 0]
def execute_callback(result):
sample_idx, db_id, sql, complexity, valid_flag, column_count, rows = result
if valid_flag == 1:
no_timeout_synthesized_sqls.append(
{"db_id": db_id, "sql": sql, "column_count": column_count, "rows": rows, "complexity": complexity}
)
# print("Done:", sample_idx)
def remove_timeout_sqls_parallel(synthesized_sqls, db_dir, num_cpus = 20, timeout = 1):
'''Execute the SQL queries in parallel; queries that fail or time out are dropped via the callback'''
parallel_batch_size = 10240
batches = [synthesized_sqls[i: i+parallel_batch_size] for i in range(0, len(synthesized_sqls), parallel_batch_size)]
assert len(synthesized_sqls) == sum([len(batch_sqls) for batch_sqls in batches])
for batch_idx, batch_sqls in enumerate(batches):
print(f"execution process: {batch_idx+1}/{len(batches)}")
pool = mp.Pool(processes = num_cpus)
for sample_idx, sql_info in enumerate(batch_sqls):
pool.apply_async(
execute_wrapper,
args = (sample_idx, sql_info["db_id"], sql_info["sql"], sql_info["complexity"], timeout, db_dir),
callback = execute_callback
)
pool.close()
pool.join()
time.sleep(10)
def analyze_complexity(results):
complexity2num = dict()
for res in results:
complexity = res["complexity"]
if complexity in complexity2num:
complexity2num[complexity] += 1
else:
complexity2num[complexity] = 1
print(complexity2num)
def analyze_column_count(results):
column_count2num = dict()
for res in results:
column_count = res["column_count"]
if column_count in column_count2num:
column_count2num[column_count] += 1
else:
column_count2num[column_count] = 1
print(column_count2num)
def analyze_advanced_functions(results):
function2num = dict()
functions = json.load(open("prompt_templates/sqlite_funcs.json"))
functions = [func_desc.split("(")[0] for func_desc in functions]
for res in results:
sql = res["sql"]
for function in functions:
if function.lower()+"(" in sql.lower():
if function in function2num:
function2num[function] += 1
else:
function2num[function] = 1
print(function2num)
def analyze_used_tables_num(synthesized_sqls, db_id2table_names):
used_tables_num2count = dict()
for sql_info in tqdm(synthesized_sqls):
table_names_in_db = db_id2table_names[sql_info["db_id"]]
sql = sql_info["sql"]
if sql.endswith(";"):
sql = sql[:-1]
sql_tokens = sql.strip().lower().split()
# print(table_names_in_db)
# print(sql_tokens)
used_tables = set()
for table_name in table_names_in_db:
if table_name.lower() in sql_tokens:
used_tables.add(table_name.lower())
used_tables_num = len(used_tables)
# print(used_tables)
# print(used_tables_num)
# print("------------------------------------------")
if used_tables_num in used_tables_num2count:
used_tables_num2count[used_tables_num] += 1
else:
used_tables_num2count[used_tables_num] = 1
print(used_tables_num2count)
def filter_executable_sqls(synthesized_sqls, db_dir):
executable_sqls = []
for sql_info in tqdm(synthesized_sqls):
db_path = os.path.join(db_dir, sql_info["db_id"], sql_info["db_id"] + ".sqlite")
query_plan, _ = execute_sql("EXPLAIN QUERY PLAN " + sql_info["sql"], db_path)
if query_plan is not None:
sql_info["query_plan"] = str(query_plan)
executable_sqls.append(sql_info)
return executable_sqls
def filter_select_sqls(synthesized_sqls):
'''
Keep only SELECT-type queries (statements starting with SELECT or WITH).
'''
select_sqls = []
for sql_info in tqdm(synthesized_sqls):
# remove comments
sql_wo_comments = re.sub(r'/\*.*?\*/', '', sql_info["sql"], flags=re.DOTALL)
sql_wo_comments = re.sub(r'--.*', '', sql_wo_comments)
sql_wo_comments = sql_wo_comments.strip()
if sql_wo_comments.lower().startswith("select") or \
sql_wo_comments.lower().startswith("with"):
select_sqls.append(sql_info)
return select_sqls
def dedup_using_query_plan(synthesized_sqls):
unique_plans = set()
deduped_sqls = []
for sql_info in tqdm(synthesized_sqls):
query_plan = sql_info["query_plan"]
if query_plan not in unique_plans:
unique_plans.add(query_plan)
deduped_sqls.append(sql_info)
return deduped_sqls
def obtain_sql_template(sql):
# Handles single and double quoted strings, numbers, NULL, TRUE, FALSE
pattern = r"""
(?<!\w)'(?:\\.|[^'])*' | # single quoted strings
(?<!\w)"(?:\\.|[^"])*" | # double quoted strings
(?<!\w)-?\b\d+(\.\d+)?([eE][-+]?\d+)?\b | # numbers with scientific notation
\bNULL\b | # NULL
\bTRUE\b | # TRUE
\bFALSE\b # FALSE
"""
# replace values with a special token <value>
template = re.sub(pattern, "<value>", sql, flags=re.IGNORECASE | re.VERBOSE)
template = template.lower().replace("\n", " ").strip()
# Replace multiple spaces with a single space
template = re.sub(r'\s+', ' ', template)
return template
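# Illustrative example (hypothetical query):
#   obtain_sql_template("SELECT name FROM users WHERE age > 18 AND city = 'NYC';")
#   -> "select name from users where age > <value> and city = <value>;"
# so queries that differ only in literal values share the same template and are deduplicated.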
def dedup_using_query_template(synthesized_sqls):
unique_templates = set()
deduped_sqls = []
for sql_info in tqdm(synthesized_sqls):
template = obtain_sql_template(sql_info["sql"])
if template not in unique_templates:
unique_templates.add(template)
deduped_sqls.append(sql_info)
return deduped_sqls
def parse_response(response):
pattern = r"```sql\s*(.*?)\s*```"
sql_blocks = re.findall(pattern, response, re.DOTALL)
if sql_blocks:
# Extract the last SQL query in the response text and remove extra whitespace characters
last_sql = sql_blocks[-1].strip()
return last_sql
else:
# print("No SQL blocks found.")
return ""
def obtain_db_id2table_names(results, db_dir):
db_ids = list(set([res["db_id"] for res in results]))
print("len(db_ids):", len(db_ids))
db_id2table_names = dict()
for db_id in db_ids:
results, _ = execute_sql(
"SELECT name FROM sqlite_master WHERE type='table';",
os.path.join(db_dir, db_id, db_id + ".sqlite")
)
table_names = [res[0] for res in results]
db_id2table_names[db_id] = table_names
return db_id2table_names
def load_json_file(file):
dataset = []
with open(file, 'r', encoding='utf-8') as f:
objects = ijson.items(f, 'item')
for obj in tqdm(objects):
dataset.append(obj)
return dataset
if __name__ == "__main__":
synthesized_sqls = []
db_dir = "../database_synthesis/synthetic_sqlite_databases"
llm_responses = load_json_file("./results/sql_synthesis.json")
for llm_response in tqdm(llm_responses):
sql = parse_response(llm_response["response"])
if sql == "":
continue
synthesized_sqls.append(
{
"db_id": llm_response["db_id"][:-3] if llm_response["db_id"].endswith(".db") else llm_response["db_id"],
"sql": sql,
"complexity": llm_response["prompt"].split("Ensure the SQL query matches the ")[1].split(" level, defined as follows:")[0]
}
)
print("original sql num:", len(synthesized_sqls))
# analyze_complexity(synthesized_sqls)
# analyze_advanced_functions(synthesized_sqls)
# remove non-SELECT sqls
synthesized_sqls = filter_select_sqls(synthesized_sqls)
print("sql num after removing non-SELECT sql queries:", len(synthesized_sqls))
# analyze_complexity(synthesized_sqls)
# analyze_advanced_functions(synthesized_sqls)
# remove sqls with syntax errors
synthesized_sqls = filter_executable_sqls(synthesized_sqls, db_dir)
print("sql num after removing syntax-error sqls:", len(synthesized_sqls))
# analyze_complexity(synthesized_sqls)
# analyze_advanced_functions(synthesized_sqls)
# # perform deduplication according to the query plan
# synthesized_sqls = dedup_using_query_plan(synthesized_sqls)
# print("sql num after deduplication (query plan level):", len(synthesized_sqls))
# print(synthesized_sqls[0].keys())
# # analyze_complexity(synthesized_sqls)
# # analyze_advanced_functions(synthesized_sqls)
# remove timeout sqls
no_timeout_synthesized_sqls = mp.Manager().list()
remove_timeout_sqls_parallel(synthesized_sqls, db_dir, 10, 2)
synthesized_sqls = list(no_timeout_synthesized_sqls)
print("sql num after removing timeout sqls:", len(synthesized_sqls))
print(synthesized_sqls[0].keys())
# analyze_complexity(synthesized_sqls)
analyze_column_count(synthesized_sqls)
# analyze_advanced_functions(synthesized_sqls)
# perform deduplication according to the query template
synthesized_sqls = dedup_using_query_template(synthesized_sqls)
print("sql num after deduplication (template level):", len(synthesized_sqls))
# analyze_complexity(synthesized_sqls)
analyze_column_count(synthesized_sqls)
# analyze_advanced_functions(synthesized_sqls)
# analyze the number of used tables
analyze_used_tables_num(
synthesized_sqls,
obtain_db_id2table_names(synthesized_sqls, db_dir)
)
with open("./results/synthetic_sqls.json", "w", encoding="utf-8") as f:
f.write(json.dumps(synthesized_sqls, indent=2, ensure_ascii=False))
\ No newline at end of file
**Task Overview**
Create an executable SQL query based on the provided information.
**Database Schema**
{schema_str}
{sql_function_prompt}
{db_value_prompt}
**SQL Query Complexity**
Ensure the SQL query matches the {complexity} level, defined as follows:
{criterion}
**Output Format Requirements**
Enclose the SQL query in a code block:
```sql
-- Your SQL query here
```
**SQL Query Requirements**
1. Use the syntax specific to the {db_engine} database engine.
2. Incorporate advanced functions if appropriate, but they are not mandatory.
3. Address real-world data analysis needs. Avoid trivial or nonsensical queries.
4. (Very important) Ensure the final SQL query selects {column_count} columns.
**Answer**
Let's proceed step by step.
import argparse
import json
import re
from tqdm import tqdm
import openai
def parse_response(response):
pattern = r"```sql\s*(.*?)\s*```"
sql_blocks = re.findall(pattern, response, re.DOTALL)
if sql_blocks:
# Extract the last SQL query in the response text and remove extra whitespace characters
last_sql = sql_blocks[-1].strip()
return last_sql
else:
print("No SQL blocks found.")
return ""
def llm_inference(model, base_url, prompts, db_ids):
"""
Generates responses using an LLM for given prompts.
Args:
        model: The LLM to use for generating responses.
        base_url (str): Base URL of the OpenAI-compatible API endpoint.
prompts (list of str): A list of prompts for the model.
db_ids (list of str): A list of database IDs corresponding to each prompt.
Returns:
list of dict: A list of dictionaries containing the prompt, db_id, and generated response.
"""
client = openai.OpenAI(
base_url=base_url,
api_key="EMPTY"
)
    # Query the OpenAI-compatible endpoint once per prompt; each response is the raw completion text.
responses = []
for prompt in prompts:
response = client.chat.completions.create(
model=model,
messages=[{"role":"user", "content": prompt}],
max_tokens=4196,
temperature=0.2
)
responses.append(response.choices[0].message.content.strip())
results = [
{
"prompt": prompt,
"db_id": db_id,
"response": response
}
for prompt, db_id, response in zip(prompts, db_ids, responses)
]
return results
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--model", type = str)
parser.add_argument("--base_url", type=str)
parser.add_argument("--nums", type=int, default=None)
opt = parser.parse_args()
print(opt)
    with open("./prompts/sql_synthesis_prompts.json", "r", encoding="utf-8") as f:
        input_dataset = json.load(f)[:opt.nums]
output_file = "./results/sql_synthesis.json"
db_ids = [data["db_id"] for data in input_dataset]
prompts = [data["prompt"] for data in input_dataset]
results = llm_inference(opt.model, opt.base_url, prompts, db_ids)
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(json.dumps(results, indent=2, ensure_ascii=False))
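# Example invocation (illustrative; script name, model name, and endpoint are placeholders):
#   python sql_synthesis.py --model <model_name> --base_url http://localhost:8000/v1 --nums 100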
Task Overview:
You are a data science expert. Below, you are provided with a database schema and a natural language question. Your task is to understand the schema and generate a valid SQL query to answer the question.
Database Engine:
SQLite
Database Schema:
CREATE TABLE cards (
id integer, -- unique id number identifying the cards, example: [41138, 1349]
artist text, -- example: ['Pete Venters', 'Volkan Baǵa']
asciiName text, -- example: ['El-Hajjaj', 'Junun Efreet']
availability text, -- example: ['mtgo,paper', 'paper']
borderColor text, -- example: ['black', 'white']
cardKingdomFoilId text, -- example: ['123094', '123095']
cardKingdomId text, -- example: ['122719', '122720']
colorIdentity text, -- example: ['W', 'B']
colorIndicator text, -- example: ['U', 'G']
colors text, -- example: ['W', 'B']
convertedManaCost real, -- example: [7.0, 5.0]
duelDeck text, -- example: ['a', 'b']
edhrecRank integer, -- rec Rank in edh, example: [15650, 12702]
faceConvertedManaCost real, -- example: [4.0, 5.0]
faceName text, -- example: ['Dusk', 'Dawn']
flavorName text, -- example: ['Godzilla, King of the Monsters', 'King Caesar, Ancient Guardian']
flavorText text, -- example: ['Every tear shed is a drop of immortality', 'The perfect antidote for a tightly packe']
frameEffects text, -- example: ['legendary', 'nyxtouched']
frameVersion text, -- example: ['2003', '1993']
hand text, -- example: ['1', '0']
hasAlternativeDeckLimit integer, -- example: [0, 1]
hasContentWarning integer, -- example: [0, 1]
hasFoil integer, -- example: [0, 1]
hasNonFoil integer, -- example: [1, 0]
isAlternative integer, -- example: [0, 1]
isFullArt integer, -- example: [0, 1]
isOnlineOnly integer, -- example: [0, 1]
isOversized integer, -- example: [0, 1]
isPromo integer, -- is Promotion, example: [0, 1]
isReprint integer, -- example: [1, 0]
isReserved integer, -- example: [0, 1]
isStarter integer, -- example: [0, 1]
isStorySpotlight integer, -- example: [0, 1]
isTextless integer, -- example: [0, 1]
isTimeshifted integer, -- example: [0, 1]
keywords text, -- example: ['First strike', 'Flying']
layout text, -- example: ['normal', 'aftermath']
leadershipSkills text, -- example: ["{'brawl': False, 'commander': True, 'oat", "{'brawl': False, 'commander': False, 'oa"]
life text, -- example: ['-5', '-1']
loyalty text, -- example: ['6', '3']
manaCost text, -- example: ['{5}{W}{W}', '{4}{W}']
mcmId text, -- example: ['16165', '16166']
mcmMetaId text, -- example: ['156', '176']
mtgArenaId text, -- example: ['74983', '74986']
mtgjsonV4Id text, -- example: ['ad41be73-582f-58ed-abd4-a88c1f616ac3', '9eb2e54c-a12b-5e88-a9c0-d8c84c52d59c']
mtgoFoilId text, -- example: ['27501', '26993']
mtgoId text, -- example: ['27500', '26992']
multiverseId text, -- example: ['130550', '129465']
name text, -- example: ["Ancestor's Chosen", 'Angel of Mercy']
number text, -- example: ['1', '2']
originalReleaseDate text, -- example: ['2012/12/1', '2006/12/1']
originalText text, -- example: ['First strike (This creature deals combat', "Flying (This creature can't be blocked e"]
originalType text, -- example: ['Creature - Human Cleric', 'Creature - Angel']
otherFaceIds text, -- example: ['87f0062a-8321-5c16-960e-a12ce1df5839', 'f9f10d34-071c-57a6-b58c-7553abad5c20']
power text, -- example: ['4', '3']
printings text, -- example: ['10E,JUD,UMA', '10E,8ED,9ED,DDC,DVD,IMA,INV,JMP,MB1,P02,']
promoTypes text, -- example: ['boxtopper,boosterfun', 'boosterfun']
purchaseUrls text, -- example: ["{'cardKingdom': 'https://mtgjson.com/lin"]
rarity text, -- example: ['uncommon', 'common']
scryfallId text, -- example: ['7a5cd03c-4227-4551-aa4b-7d119f0468b5', '8f7980d4-da43-4d6d-ad16-14b8a34ae91d']
scryfallIllustrationId text, -- example: ['be2f7173-c8b7-4172-a388-9b2c6b3c16e5', 'e4d6c53f-e936-4be8-8b70-47c2be863b20']
scryfallOracleId text, -- example: ['fc2ccab7-cab1-4463-b73d-898070136d74', 'a2daaf32-dbfe-4618-892e-0da24f63a44a']
setCode text, -- example: ['10E', '2ED']
side text, -- example: ['a', 'b']
subtypes text, -- example: ['Human,Cleric', 'Angel']
supertypes text, -- example: ['Legendary', 'Basic']
tcgplayerProductId text, -- example: ['15032', '15033']
text text, -- example: ['First strike (This creature deals combat', 'Flying\nWhen Angel of Mercy enters the ba']
toughness text, -- example: ['4', '3']
type text, -- example: ['Creature — Human Cleric', 'Creature — Angel']
types text, -- example: ['Creature', 'Instant']
uuid text, -- example: ['00010d56-fe38-5e35-8aed-518019aa36a5', '0001e0d0-2dcd-5640-aadc-a84765cf5fc9']
variations text, -- example: ['b7c19924-b4bf-56fc-aa73-f586e940bd42', '8fd4e2eb-3eb4-50ea-856b-ef638fa47f8a']
watermark text, -- example: ['set', 'set (HOU)', 'set (LGN)']
PRIMARY KEY (id)
);
CREATE TABLE foreign_data (
id integer, -- example: [1, 2]
flavorText text, -- example: ['„Es ist der Wille aller, und meine Hand,', '"La voluntad de todos, realizada por mi ']
`language` text, -- example: ['Italian', 'German', 'Spanish']
multiverseid integer, -- example: [148411, 150317]
name text, -- example: ['Ausgewählter der Ahnfrau', 'Elegido de la Antepasada']
text text, -- example: ['Erstschlag (Diese Kreatur fügt Kampfscha', 'Daña primero. (Esta criatura hace daño d']
type text, -- example: ['Kreatur — Mensch, Kleriker', 'Criatura — Clérigo humano']
uuid text, -- example: ['5f8287b1-5bb6-5f4c-ad17-316a40d5bb0c', '57aaebc1-850c-503d-9f6e-bb8d00d8bf7c']
PRIMARY KEY (id),
CONSTRAINT fk_foreign_data_uuid FOREIGN KEY (uuid) REFERENCES cards (uuid)
);
CREATE TABLE legalities (
id integer, -- example: [1, 2]
format text, -- example: ['commander', 'duel']
status text, -- example: ['Legal', 'Banned']
uuid text, -- example: ['5f8287b1-5bb6-5f4c-ad17-316a40d5bb0c', '57aaebc1-850c-503d-9f6e-bb8d00d8bf7c']
PRIMARY KEY (id),
CONSTRAINT fk_legalities_uuid FOREIGN KEY (uuid) REFERENCES cards (uuid)
);
CREATE TABLE sets (
id integer, -- example: [1, 2]
baseSetSize integer, -- example: [383, 302]
block text, -- example: ['Core Set', 'Mirrodin']
booster text, -- example: ["{'default': {'boosters': [{'contents': {"]
code text, -- example: ['10E', '2ED']
isFoilOnly integer, -- example: [0, 1]
isForeignOnly integer, -- example: [0, 1]
isNonFoilOnly integer, -- example: [0, 1]
isOnlineOnly integer, -- example: [0, 1]
isPartialPreview integer, -- example: [0, 1]
keyruneCode text, -- example: ['10E', '2ED']
mcmId integer, -- magic card market id, example: [74, 3204]
mcmIdExtras integer, -- magic card market ID Extras, example: [3209, 3459]
mcmName text, -- magic card market name, example: ['Tenth Edition', 'Double Masters']
mtgoCode text, -- magic the gathering online code, example: ['10E', '2XM']
name text, -- example: ['Tenth Edition', 'Unlimited Edition']
parentCode text, -- example: ['JMP', 'MH1']
releaseDate date, -- example: ['2007-07-13', '1993-12-01']
tcgplayerGroupId integer, -- example: [1, 115]
totalSetSize integer, -- example: [508, 302]
type text, -- example: ['core', 'masters']
PRIMARY KEY (id)
);
CREATE TABLE set_translations (
id integer, -- example: [1, 2]
`language` text, -- example: ['Italian', 'Chinese Simplified', 'Chinese Traditional']
setCode text, -- example: ['10E', '4ED']
translation text, -- example: ['核心系列第十版', 'Dixième édition']
PRIMARY KEY (id),
CONSTRAINT fk_set_translations_setcode FOREIGN KEY (setCode) REFERENCES sets (code)
);
CREATE TABLE rulings (
id integer, -- example: [1, 2]
`date` date, -- example: ['2007-07-15', '2007-02-01']
text text, -- example: ['You draw the card when Bandage resolves,', 'If you double a negative life total, you']
uuid text, -- example: ['6d268c95-c176-5766-9a46-c14f739aba1c', '56f4935b-f6c5-59b9-88bf-9bcce20247ce']
PRIMARY KEY (id),
CONSTRAINT fk_rulings_uuid FOREIGN KEY (uuid) REFERENCES cards (uuid)
);
This schema describes the database's structure, including tables, columns, primary keys, foreign keys, and any relevant relationships or constraints.
Question:
Italian translation refers to language = 'Italian'; have a translation means translation is not null; base set number of under 100 refers to baseSetSize < 100
Among the sets of cards that have an Italian translation, how many of them have a base set number of under 100?
Instructions:
- Make sure you only output the information that is asked in the question. If the question asks for a specific column, make sure to only include that column in the SELECT clause, nothing more.
- The generated query should return all of the information asked in the question without any missing or extra information.
- Before generating the final SQL query, please think through the steps of how to write the query.
Output Format:
In your answer, please enclose the generated SQL query in a code block:
```sql
-- Your SQL query
```
Take a deep breath and think step by step to find the correct SQL query.
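For illustration only, one query consistent with the schema and hints above (an example sketch, not necessarily the reference answer) is:
```sql
SELECT COUNT(DISTINCT T1.id)
FROM sets AS T1
JOIN set_translations AS T2 ON T1.code = T2.setCode
WHERE T2.language = 'Italian'
  AND T2.translation IS NOT NULL
  AND T1.baseSetSize < 100;
```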
Task Overview:
You are a data science expert. Below, you are provided with a database schema and a natural language question. Your task is to understand the schema and generate a valid SQL query to answer the question.
Database Engine:
SQLite
Database Schema:
CREATE TABLE continents (
ContId number, -- example: [1, 2]
Continent text, -- example: ['america', 'europe']
PRIMARY KEY (ContId)
);
CREATE TABLE countries (
CountryId number, -- example: [1, 2]
CountryName text, -- example: ['usa', 'germany']
Continent number, -- example: [1, 2]
PRIMARY KEY (CountryId),
CONSTRAINT fk_countries_continent FOREIGN KEY (Continent) REFERENCES continents (ContId)
);
CREATE TABLE car_makers (
Id number, -- example: [1, 2]
Maker text, -- example: ['amc', 'volkswagen']
FullName text, -- example: ['American Motor Company', 'Volkswagen']
Country text, -- example: ['1', '2']
PRIMARY KEY (Id),
CONSTRAINT fk_car_makers_country FOREIGN KEY (Country) REFERENCES countries (CountryId)
);
CREATE TABLE model_list (
ModelId number, -- example: [1, 2]
Maker number, -- example: [1, 2]
Model text, -- example: ['amc', 'audi']
PRIMARY KEY (ModelId),
CONSTRAINT fk_model_list_maker FOREIGN KEY (Maker) REFERENCES car_makers (Id)
);
CREATE TABLE car_names (
MakeId number, -- example: [1, 2]
Model text, -- example: ['chevrolet', 'buick']
Make text, -- example: ['chevrolet chevelle malibu', 'buick skylark 320']
PRIMARY KEY (MakeId),
CONSTRAINT fk_car_names_model FOREIGN KEY (Model) REFERENCES model_list (Model)
);
CREATE TABLE cars_data (
Id number, -- example: [1, 2]
MPG text, -- example: ['18', '15']
Cylinders number, -- example: [8, 4]
Edispl number, -- example: [307.0, 350.0]
Horsepower text, -- example: ['130', '165']
Weight number, -- example: [3504, 3693]
Accelerate number, -- example: [12.0, 11.5]
`Year` number, -- example: [1970, 1971]
PRIMARY KEY (Id),
CONSTRAINT fk_cars_data_id FOREIGN KEY (Id) REFERENCES car_names (MakeId)
);
This schema describes the database's structure, including tables, columns, primary keys, foreign keys, and any relevant relationships or constraints.
Question:
How many car makers are there in each continent? List the continent name and the count.
Instructions:
- Make sure you only output the information that is asked in the question. If the question asks for a specific column, make sure to only include that column in the SELECT clause, nothing more.
- The generated query should return all of the information asked in the question without any missing or extra information.
- Before generating the final SQL query, please think through the steps of how to write the query.
Output Format:
In your answer, please enclose the generated SQL query in a code block:
```sql
-- Your SQL query
```
Take a deep breath and think step by step to find the correct SQL query.
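For illustration only, one query consistent with the schema above (an example sketch, not necessarily the reference answer) is:
```sql
SELECT T1.Continent, COUNT(*)
FROM continents AS T1
JOIN countries AS T2 ON T1.ContId = T2.Continent
JOIN car_makers AS T3 ON T2.CountryId = T3.Country
GROUP BY T1.Continent;
```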
Task Overview:
You are a data science expert. Below, you are provided with a database schema and a natural language question. Your task is to understand the schema and generate a valid SQL query to answer the question.
Database Engine:
SQLite
Database Schema:
CREATE TABLE Player_Attributes (
id integer, -- example: [1, 2]
player_fifa_api_id integer, -- player federation international football association api id, example: [218353, 189615]
player_api_id integer, -- example: [505942, 155782]
`date` text, -- example: ['2016-02-18 00:00:00', '2015-11-19 00:00:00']
overall_rating integer, -- example: [67, 62]
potential integer, -- example: [71, 66]
preferred_foot text, -- example: ['right', 'left']
attacking_work_rate text, -- example: ['medium', 'high']
defensive_work_rate text, -- example: ['medium', 'high']
crossing integer, -- example: [49, 48]
finishing integer, -- example: [44, 43]
heading_accuracy integer, -- example: [71, 70]
short_passing integer, -- example: [61, 60]
volleys integer, -- example: [44, 43]
dribbling integer, -- example: [51, 50]
curve integer, -- example: [45, 44]
free_kick_accuracy integer, -- example: [39, 38]
long_passing integer, -- example: [64, 63]
ball_control integer, -- example: [49, 48]
acceleration integer, -- example: [60, 79]
sprint_speed integer, -- example: [64, 78]
agility integer, -- example: [59, 78]
reactions integer, -- example: [47, 46]
balance integer, -- example: [65, 90]
shot_power integer, -- example: [55, 54]
jumping integer, -- example: [58, 85]
stamina integer, -- example: [54, 79]
strength integer, -- example: [76, 56]
long_shots integer, -- example: [35, 34]
aggression integer, -- example: [71, 63]
interceptions integer, -- example: [70, 41]
positioning integer, -- example: [45, 44]
vision integer, -- example: [54, 53]
penalties integer, -- example: [48, 47]
marking integer, -- example: [65, 62]
standing_tackle integer, -- example: [69, 66]
sliding_tackle integer, -- example: [69, 66]
gk_diving integer, -- goalkeep diving, example: [6, 5]
gk_handling integer, -- goalkeep handling, example: [11, 10]
gk_kicking integer, -- goalkeep kicking, example: [10, 9]
gk_positioning integer, -- goalkeep positioning, example: [8, 7]
gk_reflexes integer, -- goalkeep reflexes, example: [8, 7]
PRIMARY KEY (id),
CONSTRAINT fk_player_attributes_player_fifa_api_id FOREIGN KEY (player_fifa_api_id) REFERENCES Player (player_fifa_api_id),
CONSTRAINT fk_player_attributes_player_api_id FOREIGN KEY (player_api_id) REFERENCES Player (player_api_id)
);
CREATE TABLE Player (
id integer, -- example: [3879, 401]
player_api_id integer, -- example: [2625, 2752]
player_name text, -- example: ['Aaron Mooy', 'Aaron Appindangoye', 'Aaron Cresswell']
player_fifa_api_id integer, -- player federation international football association api id, example: [2, 6]
birthday text, -- example: ['1992-02-29 00:00:00', '1989-12-15 00:00:00']
height integer, -- example: [182.88, 170.18]
weight integer, -- example: [187, 146]
PRIMARY KEY (id)
);
CREATE TABLE League (
id integer, -- example: [1, 1729]
country_id integer, -- example: [1, 1729]
name text, -- example: ['Belgium Jupiler League', 'England Premier League']
PRIMARY KEY (id),
CONSTRAINT fk_league_country_id FOREIGN KEY (country_id) REFERENCES Country (id)
);
CREATE TABLE Country (
id integer, -- example: [1, 1729]
name text, -- example: ['Belgium', 'England']
PRIMARY KEY (id)
);
CREATE TABLE Team (
id integer, -- example: [31446, 1513]
team_api_id integer, -- example: [1601, 1773]
team_fifa_api_id integer, -- team federation international football association api id, example: [673, 675]
team_long_name text, -- example: ['KRC Genk', 'Beerschot AC']
team_short_name text, -- example: ['GEN', 'BAC']
PRIMARY KEY (id)
);
CREATE TABLE Team_Attributes (
id integer, -- example: [1, 2]
team_fifa_api_id integer, -- team federation international football association api id, example: [434, 77]
team_api_id integer, -- example: [9930, 8485]
`date` text, -- example: ['2010-02-22 00:00:00', '2014-09-19 00:00:00']
buildUpPlaySpeed integer, -- example: [60, 52]
buildUpPlaySpeedClass text, -- example: ['Balanced', 'Fast']
buildUpPlayDribbling integer, -- example: [48, 41]
buildUpPlayDribblingClass text, -- example: ['Little', 'Normal']
buildUpPlayPassing integer, -- example: [50, 56]
buildUpPlayPassingClass text, -- example: ['Mixed', 'Long']
buildUpPlayPositioningClass text, -- example: ['Organised', 'Free Form']
chanceCreationPassing integer, -- example: [60, 54]
chanceCreationPassingClass text, -- example: ['Normal', 'Risky']
chanceCreationCrossing integer, -- example: [65, 63]
chanceCreationCrossingClass text, -- example: ['Normal', 'Lots']
chanceCreationShooting integer, -- example: [55, 64]
chanceCreationShootingClass text, -- example: ['Normal', 'Lots']
chanceCreationPositioningClass text, -- example: ['Organised', 'Free Form']
defencePressure integer, -- example: [50, 47]
defencePressureClass text, -- example: ['Medium', 'Deep']
defenceAggression integer, -- example: [55, 44]
defenceAggressionClass text, -- example: ['Press', 'Double']
defenceTeamWidth integer, -- example: [45, 54]
defenceTeamWidthClass text, -- example: ['Normal', 'Wide']
defenceDefenderLineClass text, -- example: ['Cover', 'Offside Trap']
PRIMARY KEY (id),
CONSTRAINT fk_team_attributes_team_fifa_api_id FOREIGN KEY (team_fifa_api_id) REFERENCES Team (team_fifa_api_id),
CONSTRAINT fk_team_attributes_team_api_id FOREIGN KEY (team_api_id) REFERENCES Team (team_api_id)
);
CREATE TABLE `Match` (
id integer, -- example: [4769, 4770]
country_id integer, -- example: [1, 1729]
league_id integer, -- example: [1, 1729]
season text, -- example: ['2008/2009', '2009/2010']
stage integer, -- example: [1, 10]
`date` text, -- example: ['2008-08-17 00:00:00', '2008-08-16 00:00:00']
match_api_id integer, -- example: [483129, 483130]
home_team_api_id integer, -- example: [9987, 10000]
away_team_api_id integer, -- example: [9993, 9994]
home_team_goal integer, -- example: [1, 0]
away_team_goal integer, -- example: [1, 0]
home_player_X1 integer, -- example: [1, 2]
home_player_X2 integer, -- example: [2, 4]
home_player_X3 integer, -- example: [4, 6]
home_player_X4 integer, -- example: [6, 8]
home_player_X5 integer, -- example: [8, 6]
home_player_X6 integer, -- example: [2, 6]
home_player_X7 integer, -- example: [4, 8]
home_player_X8 integer, -- example: [6, 2]
home_player_X9 integer, -- example: [8, 4]
home_player_X10 integer, -- example: [4, 6]
home_player_X11 integer, -- example: [6, 4]
away_player_X1 integer, -- example: [1, 2]
away_player_X2 integer, -- example: [2, 4]
away_player_X3 integer, -- example: [4, 6]
away_player_X4 integer, -- example: [6, 8]
away_player_X5 integer, -- example: [8, 6]
away_player_X6 integer, -- example: [2, 4]
away_player_X7 integer, -- example: [4, 6]
away_player_X8 integer, -- example: [6, 8]
away_player_X9 integer, -- example: [8, 2]
away_player_X10 integer, -- example: [4, 6]
away_player_X11 integer, -- example: [6, 4]
home_player_Y1 integer, -- example: [1, 3]
home_player_Y2 integer, -- example: [3, 0]
home_player_Y3 integer, -- example: [3, 5]
home_player_Y4 integer, -- example: [3, 5]
home_player_Y5 integer, -- example: [3, 7]
home_player_Y6 integer, -- example: [7, 3]
home_player_Y7 integer, -- example: [7, 6]
home_player_Y8 integer, -- example: [7, 8]
home_player_Y9 integer, -- example: [7, 10]
home_player_Y10 integer, -- example: [10, 7]
home_player_Y11 integer, -- example: [10, 11]
away_player_Y1 integer, -- example: [1, 3]
away_player_Y2 integer, -- example: [3]
away_player_Y3 integer, -- example: [3, 7]
away_player_Y4 integer, -- example: [3, 5]
away_player_Y5 integer, -- example: [3, 7]
away_player_Y6 integer, -- example: [7, 3]
away_player_Y7 integer, -- example: [7, 6]
away_player_Y8 integer, -- example: [7, 8]
away_player_Y9 integer, -- example: [7, 10]
away_player_Y10 integer, -- example: [10, 7]
away_player_Y11 integer, -- example: [10, 11]
home_player_1 integer, -- example: [39890, 38327]
home_player_2 integer, -- example: [67950, 39580]
home_player_3 integer, -- example: [38788, 67958]
home_player_4 integer, -- example: [38312, 67959]
home_player_5 integer, -- example: [26235, 37112]
home_player_6 integer, -- example: [36393, 46004]
home_player_7 integer, -- example: [148286, 164732]
home_player_8 integer, -- example: [67898, 39631]
home_player_9 integer, -- example: [26916, 164352]
home_player_10 integer, -- example: [38801, 38423]
home_player_11 integer, -- example: [94289, 26502]
away_player_1 integer, -- example: [34480, 37937]
away_player_2 integer, -- example: [38388, 38293]
away_player_3 integer, -- example: [26458, 148313]
away_player_4 integer, -- example: [13423, 104411]
away_player_5 integer, -- example: [38389, 148314]
away_player_6 integer, -- example: [38798, 37202]
away_player_7 integer, -- example: [30949, 43158]
away_player_8 integer, -- example: [38253, 9307]
away_player_9 integer, -- example: [106013, 42153]
away_player_10 integer, -- example: [38383, 32690]
away_player_11 integer, -- example: [46552, 38782]
goal text, -- example: ['<goal><value><comment>n</comment><stats>']
shoton text, -- example: ['<shoton><value><stats><blocked>1</blocke']
shotoff text, -- example: ['<shotoff><value><stats><shotoff>1</shoto']
foulcommit text, -- example: ['<foulcommit><value><stats><foulscommitte']
card text, -- example: ['<card><value><comment>y</comment><stats>', '<card />']
`cross` text, -- example: ['<cross><value><stats><crosses>1</crosses']
corner text, -- example: ['<corner><value><stats><corners>1</corner']
possession text, -- example: ['<possession><value><comment>56</comment>', '<possession><value><comment>65</comment>']
B365H real, -- example: [1.73, 1.95]
B365D real, -- example: [3.4, 3.2]
B365A real, -- example: [5.0, 3.6]
BWH real, -- example: [1.75, 1.8]
BWD real, -- example: [3.35, 3.3]
BWA real, -- example: [4.2, 3.95]
IWH real, -- example: [1.85, 1.9]
IWD real, -- example: [3.2, 3.1]
IWA real, -- example: [3.5, 2.3]
LBH real, -- example: [1.8, 1.9]
LBD real, -- example: [3.3, 3.2]
LBA real, -- example: [3.75, 3.5]
PSH real, -- example: [5.1, 2.48]
PSD real, -- example: [3.82, 3.52]
PSA real, -- example: [1.76, 2.96]
WHH real, -- example: [1.7, 1.83]
WHD real, -- example: [3.3, 3.25]
WHA real, -- example: [4.33, 3.6]
SJH real, -- example: [1.9, 1.95]
SJD real, -- example: [3.3, 4.0]
SJA real, -- example: [4.0, 3.8]
VCH real, -- example: [1.65, 2.0]
VCD real, -- example: [3.4, 3.25]
VCA real, -- example: [4.5, 3.25]
GBH real, -- example: [1.78, 1.85]
GBD real, -- example: [3.25, 3.2]
GBA real, -- example: [4.0, 3.75]
BSH real, -- example: [1.73, 1.91]
BSD real, -- example: [3.4, 3.25]
BSA real, -- example: [4.2, 3.6]
PRIMARY KEY (id),
CONSTRAINT fk_match_home_team_api_id FOREIGN KEY (home_team_api_id) REFERENCES Team (team_api_id),
CONSTRAINT fk_match_away_team_api_id FOREIGN KEY (away_team_api_id) REFERENCES Team (team_api_id),
CONSTRAINT fk_match_home_player_1 FOREIGN KEY (home_player_1) REFERENCES Player (player_api_id),
CONSTRAINT fk_match_home_player_2 FOREIGN KEY (home_player_2) REFERENCES Player (player_api_id),
CONSTRAINT fk_match_home_player_3 FOREIGN KEY (home_player_3) REFERENCES Player (player_api_id),
CONSTRAINT fk_match_home_player_4 FOREIGN KEY (home_player_4) REFERENCES Player (player_api_id),
CONSTRAINT fk_match_home_player_5 FOREIGN KEY (home_player_5) REFERENCES Player (player_api_id),
CONSTRAINT fk_match_home_player_6 FOREIGN KEY (home_player_6) REFERENCES Player (player_api_id),
CONSTRAINT fk_match_home_player_7 FOREIGN KEY (home_player_7) REFERENCES Player (player_api_id),
CONSTRAINT fk_match_home_player_8 FOREIGN KEY (home_player_8) REFERENCES Player (player_api_id),
CONSTRAINT fk_match_home_player_9 FOREIGN KEY (home_player_9) REFERENCES Player (player_api_id),
CONSTRAINT fk_match_home_player_10 FOREIGN KEY (home_player_10) REFERENCES Player (player_api_id),
CONSTRAINT fk_match_home_player_11 FOREIGN KEY (home_player_11) REFERENCES Player (player_api_id),
CONSTRAINT fk_match_away_player_1 FOREIGN KEY (away_player_1) REFERENCES Player (player_api_id),
CONSTRAINT fk_match_away_player_2 FOREIGN KEY (away_player_2) REFERENCES Player (player_api_id),
CONSTRAINT fk_match_away_player_3 FOREIGN KEY (away_player_3) REFERENCES Player (player_api_id),
CONSTRAINT fk_match_away_player_4 FOREIGN KEY (away_player_4) REFERENCES Player (player_api_id),
CONSTRAINT fk_match_away_player_5 FOREIGN KEY (away_player_5) REFERENCES Player (player_api_id),
CONSTRAINT fk_match_away_player_6 FOREIGN KEY (away_player_6) REFERENCES Player (player_api_id),
CONSTRAINT fk_match_away_player_7 FOREIGN KEY (away_player_7) REFERENCES Player (player_api_id),
CONSTRAINT fk_match_away_player_8 FOREIGN KEY (away_player_8) REFERENCES Player (player_api_id),
CONSTRAINT fk_match_away_player_9 FOREIGN KEY (away_player_9) REFERENCES Player (player_api_id),
CONSTRAINT fk_match_away_player_10 FOREIGN KEY (away_player_10) REFERENCES Player (player_api_id),
CONSTRAINT fk_match_away_player_11 FOREIGN KEY (away_player_11) REFERENCES Player (player_api_id)
);
This schema describes the database's structure, including tables, columns, primary keys, foreign keys, and any relevant relationships or constraints.
Question:
Aaron Mooy refers to player_name = 'Aaron Mooy'; on 2016/2/4 refers to date LIKE '2016-02-04%';
What was the overall rating for Aaron Mooy on 2016/2/4?
Instructions:
- Make sure you only output the information that is asked in the question. If the question asks for a specific column, make sure to only include that column in the SELECT clause, nothing more.
- The generated query should return all of the information asked in the question without any missing or extra information.
- Before generating the final SQL query, please think through the steps of how to write the query.
Output Format:
In your answer, please enclose the generated SQL query in a code block:
```sql
-- Your SQL query
```
Take a deep breath and think step by step to find the correct SQL query.
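For illustration only, one query consistent with the schema and hints above (an example sketch, not necessarily the reference answer) is:
```sql
SELECT T2.overall_rating
FROM Player AS T1
JOIN Player_Attributes AS T2 ON T1.player_api_id = T2.player_api_id
WHERE T1.player_name = 'Aaron Mooy'
  AND T2.`date` LIKE '2016-02-04%';
```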