Commit 4f4ba442 authored by mashun1

omnisql

import os
import re
import json
import random
import sqlite3
import traceback
def merge_foreign_keys_to_create_table(create_stmts, fk_stmts):
# Extract foreign key constraint information
#ALTER TABLE "performance_metrics" ADD CONSTRAINT fk_performance_metrics_app_id FOREIGN KEY ("app_id") REFERENCES applications ("app_id");
fk_constraints = {}
for alter_statement in fk_stmts:
match = re.search(r'ALTER TABLE "(\w+)" ADD CONSTRAINT (\w+) FOREIGN KEY \("(\w+)"\) REFERENCES (\w+) \("(\w+)"\)', alter_statement)
if match:
table_name = match.group(1)
constraint_name = match.group(2)
column_name = match.group(3)
ref_table_name = match.group(4)
ref_column_name = match.group(5)
if table_name in fk_constraints:
fk_constraints[table_name].append(f'CONSTRAINT {constraint_name} FOREIGN KEY ("{column_name}") REFERENCES {ref_table_name} ("{ref_column_name}")')
else:
fk_constraints[table_name] = [f'CONSTRAINT {constraint_name} FOREIGN KEY ("{column_name}") REFERENCES {ref_table_name} ("{ref_column_name}")']
# Merge foreign key constraints into the CREATE TABLE statement
modified_create_table_statements = []
for create_statement in create_stmts:
match = re.search(r'CREATE TABLE "(\w+)"', create_statement)
if match:
table_name = match.group(1)
if table_name in fk_constraints:
for fk in fk_constraints[table_name]:
create_statement = create_statement.rstrip('\n);') + '), \n ' + fk + '\n);'
modified_create_table_statements.append(create_statement)
return modified_create_table_statements
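# Illustrative example (hypothetical statements). SQLite does not support
# "ALTER TABLE ... ADD CONSTRAINT", so the FK clauses are folded back into the CREATE TABLE:
#   create: CREATE TABLE "gas_samples" ( ... PRIMARY KEY ("sample_id")\n);
#   fk:     ALTER TABLE "gas_samples" ADD CONSTRAINT fk_gas_samples_dataset_id FOREIGN KEY ("dataset_id") REFERENCES datasets ("dataset_id");
# merge_foreign_keys_to_create_table([create], [fk]) returns the CREATE TABLE ending with:
#   ... PRIMARY KEY ("sample_id"),
#   CONSTRAINT fk_gas_samples_dataset_id FOREIGN KEY ("dataset_id") REFERENCES datasets ("dataset_id")
#   );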
def verify_ddl_in_transaction(ddl_stmts, db_id):
create_stmts = ddl_stmts['create_stmts']
insert_stmts = ddl_stmts['insert_stmts']
alter_stmts = ddl_stmts['alter_stmts']
fk_stmts = ddl_stmts['fk_stmts']
stmts = merge_foreign_keys_to_create_table(create_stmts, fk_stmts)
os.makedirs(f'synthetic_sqlite_databases/{db_id}', exist_ok=True)
try:
# connect db
conn = sqlite3.connect(f'synthetic_sqlite_databases/{db_id}/{db_id}.sqlite')
cursor = conn.cursor()
# PRAGMA foreign_keys is a no-op inside an open transaction, so disable it before BEGIN
cursor.execute('PRAGMA foreign_keys = OFF;')
# begin transaction
conn.execute('BEGIN TRANSACTION')
# CREATE TABLE
for stmt in stmts:
# print(stmt)
try:
cursor.execute(stmt)
except Exception as e:
# print("Exception: ", str(e))
continue
# INSERT INTO
for stmt in insert_stmts:
# print(stmt)
try:
cursor.execute(stmt)
except Exception as e:
# print("Exception: ", str(e))
continue
cursor.execute('PRAGMA foreign_keys = ON;')
# update values in foreign key columns
for alter_stmt in alter_stmts:
stmt = alter_stmt['alter_stmt']
values = alter_stmt['values']
# create an empty dict to fill placeholder
filled_values = {}
for i, value in enumerate(values):
tp = value['type']
rg = value['range']
v = random.randint(0, rg)
if tp == "TEXT":
v = str(v)
elif tp == "INTEGER":
v = int(v)
filled_values[f'id_{i}'] = v
stmt = stmt.format(**filled_values)
try:
cursor.execute(stmt)
except Exception as e:
# print("Exception: ", str(e))
continue
# commit transaction
conn.commit()
print("Transaction committed successfully.")
except Exception as e:
# if any error occurs, roll back the transaction
conn.rollback()
print("Transaction failed and rolled back. Error:", str(e))
raise  # re-raise the original exception so the caller (verify_schema) can report it
finally:
# close the connection
conn.close()
def convert_complex_type(sql_type):
"""Converts complex types such as Array and Struct to SQLite-compatible types."""
if "Array" in sql_type:
return "TEXT" # Convert Array to TEXT (as JSON-encoded strings)
elif "Struct" in sql_type:
return "TEXT" # Convert Struct to TEXT (as JSON-encoded strings)
else:
# Mapping for standard types
type_mapping = {
"INTEGER": "INTEGER",
"VARCHAR": "TEXT", # SQLite treats all VARCHAR as TEXT
"TEXT": "TEXT",
"REAL": "REAL",
"FLOAT": "REAL",
"DATE": "TEXT",
"TIME": "TEXT",
"BOOLEAN": "INTEGER" # SQLite uses INTEGER for boolean
}
return type_mapping.get(sql_type, "TEXT") # Default to TEXT if unknown type
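# Examples (derived from the mapping above): convert_complex_type("Array(INTEGER)") -> "TEXT",
# convert_complex_type("FLOAT") -> "REAL", and an unrecognized type such as "GEOMETRY" falls back to "TEXT".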
def format_value_for_sqlite(value, column_type):
"""Formats values for SQLite, including handling Array and Struct types."""
if "Array" in column_type or "Struct" in column_type:
# Convert complex types (Array, Struct) to JSON strings
return f"'{json.dumps(value)}'"
elif isinstance(value, str):
# Escape single quotes in strings using replace before f-string
value = value.replace("'", "''")
return f"'{value}'"
elif value is None:
return "NULL"
return str(value)
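# Examples: format_value_for_sqlite("O'Brien", "VARCHAR") -> "'O''Brien'" (single quotes escaped),
# format_value_for_sqlite(None, "INTEGER") -> "NULL",
# format_value_for_sqlite(["a", "b"], "Array(VARCHAR)") -> the JSON string ["a", "b"] wrapped in single quotes.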
def generate_sqlite_ddl(json_schema):
"""Generates SQLite DDL statements including primary and foreign keys, table descriptions, and sample row insertion."""
result = {}
ddl_statements = []
insert_stmts = []
foreign_key_statements = set()
foreign_keys_alter = {}
foreign_keys_alter_stmts = []
rows_cnt = {}
table_pk = {}
table_cols = {}
table_types = {}
for table in json_schema['tables']:
table_name = table['table_name']
table_description = table.get('table_description', '')
column_names = table['column_names']
column_types = table['column_types']
descriptions = table['column_descriptions']
primary_key = table.get('primary_key', [])
sample_rows = table.get('sample_rows', [])
# Step 1: Create table comment (table description as a comment)
# if table_description:
# ddl_statements.append(f'-- {table_description}')
# Step 2: Create table without foreign key constraints
columns_ddl = []
table_cols[table_name] = column_names
table_types[table_name] = column_types
for i, column_name in enumerate(column_names):
column_type = convert_complex_type(column_types[i])
description = descriptions[i]
columns_ddl.append(f'"{column_name}" {column_type} /* {description} */')
# Add primary key constraint
if primary_key:
table_pk[table_name] = primary_key
pk_columns = ', '.join(f'"{col}"' for col in primary_key)
columns_ddl.append(f'PRIMARY KEY ({pk_columns})')
ddl = f'CREATE TABLE "{table_name}" (\n ' + ',\n '.join(columns_ddl) + '\n);'
ddl_statements.append(ddl)
rows_cnt[table_name] = len(sample_rows)
# Insert sample rows
if sample_rows:
for idx, row in enumerate(sample_rows):
# if idx > 2: #
# break
# Find the index of the primary key column
pk_indices = [column_names.index(key) for key in primary_key]
values = [format_value_for_sqlite(value, column_types[i]) for i, value in enumerate(row)]
for pk_idx in pk_indices:
type_str = convert_complex_type(column_types[pk_idx])
if type_str == 'TEXT':
values[pk_idx] = str(idx)
elif type_str == 'INTEGER':
values[pk_idx] = idx
elif type_str == "REAL":
values[pk_idx] = float(idx)
if len(column_names) != len(values):
continue
values = ", ".join([str(value) for value in values])
# print(values)
insert_stmt = f'INSERT INTO "{table_name}" ({", ".join(column_names)}) VALUES ({values});'
insert_stmts.append(insert_stmt)
table_sets = {}
for table_name, pks in table_pk.items():
table_sets[table_name] = set(pks)
for fk in json_schema['foreign_keys']:
table_name = fk['source_table']
src_cols = fk['column_in_source_table'] if type(fk['column_in_source_table']) == list else [fk['column_in_source_table']]
ref_cols = fk['column_in_referenced_table'] if type(fk['column_in_referenced_table']) == list else [fk['column_in_referenced_table']]
real_src_cols = []
real_ref_cols = []
for src_col, ref_col in zip(src_cols, ref_cols):
if src_col in table_sets[table_name]:
continue
real_ref_cols.append(ref_col)
real_src_cols.append(src_col)
if len(real_src_cols) == 0:
continue
fk_source_cols = ', '.join(f'"{col}"' for col in real_src_cols)
fk_ref_table = fk['referenced_table']
fk_ref_cols = ', '.join(f'"{col}"' for col in real_ref_cols)
column_names = table_cols[table_name]
column_types = table_types[table_name]
fk_stmt = (f'ALTER TABLE "{table_name}" '
f'ADD CONSTRAINT fk_{table_name}_{"_".join(real_src_cols)} '
f'FOREIGN KEY ({fk_source_cols}) REFERENCES {fk_ref_table} ({fk_ref_cols});')
if fk_stmt in foreign_key_statements:
continue
foreign_key_statements.add(fk_stmt)
if table_name in foreign_keys_alter:
for i in range(len(real_src_cols)):
foreign_keys_alter[table_name]['ref_table'].append(fk_ref_table)
foreign_keys_alter[table_name]['fk_cols'].extend(real_src_cols)
foreign_keys_alter[table_name]['fk_types'].extend([convert_complex_type(column_types[column_names.index(fk)]) for fk in real_src_cols])
else:
foreign_keys_alter[table_name] = {
"src_table": table_name,
"ref_table": [fk_ref_table],
"fk_cols": real_src_cols,
"fk_types": [convert_complex_type(column_types[column_names.index(fk)]) for fk in real_src_cols],
"pk_cols": table_pk[table_name],
"pk_types": [convert_complex_type(column_types[column_names.index(pk)]) for pk in table_pk[table_name]]
}
# for stmt in ddl_statements:
# pass
# Alter table for foreign key constraint DDL
for table_name, fk_alter in foreign_keys_alter.items():
source_table = fk_alter["src_table"]
ref_table = fk_alter["ref_table"]
src_row_num = rows_cnt[source_table]
ref_row_num = [rows_cnt[ref] for ref in ref_table]
pk_cols = fk_alter["pk_cols"]
pk_types = fk_alter["pk_types"]
cols = fk_alter["fk_cols"]
types = fk_alter["fk_types"]
for i in range(src_row_num):
ddl_stmt = f"UPDATE {source_table} SET "
fk_des = []
for j, col, tp in zip(range(len(cols)), cols, types):
id = random.randint(0, ref_row_num[j]-1)
fk_des.append({"type": tp, "range": ref_row_num[j]-1})
if tp == "TEXT":
id = str(id)
elif tp == "REAL":
id = float(id)
ddl_stmt += (f"{col}"+" = {id_"+str(j)+"}, ")
ddl_stmt = ddl_stmt.strip()[:-1] + " WHERE "
for j, pk, ptp in zip(range(len(pk_cols)) , pk_cols, pk_types):
i_v = i
if ptp == "TEXT":
i_v = str(i_v)
elif ptp == "REAL":
i_v = float(i_v)
if j == 0:
ddl_stmt += f"{pk} = {i_v}"
else:
ddl_stmt += f" and {pk} = {i_v}"
ddl_stmt += ";"
foreign_keys_alter_stmts.append({"alter_stmt": ddl_stmt, "values": fk_des})
# execute update
# for stmt in foreign_key_statements:
# pass
result["create_stmts"] = ddl_statements
result["insert_stmts"] = insert_stmts
result["alter_stmts"] = foreign_keys_alter_stmts
result["fk_stmts"] = list(foreign_key_statements)
return result
# Example usage:
json_schema_str = '''{
"tables": [
{
"table_name": "datasets",
"table_description": "Stores details of all greenhouse gas datasets collected from global sites.",
"column_names": ["dataset_id", "dataset_number", "site_id", "category", "gas_name", "sampling_method", "frequency", "year", "download_link", "readme_link"],
"column_types": ["INTEGER", "INTEGER", "INTEGER", "VARCHAR", "VARCHAR", "VARCHAR", "VARCHAR", "INTEGER", "VARCHAR", "VARCHAR"],
"column_descriptions": [
"Unique identifier for each dataset",
"Number assigned to the dataset",
"Reference to the site where the data was collected",
"Category of the data (e.g., Greenhouse Gases)",
"Name of the gas being monitored",
"Method of sampling (e.g., Surface PFP, Aircraft PFP, Flask)",
"Sampling frequency (e.g., Discrete, Continuous)",
"Year when the data was collected",
"Link to download the dataset",
"Link to the readme or metadata of the dataset"
],
"primary_key": ["dataset_id"],
"sample_rows": [
[151, 151, 1, "Greenhouse Gases", "Carbon Dioxide(CO2)", "Surface PFP", "Discrete", 2023, "download_link_151", "readme_link_151"],
[152, 152, 2, "Greenhouse Gases", "Carbon Dioxide(CO2)", "Aircraft PFP", "Discrete", 2023, "download_link_152", "readme_link_152"]
]
},
{
"table_name": "sites",
"table_description": "Details of the sites where greenhouse gas samples are collected.",
"column_names": ["site_id", "site_name", "location", "country", "contact_email"],
"column_types": ["INTEGER", "VARCHAR", "VARCHAR", "VARCHAR", "VARCHAR"],
"column_descriptions": [
"Unique identifier for each site",
"Name of the site",
"Geographical location of the site",
"Country where the site is located",
"Contact email for the site or environmental team"
],
"primary_key": ["site_id"],
"sample_rows": [
[1, "West Branch, Iowa", "West Branch, Iowa, United States", "USA", "contact@westbranch.us"],
[2, "Walnut Grove, California", "Walnut Grove, California, United States", "USA", "contact@walnutgrove.us"]
]
},
{
"table_name": "sampling_methods",
"table_description": "Details of various sampling methods used for collecting air samples.",
"column_names": ["method_id", "method_name", "description"],
"column_types": ["INTEGER", "VARCHAR", "TEXT"],
"column_descriptions": [
"Unique identifier for each sampling method",
"Name of the sampling method (e.g., Surface PFP, Aircraft PFP)",
"Detailed description of the sampling method"
],
"primary_key": ["method_id"],
"sample_rows": [
[1, "Surface PFP", "Surface flask sampling for air composition"],
[2, "Aircraft PFP", "Aircraft-based flask sampling for higher altitude air"]
]
},
{
"table_name": "gas_samples",
"table_description": "Raw data of the gas concentrations measured at each site.",
"column_names": ["sample_id", "dataset_id", "gas_name", "concentration", "measurement_date", "measurement_time"],
"column_types": ["INTEGER", "INTEGER", "VARCHAR", "FLOAT", "DATE", "TIME"],
"column_descriptions": [
"Unique identifier for each gas sample",
"Reference to the dataset from which the sample is drawn",
"Name of the gas measured (e.g., CO2, CH4)",
"Concentration of the gas in ppm (parts per million)",
"Date of the measurement",
"Time of the measurement"
],
"primary_key": ["sample_id"],
"sample_rows": [
[1, 151, "Carbon Dioxide(CO2)", 405.2, "2023-05-01", "12:00:00"],
[2, 152, "Carbon Dioxide(CO2)", 407.8, "2023-05-02", "12:30:00"]
]
},
{
"table_name": "users",
"table_description": "Details of users accessing the datasets and samples.",
"column_names": ["user_id", "user_name", "email", "organization", "role"],
"column_types": ["INTEGER", "VARCHAR", "VARCHAR", "VARCHAR", "VARCHAR"],
"column_descriptions": [
"Unique identifier for each user",
"Full name of the user",
"Email address of the user",
"Organization the user belongs to",
"Role of the user (e.g., researcher, admin, viewer)"
],
"primary_key": ["user_id"],
"sample_rows": [
[101, "Dr. Alice Green", "alice.green@enviroresearch.org", "EnviroResearch", "researcher"],
[102, "John Doe", "john.doe@climatelabs.org", "Climate Labs", "admin"]
]
}
],
"foreign_keys": [
{
"source_table": "datasets",
"column_in_source_table": "site_id",
"referenced_table": "sites",
"column_in_referenced_table": "site_id"
},
{
"source_table": "gas_samples",
"column_in_source_table": "dataset_id",
"referenced_table": "datasets",
"column_in_referenced_table": "dataset_id"
}
]
}'''
def verify_schema(json_schema, db_id):
# Convert the schema into DDL statements
try:
ddl_stmts = generate_sqlite_ddl(json_schema)
verify_ddl_in_transaction(ddl_stmts, db_id)
return True
except Exception as e:
print("Exception type:", type(e))
print("Exception message:", e)
# traceback.print_exc()
return False
# Print the DDL output
# print(ddl_output["create_stmts"])
# print(ddl_output["insert_stmts"])
# print(ddl_output["alter_stmts"])
# print(ddl_output["fk_stmts"])
if __name__ == "__main__":
verify_schema(json.loads(json_schema_str), "test_db")
\ No newline at end of file
import argparse
import json
import os
import re
import time
import json_repair
import openai
def parse_response(response):
domain_pattern = r'(?<=\[START_DOMAIN\])(.*?)(?=\[END_DOMAIN\])'
scenario_pattern = r'(?<=\[START_SCENARIO\])(.*?)(?=\[END_SCENARIO\])'
schema_pattern = r'(?<=\[START_DATABASE_SCHEMA\])(.*?)(?=\[END_DATABASE_SCHEMA\])'
try:
domain_match = re.search(domain_pattern, response, re.DOTALL)
domain = domain_match.group(0).strip() if domain_match else None
scenario_match = re.search(scenario_pattern, response, re.DOTALL)
scenario = scenario_match.group(0).strip() if scenario_match else None
schema_match = re.search(schema_pattern, response, re.DOTALL)
schema = schema_match.group(0).strip() if schema_match else None
schema_dict = json_repair.loads(schema)
schema = json.dumps(schema_dict, indent=2, ensure_ascii=False)
return domain, scenario, schema
except Exception as e:
print(response)
print("Parsing Exception:", str(e))
return None, None, None
def llm_inference(model, base_url, prompts):
'''
This function leverages a large language model (LLM) to generate responses for a given list of prompts.
You can integrate your preferred LLM within this function.
Args:
model: The LLM to be used for inference.
prompts: A list of prompts for which the LLM will generate responses.
Returns:
A list of dictionaries containing the prompt, the generated response, and extracted components
(domain, scenario, schema) from the response. Invalid responses are filtered out.
'''
client = openai.OpenAI(
base_url=base_url,
api_key="EMPTY"
)
# Generate responses using the LLM (each prompt corresponds to one response)
# responses = None # Replace this with the actual LLM call, e.g., model.generate(prompts, temperature=0, n=1)
responses = []
for prompt in prompts:
response = client.chat.completions.create(
model=model,
messages=[{"role":"user", "content": prompt}],
max_tokens=4196,
temperature=0.8
)
responses.append(response.choices[0].message.content.strip())
# Initialize a list to store the processed results
results = []
# Iterate over prompts and their corresponding responses
for prompt, response in zip(prompts, responses):
# Parse the response to extract domain, scenario, and schema
domain, scenario, schema = parse_response(response)
# Filter out invalid responses where any component is missing
if domain is None or scenario is None or schema is None:
continue
# Append valid results to the list
results.append({
"prompt": prompt,
"generated_content": {
"response": response,
"domain": domain,
"scenario": scenario,
"schema": schema
}
})
return results
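# Minimal usage sketch (the model name and vLLM endpoint below are placeholders):
#   results = llm_inference("your-model-name", "http://127.0.0.1:8000/v1", prompts[:2])
#   for r in results:
#       print(r["generated_content"]["domain"], r["generated_content"]["scenario"])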
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--model", type = str)
parser.add_argument("--nums", type=int, default=None)
parser.add_argument("--base_url", type=str)
args = parser.parse_args()
print(args)
prompts = json.load(open("./prompts/prompts_schema_synthesis.json"))[:args.nums]
output_file = "./results/schema_synthesis.json"
results = llm_inference(args.model, args.base_url, prompts)
with open(output_file, "w", encoding = "utf-8") as f:
f.write(json.dumps(results, indent = 2, ensure_ascii = False))
# Stylized Natural Language Question Synthesis
This is the third step of our data synthesis framework, dedicated to generating stylized natural language questions for the synthesized SQL queries.
## Step 1: Question Generation
Generate stylized natural language questions.
```bash
# Create the prompts used for question generation
mkdir prompts
python3 generate_question_synthesis_prompts.py
```
```bash
# Generate questions for the synthesized SQL queries
mkdir results
python3 synthesize_question.py --model model_name --base_url vllm_serve_url(http://x.x.x.x:8000/v1)
```
## Step 2: Post-Processing
```bash
# Run semantic consistency selection to ensure the generated questions align closely with their corresponding SQL queries
export HF_ENDPOINT=https://hf-mirror.com
python3 post_process_questions.py
```
\ No newline at end of file
# Stylized Natural Language Question Synthesis
This is the third step in our data synthesis framework, dedicated to generating stylized natural language questions for synthetic SQL queries.
## Step 1: Question Generation
Generate stylized natural language questions.
1. Run `python3 generate_question_synthesis_prompts.py` to create prompts for question generation.
2. Execute `python3 synthesize_question.py` to generate questions for the synthesized SQL queries. Note: Ensure the `llm_inference()` function is implemented to integrate your preferred LLM. For each prompt (SQL query), we sample multiple responses (questions) with a temperature of `0.8`.
## Step 2: Post-Processing
1. Execute `python3 post_process_questions.py` to perform semantic consistency selection, ensuring the generated questions align closely with their corresponding SQL queries.
2. The final synthetic `<question, SQL>` pairs will be saved to `./results/question_and_sql_pairs.json`.
\ No newline at end of file
import json
import os
import random
import sqlite3
import numpy as np
import re
from tqdm import tqdm
style2desc = {
"Formal": '''**Formal Style**
- Uses standard grammar and vocabulary.
- Example: Find all students older than 18 years and return their home addresses.''',
"Colloquial": '''**Colloquial Style**
- Employs informal vocabulary and expressions.
- Example: Hey! Could you help me find all the students who are over 18? I'd love to know their names and where they live.''',
"Imperative": '''**Imperative Style**
- Uses command or directive sentences.
- Example: Could you please gather all the students who are older than 18? I really need to know their names and where they live!''',
"Interrogative": '''**Interrogative Style**
- Uses question forms.
- Example: Could you tell me which students are older than 18 and what their home addresses are?''',
"Descriptive": '''**Descriptive Style**
- Uses detailed descriptions with contextual information.
- Example: I want to know the names and home addresses of all students older than 18.''',
"Concise": '''**Concise Style**
- Uses short sentences.
- Example: Students older than 18, return their names and addresses.''',
"Vague": '''**Vague Style**
- Includes ambiguous vocabulary requiring inference.
- Example: What are the names and addresses of those older students? (External Knowledge: 'older students' refers to age >= 18.)''',
"Metaphorical": '''**Metaphorical Style**
- Uses metaphors or metaphorical expressions.
- Example: Find the names and addresses of those who have reached adulthood. (External Knowledge: 'reached adulthood' refers to age >= 18.)''',
"Multi-turn Dialogue": '''**Multi-turn Dialogue Style**
- This involves a dialogue to clarify the user's query needs.
- Example: [{"User": "I want to query some student information."}, {"Assistant": "Which students' information would you like to query?"}, {"User": "Students older than 18."}, {"Assistant": "What other information would you like to know about them?"}, {"User": "Names and addresses."}, {"Assistant": "Is there anything else you need?"}, {"User": "No."}, {"Assistant": "OK, I will help you translate your request into an SQL query."}]'''
}
steps_wo_ek = '''1. **Explain the SQL Query:** Provide a detailed explanation of what the query does.
2. **Generate a Question:** Formulate a natural language question based on the SQL query and explanation.'''
steps_w_ek = '''1. **Explain the SQL Query:** Provide a detailed explanation of what the query does.
2. **Generate a Question:** Formulate a natural language question based on the SQL query and explanation.
3. **External Knowledge:** For Vague or Metaphorical styles, include external knowledge to enhance clarity.'''
steps_multi_round = '''1. **Explain the SQL Query:** Provide a detailed explanation of what the query does.
2. **Generate a Dialogue:** Create a conversation between the User and the Assistant based on the SQL query and its explanation.'''
guidelines_wo_ek = '''1. Clearly describe the columns being selected by the SQL query. For example:
- "SELECT * ... FROM ..." means "Find all ...";
- "SELECT f.check_date, f.status, f.remarks, c.year, c.year_min, c.year_max, c.year_average, c.data_quality_score FROM ..." means "Return the check dates, statuses, remarks, years, minimum years, maximum years, average years, and quality scores for ...".
2. Ensure the natural language question accurately captures the semantics of the SQL query, including conditions such as predicates, `ORDER BY`, and `LIMIT` clauses.'''
guidelines_w_ek = '''1. Clearly describe the columns being selected by the SQL query. For example:
- "SELECT * ... FROM ..." means "Find all ...";
- "SELECT f.check_date, f.status, f.remarks, c.year, c.year_min, c.year_max, c.year_average, c.data_quality_score FROM ..." means "Return the check dates, statuses, remarks, years, minimum years, maximum years, average years, and quality scores for ...".
2. Ensure the natural language question accurately captures the semantics of the SQL query, including conditions such as predicates, `ORDER BY`, and `LIMIT` clauses.
3. If necessary, incorporate external knowledge using multiple entries separated by semicolons (";"). These can include formulas, common sense, domain-specific knowledge, or extended context, such as information from long documents. Each entry should be concise.'''
guidelines_multi_round = '''1. Clearly describe the columns being selected by the SQL query. For example:
- "SELECT * ... FROM ..." means "Find all ...";
- "SELECT f.check_date, f.status, f.remarks, c.year, c.year_min, c.year_max, c.year_average, c.data_quality_score FROM ..." means "Return the check dates, statuses, remarks, years, minimum years, maximum years, average years, and quality scores for ...".
2. Ensure the conversation accurately captures the semantics of the SQL query, including conditions such as predicates, `ORDER BY`, and `LIMIT` clauses.'''
output_format_wo_ek = '''Please structure your response as follows:
[EXPLANATION-START]
(SQL Explanation)
[EXPLANATION-END]
[QUESTION-START]
(Natural Language Question)
[QUESTION-END]
- **SQL Explanation**: Provide a clear and detailed explanation of the SQL query, enclosed within [EXPLANATION-START] and [EXPLANATION-END].
- **Natural Language Question**: Translate the SQL query into a natural language question, enclosed within [QUESTION-START] and [QUESTION-END].'''
output_format_w_ek = '''Please structure your response as follows:
[EXPLANATION-START]
(SQL Explanation)
[EXPLANATION-END]
[QUESTION-START]
(Natural Language Question)
[QUESTION-END]
[EXTERNAL-KNOWLEDGE-START]
(External Knowledge)
[EXTERNAL-KNOWLEDGE-END]
- **SQL Explanation**: Provide a clear and detailed explanation of the SQL query, enclosed within [EXPLANATION-START] and [EXPLANATION-END].
- **Natural Language Question**: Translate the SQL query into a natural language question, enclosed within [QUESTION-START] and [QUESTION-END].
- **External Knowledge**: Include any relevant external knowledge if applicable, enclosed within [EXTERNAL-KNOWLEDGE-START] and [EXTERNAL-KNOWLEDGE-END]. Leave this section blank if not needed.'''
output_format_multi_round = '''Please structure your response as follows:
[EXPLANATION-START]
(SQL Explanation)
[EXPLANATION-END]
[QUESTION-START]
(Natural Language Question, in the format of [{"User": ...}, {"Assistant": ...}, {"User": ...}, ....])
[QUESTION-END]
- **SQL Explanation**: Provide a clear and detailed explanation of the SQL query, enclosed within [EXPLANATION-START] and [EXPLANATION-END].
- **Natural Language Question**: Convert the SQL query into a multi-round dialogue, enclosed within [QUESTION-START] and [QUESTION-END]. Represent this as a list that captures multiple rounds of conversation between the User and the Assistant.'''
instruction_wo_ek = "Based on the above information, follow the reasoning steps to generate the explanation and the question corresponding to the SQL query."
instruction_w_ek = "Based on the above information, follow the reasoning steps to generate the explanation, the question, and the external knowledge corresponding to the SQL query."
instruction_multi_round = "Based on the above information, follow the reasoning steps to generate the explanation and the dialogue corresponding to the SQL query."
def obtain_db_schema(db_file_dir):
conn = sqlite3.connect(db_file_dir)
cursor = conn.cursor()
cursor.execute("SELECT name, sql FROM sqlite_master WHERE type='table';")
tables = cursor.fetchall()
table_names = []
create_statements = []
for table in tables:
table_name, create_statement = table
table_names.append(table_name)
create_statements.append(create_statement)
cursor.close()
conn.close()
return table_names, create_statements
# NOTE: When columns with the same names exist in different tables, more detailed design considerations are necessary
def extract_column_descriptions(create_statements):
column_name2column_desc = dict()
# Regular expression to match column definitions
pattern = r'"(\w+)"\s+\w+\s*/\*\s*(.*?)\s*\*/'
for create_statement in create_statements:
# Find all matches in the string
matches = re.findall(pattern, create_statement)
# Print the results
for column_name, description in matches:
column_name = column_name.lower()
if column_name not in column_name2column_desc:
column_name2column_desc[column_name] = description
return column_name2column_desc
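# Example: a column definition produced by the database synthesis step, e.g.
#   "site_id" INTEGER /* Unique identifier for each site */
# is parsed into {"site_id": "Unique identifier for each site"} (column names are lower-cased).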
if __name__ == "__main__":
random.seed(42)
db_path = "../database_synthesis/synthetic_sqlite_databases"
sql_infos = json.load(open("../sql_synthesis/results/synthetic_sqls.json"))
question_synthesis_template = open("./prompt_templates/question_synthesis_prompt.txt").read()
styles = ["Formal", "Colloquial", "Imperative", "Interrogative", "Descriptive", "Concise", "Vague", "Metaphorical", "Multi-turn Dialogue"]
print(sql_infos[0])
db_ids = list(set([sql["db_id"] for sql in sql_infos]))
print(len(db_ids))
db_id2column_info = dict()
for db_id in tqdm(db_ids):
table_names, create_statements = obtain_db_schema(os.path.join(db_path, db_id, db_id + ".sqlite"))
db_id2column_info[db_id] = extract_column_descriptions(create_statements)
prompts = []
for sql_info in tqdm(sql_infos):
style_name = random.sample(styles, 1)[0]
column_name2column_desc = db_id2column_info[sql_info["db_id"]]
used_column_name2column_desc = dict()
for column_name, column_desc in column_name2column_desc.items():
if column_name.lower() in sql_info["sql"].lower():
used_column_name2column_desc[column_name] = column_desc
if style_name in ["Vague", "Metaphorical"]: # "Vague" and "Metaphorical" styles require external knowledge
steps = steps_w_ek
guidelines = guidelines_w_ek
instruction = instruction_w_ek
output_format = output_format_w_ek
elif style_name == "Multi-turn Dialogue": # the "Multi-turn Dialogue" style uses a special multi-round format
steps = steps_multi_round
guidelines = guidelines_multi_round
instruction = instruction_multi_round
output_format = output_format_multi_round
else:
steps = steps_wo_ek
guidelines = guidelines_wo_ek
instruction = instruction_wo_ek
output_format = output_format_wo_ek
prompt = question_synthesis_template.format(
style_desc = style2desc[style_name].strip(),
engine = "SQLite",
column_info = json.dumps(used_column_name2column_desc, indent = 2, ensure_ascii = False).strip(),
sql = sql_info["sql"].strip(),
steps = steps.strip(),
guidelines = guidelines.strip(),
output_format = output_format.strip(),
instruction = instruction.strip()
)
sql_info["style"] = style_name
sql_info["prompt"] = prompt
with open("prompts/question_synthesis_prompts.json", "w", encoding="utf-8") as f:
f.write(json.dumps(sql_infos, indent=2, ensure_ascii=False))
\ No newline at end of file
import json
import re
import time
import random
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import numpy as np
import math
from scipy.spatial.distance import cdist
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
def visualize_embeddings(embeddings, min_index):
pca = PCA(n_components=2)
embeddings_2d = pca.fit_transform(embeddings)
plt.figure(figsize=(8, 6))
plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], color='red', label='Other Points')
plt.scatter(embeddings_2d[min_index, 0], embeddings_2d[min_index, 1], color='blue', label='Central Point', s=100)
plt.legend()
plt.title('2D PCA of Embeddings')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.savefig(f"embeddings/figure-{random.randint(0,10000000000)}")
def parse_llm_response(response, style):
explanation_pattern = re.compile(r'\[EXPLANATION-START\](.*?)\[EXPLANATION-END\]', re.DOTALL)
question_pattern = re.compile(r'\[QUESTION-START\](.*?)\[QUESTION-END\]', re.DOTALL)
external_knowledge_pattern = re.compile(r'\[EXTERNAL-KNOWLEDGE-START\](.*?)\[EXTERNAL-KNOWLEDGE-END\]', re.DOTALL)
explanation_match = explanation_pattern.search(response)
question_match = question_pattern.search(response)
external_knowledge_match = external_knowledge_pattern.search(response)
explanation_content = explanation_match.group(1).strip() if explanation_match else ""
question_content = question_match.group(1).strip() if question_match else ""
external_knowledge_content = external_knowledge_match.group(1).strip() if external_knowledge_match else ""
if style == "Multi-turn Dialogue":
# parse dialogue
try:
dialog = ""
for turn in json.loads(question_content):
dialog += "**" + list(turn.keys())[0] + "**: " + list(turn.values())[0] + "\n"
question_content = dialog
except Exception as e:
print(e)
return None
if explanation_content == "" or question_content == "":
return None
else:
return {
"question": question_content.strip(),
"explanation": explanation_content.strip(),
"external_knowledge": external_knowledge_content.strip()
}
def integrate_info(sql2question_prompt_info, question_info):
if sql2question_prompt_info["db_id"].endswith(".db"):
db_id = sql2question_prompt_info["db_id"][:-3]
else:
db_id = sql2question_prompt_info["db_id"]
return {
"db_id": db_id,
"sql": sql2question_prompt_info["sql"],
"sql_result_column_count": sql2question_prompt_info["column_count"],
"sql_result_rows_count": sql2question_prompt_info["rows"],
"sql_complexity": sql2question_prompt_info["complexity"],
"question_style": sql2question_prompt_info["style"],
"sql_explanation": question_info["explanation"],
"question": question_info["question"],
"external_knowledge": question_info["external_knowledge"]
}
def edu_distance(vector1, vector2):
distance = 0
for num1, num2 in zip(vector1, vector2):
distance += (num1-num2) ** 2
return math.sqrt(distance)
if __name__ == "__main__":
input_dataset = json.load(open("./results/question_synthesis.json"))
output_file = "./results/question_and_sql_pairs.json"
print("loading SentenceTransformer....")
embedding_model = SentenceTransformer(model_name_or_path = "sentence-transformers/all-mpnet-base-v2", device = "cuda:0")
valid_questions_num = []
result_dataset = []
for data in tqdm(input_dataset):
question_infos = []
for response in data["responses"]:
question_info = parse_llm_response(response, data["style"])
if question_info is not None:
question_infos.append(question_info)
valid_questions_num.append(len(question_infos))
if len(question_infos) == 0: # no valid question
continue
elif len(question_infos) == 1: # only one valid question
result_dataset.append(integrate_info(data, question_infos[0]))
elif len(question_infos) == 2: # two valid questions
# we randomly select one of them
result_dataset.append(integrate_info(data, random.sample(question_infos, 1)[0]))
else: # more than two valid questions
# we vote the final question according to the EK+question embeddings
texts = [question_info["external_knowledge"] + " " + question_info["question"] for question_info in question_infos]
texts = [text.strip() for text in texts]
# we vote the final question according to the question embeddings
# texts = [question_info["question"] for question_info in question_infos]
embeddings = embedding_model.encode(texts)
# find the index of the question at the central point
distance_matrix = cdist(embeddings, embeddings, metric = 'cosine') # metric='cityblock' or metric='euclidean'
distance_sums = distance_matrix.sum(axis = 1)
min_index = np.argmin(distance_sums)
result_dataset.append(integrate_info(data, question_infos[min_index]))
# print("EK:\n", integrate_info(data, question_infos[min_index])["external_knowledge"])
# print("Question:\n", integrate_info(data, question_infos[min_index])["question"])
# print("SQL:\n", integrate_info(data, question_infos[min_index])["sql"])
# print("---------------------------------------")
# visualize_embeddings(embeddings, min_index)
with open(output_file, "w", encoding="utf-8") as f:
f.write(json.dumps(result_dataset, indent=2, ensure_ascii=False))
question_num2count = dict()
for num in valid_questions_num:
if num in question_num2count:
question_num2count[num] += 1
else:
question_num2count[num] = 1
print(question_num2count)
\ No newline at end of file
**Task Overview**
Your task is to create a high-quality natural language question based on a given SQL query and other information.
**Style**
The natural language question should follow this style:
{style_desc}
**Database Engine**
{engine}
**Column Information**
Below are column names and their corresponding descriptions:
{column_info}
**SQL Query**
Given SQL query:
```sql
{sql}
```
**Reasoning Steps**
{steps}
**Guidelines**
{guidelines}
**Output Format**
{output_format}
**Instruction**
{instruction}
\ No newline at end of file
import argparse
import json
from tqdm import tqdm
import openai
def llm_inference(model, base_url, dataset):
"""
Perform LLM inference to generate multiple responses for each prompt in the dataset.
Args:
model: The LLM used for inference.
dataset: A list of dictionaries.
Returns:
A list of dictionaries, where each dictionary includes the original data and the corresponding generated responses.
"""
client = openai.OpenAI(
base_url=base_url,
api_key="EMPTY"
)
prompts = [data["prompt"] for data in dataset]
# Placeholder for storing generated responses for each prompt
# Each element in `responses_list` is a list of responses (strings) corresponding to a prompt.
responses_list = [] # Replace this with your actual response generation logic.
for prompt in prompts:
# sample several candidate questions per prompt; the sample count below is illustrative, adjust as needed
response = client.chat.completions.create(
model=model,
messages=[{"role":"user", "content": prompt}],
max_tokens=4196,
temperature=0.8,
n=8
)
responses_list.append([choice.message.content.strip() for choice in response.choices])
# Initialize an empty list to store the results
results = []
# Iterate through the dataset and the corresponding responses
for data, responses in zip(dataset, responses_list):
# Add the generated responses to the current data entry
data["responses"] = responses
# Append the updated data entry to the results
results.append(data)
return results
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--model", type = str)
parser.add_argument("--base_url", type=str)
opt = parser.parse_args()
print(opt)
input_dataset = json.load(open("./prompts/question_synthesis_prompts.json"))
output_file = "./results/question_synthesis.json"
results = llm_inference(opt.model, opt.base_url, input_dataset)
with open(output_file, "w", encoding = "utf-8") as f:
f.write(json.dumps(results, indent = 2, ensure_ascii = False))
\ No newline at end of file
# Complexity-Aware SQL Query Generation
## Step 1: SQL Query Generation
Generate SQL queries by leveraging database schemas, database values, query complexity, and SQLite-supported functions.
```bash
# Create the prompts used for SQL query generation
mkdir prompts
python3 generate_sql_synthesis_prompts.py
```
```bash
# Generate SQL queries with an LLM
python3 synthesize_sql.py --model model_name --base_url vllm_serve_url(http://x.x.x.x:8000/v1)
```
## Step 2: Post-Processing
Refine the generated SQL queries to ensure quality and remove invalid or redundant queries.
```bash
python3 post_process_sqls.py
```
\ No newline at end of file
# Complexity-Aware SQL Query Generation
This is the second step in our data synthesis framework, focused on generating complexity-aware SQL queries based on synthetic databases.
## Step 1: SQL Query Generation
Generate SQL queries by leveraging database schemas, database values, query complexity, and SQLite-supported functions.
1. Execute `python3 generate_sql_synthesis_prompts.py` to create prompts for SQL query generation.
2. Run `python3 synthesize_sql.py` to generate SQL queries using LLMs. (Note: Implement the `llm_inference()` function to integrate your preferred LLM.)
## Step 2: Post-Processing
Refine the generated SQL queries to ensure quality and remove invalid or redundant queries:
1. Run `python3 post_process_sqls.py` to:
- Discard non-SELECT queries.
- Remove queries with syntax errors or execution timeouts.
- Deduplicate queries based on their templates.
2. The final synthetic SQL queries will be saved in `./results/synthetic_sqls.json`.
import json
import os
import random
import sqlite3
import numpy as np
from tqdm import tqdm
sql_func_template = '''
### SQL Functions
You may consider one or more of the following SQL functions while generating the query:
{sql_funcs}
Important tips:
Except for the functions listed above, you may use any other functions as long as they conform to the syntax of the database engine.
'''
insert_stmts_template = '''
### INSERT INTO Statements
Below are several `INSERT INTO` statements. Use these to help generate predicates (i.e., `WHERE` clauses) in your SQL query:
{insert_statements}
'''
simple_criterion = '''**Criteria:**
Simple SQL queries may satisfy one or more of the following criteria:
- Simple queries should select data from a single table only.
- Basic aggregate functions are permitted, such as `COUNT`, `SUM`, `AVG`, `MIN`, `MAX`.
- No joins are allowed; the query must operate on a single table.
**Example of Simple SQL Query:**
```sql
SELECT name, department_name
FROM employees
WHERE level > 5
ORDER BY age DESC;
```'''
moderate_criterion = '''**Criteria:**
Moderate SQL queries may satisfy one or more of the following criteria:
- Involves table joins, such as `JOIN`, `INNER JOIN`, `LEFT JOIN`, `CROSS JOIN`, etc.
- Includes subqueries within the `SELECT` or `WHERE` clauses.
- Utilizes aggregate functions alongside a `GROUP BY` clause.
- Contains complex `WHERE` conditions, including `IN`, `BETWEEN`, `LIKE`.
- Incorporates a `HAVING` clause to filter aggregated results.
- Uses aggregate functions like `COUNT`, `SUM`, `AVG`, `MIN`, `MAX`, etc.
**Example of Moderate SQL Query:**
```sql
SELECT e.name, d.department_name, AVG(s.salary) AS average_salary
FROM employees e
INNER JOIN departments d ON e.department_id = d.department_id
LEFT JOIN salaries s ON e.employee_id = s.employee_id
WHERE e.age > 30 AND e.status = 'active'
GROUP BY e.name, d.department_name
HAVING AVG(s.salary) > 50000;
```'''
complex_criterion = '''**Criteria:**
Complex SQL queries may satisfy one or more of the following criteria:
- Contains complex nested subqueries.
- Utilizes multiple types of joins, including self-joins.
- Includes window functions, such as `ROW_NUMBER`, `RANK`, etc.
- Uses Common Table Expressions (CTEs) for improved readability.
- Combines multiple aggregate functions.
- Involves complex `WHERE` and `HAVING` clauses with multiple conditions.
- Utilizes advanced functions and operators.
**Example of Complex SQL Query:**
```sql
WITH EmployeeCTE AS (
SELECT employee_id, name, department_id, ROW_NUMBER() OVER (PARTITION BY department_id ORDER BY salary DESC) AS rank
FROM employees
)
SELECT e.name, d.department_name
FROM EmployeeCTE e
INNER JOIN departments d ON e.department_id = d.department_id
WHERE e.rank <= 3;
```'''
highly_complex_criterion = '''**Criteria:**
Highly complex SQL queries may satisfy one or more of the following criteria:
- Includes multiple Common Table Expressions (CTEs) for readability.
- Combines nested subqueries and various joins.
- Utilizes recursive CTEs for hierarchical or recursive queries.
- Extensively uses advanced window functions.
- May involve `UNION` or `UNION ALL` to combine result sets.
- Implements complex logic with advanced analytical functions.
- Employs a wide range of SQL clauses and conditions.
- Utilizes a broad spectrum of SQL functions and advanced features.
**Example of Highly Complex SQL Query:**
```sql
WITH RECURSIVE EmployeeHierarchy AS (
SELECT employee_id, name, manager_id, department_id, 1 as level
FROM employees
WHERE manager_id IS NULL
UNION ALL
SELECT e.employee_id, e.name, e.manager_id, e.department_id, eh.level + 1
FROM employees e
JOIN EmployeeHierarchy eh ON e.manager_id = eh.employee_id
),
DepartmentSalaries AS (
SELECT eh.employee_id, eh.name, eh.level, d.department_name, s.salary, d.department_id
FROM EmployeeHierarchy eh
INNER JOIN departments d ON eh.department_id = d.department_id
INNER JOIN salaries s ON eh.employee_id = s.employee_id
),
DepartmentStats AS (
SELECT
d.department_id,
COUNT(e.employee_id) AS employee_count,
AVG(s.salary) AS average_salary
FROM employees e
INNER JOIN salaries s ON e.employee_id = s.employee_id
INNER JOIN departments d ON e.department_id = d.department_id
GROUP BY d.department_id
)
SELECT ds.name, ds.level,
SUM(ds.salary) OVER (PARTITION BY ds.department_id ORDER BY ds.level, ds.name) AS cumulative_salary
FROM DepartmentSalaries ds
INNER JOIN DepartmentStats dstat ON ds.department_id = dstat.department_id
ORDER BY ds.level, ds.name;
```'''
def obtain_db_schema(db_file_dir):
conn = sqlite3.connect(db_file_dir)
cursor = conn.cursor()
cursor.execute("SELECT name, sql FROM sqlite_master WHERE type='table';")
tables = cursor.fetchall()
table_names = []
create_statements = []
for table in tables:
table_name, create_statement = table
table_names.append(table_name)
create_statements.append(create_statement)
cursor.close()
conn.close()
return table_names, create_statements
def obtain_insert_statements(db_file_dir, table_names):
table_name2insert_statements = dict()
conn = sqlite3.connect(db_file_dir)
cursor = conn.cursor()
for table_name in table_names:
try:
cursor.execute(f'SELECT * FROM "{table_name}" LIMIT 2')
rows = cursor.fetchall()
column_names = [description[0] for description in cursor.description]
insert_statements = []
for row in rows:
values = ', '.join([f"'{str(value)}'" if isinstance(value, str) else str(value) for value in row])
insert_statement = f"INSERT INTO {table_name} ({', '.join(column_names)}) VALUES ({values});"
insert_statements.append(insert_statement)
# for statement in insert_statements:
# print(statement)
table_name2insert_statements[table_name] = insert_statements
except Exception as e:
print(e)
cursor.close()
conn.close()
return table_name2insert_statements
if __name__ == "__main__":
random.seed(42)
db_path = "../database_synthesis/synthetic_sqlite_databases"
prompt_template = open("./prompt_templates/sql_synthesis_prompt.txt", "r", encoding = "utf-8").read()
functions = json.load(open("./prompt_templates/sqlite_funcs.json"))
complexity2criterion = {
"Simple": simple_criterion,
"Moderate": moderate_criterion,
"Complex": complex_criterion,
"Highly Complex": highly_complex_criterion
}
db_names = os.listdir(db_path)
prompts = []
for db_name in tqdm(db_names):
try:
db_file_dir = os.path.join(db_path, db_name, db_name + ".sqlite")
table_names, create_statements = obtain_db_schema(db_file_dir)
table_name2insert_statements = obtain_insert_statements(db_file_dir, table_names)
for _ in range(0, 300):
complexity = random.sample(["Simple", "Moderate", "Complex", "Highly Complex"], 1)[0]
insert_statements = []
for table_name in table_names:
insert_statements += table_name2insert_statements.get(table_name, [])
if len(insert_statements) == 0:
db_value_prompt = ""
else:
if len(insert_statements) > 4:
insert_statements = random.sample(insert_statements, 4)
db_value_prompt = insert_stmts_template.format(insert_statements = "\n\n".join(insert_statements))
function_num = random.randint(0, 2)
if function_num == 0:
sql_function_prompt = "### SQL Functions\nYou can use any function supported by the database engine."
else:
sql_funcs = ""
sampled_functions = random.sample(functions, function_num)
for idx, func in enumerate(sampled_functions):
sql_funcs += f"Function {idx + 1}:\n" + func.strip() + "\n"
sql_function_prompt = sql_func_template.format(sql_funcs = sql_funcs)
column_count = np.random.geometric(0.6, 1)[0]
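# note: np.random.geometric(0.6) returns k >= 1 with P(k) = 0.4**(k-1) * 0.6, so most prompts
# request only a small number of selected columns (1 with prob. 0.6, 2 with prob. 0.24, ...)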
prompt = prompt_template.format(
schema_str = "\n\n".join(create_statements),
sql_function_prompt = sql_function_prompt.strip(),
db_value_prompt = db_value_prompt.strip(),
complexity = complexity,
criterion = complexity2criterion[complexity].strip(),
db_engine = "SQLite",
column_count = column_count
)
prompts.append({"prompt": prompt, "db_id": db_name})
except Exception as e:
print(e)
with open("./prompts/sql_synthesis_prompts.json", "w", encoding="utf-8") as f:
f.write(json.dumps(prompts, indent=2, ensure_ascii=False))
\ No newline at end of file
import json
import sqlite3
import os
import sys
import re
import time
from tqdm import tqdm
from func_timeout import func_timeout, FunctionTimedOut
import multiprocessing as mp
import ijson
def execute_sql(sql, db_path):
if sql.strip() == "":
return None, None
execution_result = None
column_count = None
conn = None
try:
conn = sqlite3.connect(db_path)
cursor = conn.cursor()
# start a transaction
cursor.execute("BEGIN")
# execute the SQL query
cursor.execute(sql)
execution_result = cursor.fetchall()
column_count = len(cursor.description)
# roll back the transaction to ensure that the database state is not changed
cursor.execute("ROLLBACK")
except Exception as e:
# print(f"An error occurred: {e}")
pass
finally:
if conn is not None:
conn.close()
return execution_result, column_count
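# Example: execute_sql("SELECT 1 AS a, 2 AS b", db_path) -> ([(1, 2)], 2);
# if the query is empty or execution fails, (None, None) is returned.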
def execute_wrapper(sample_idx, db_id, sql, complexity, timeout, db_dir):
try:
execution_result, column_count = func_timeout(timeout, execute_sql, args = (sql, os.path.join(db_dir, db_id, db_id + ".sqlite")))
if execution_result is None or column_count is None:
return [sample_idx, db_id, sql, complexity, 0, 0, 0]
else:
return [sample_idx, db_id, sql, complexity, 1, column_count, len(execution_result)]
except KeyboardInterrupt:
sys.exit(0)
except FunctionTimedOut:
return [sample_idx, db_id, sql, complexity, 0, 0, 0]
except Exception as e:
return [sample_idx, db_id, sql, complexity, 0, 0, 0]
def execute_callback(result):
sample_idx, db_id, sql, complexity, valid_flag, column_count, rows = result
if valid_flag == 1:
no_timeout_synthesized_sqls.append(
{"db_id": db_id, "sql": sql, "column_count": column_count, "rows": rows, "complexity": complexity}
)
# print("Done:", sample_idx)
def remove_timeout_sqls_parallel(synthesized_sqls, db_dir, num_cpus = 20, timeout = 1):
'''Execute the SQL queries in parallel; queries that fail or time out are dropped via the callback'''
parallel_batch_size = 10240
batches = [synthesized_sqls[i: i+parallel_batch_size] for i in range(0, len(synthesized_sqls), parallel_batch_size)]
assert len(synthesized_sqls) == sum([len(batch_sqls) for batch_sqls in batches])
for batch_idx, batch_sqls in enumerate(batches):
print(f"execution process: {batch_idx+1}/{len(batches)}")
pool = mp.Pool(processes = num_cpus)
for sample_idx, sql_info in enumerate(batch_sqls):
pool.apply_async(
execute_wrapper,
args = (sample_idx, sql_info["db_id"], sql_info["sql"], sql_info["complexity"], timeout, db_dir),
callback = execute_callback
)
pool.close()
pool.join()
time.sleep(10)
def analyze_complexity(results):
complexity2num = dict()
for res in results:
complexity = res["complexity"]
if complexity in complexity2num:
complexity2num[complexity] += 1
else:
complexity2num[complexity] = 1
print(complexity2num)
def analyze_column_count(results):
column_count2num = dict()
for res in results:
column_count = res["column_count"]
if column_count in column_count2num:
column_count2num[column_count] += 1
else:
column_count2num[column_count] = 1
print(column_count2num)
def analyze_advanced_functions(results):
function2num = dict()
functions = json.load(open("prompt_templates/sqlite_funcs.json"))
functions = [func_desc.split("(")[0] for func_desc in functions]
for res in results:
sql = res["sql"]
for function in functions:
if function.lower()+"(" in sql.lower():
if function in function2num:
function2num[function] += 1
else:
function2num[function] = 1
print(function2num)
def analyze_used_tables_num(synthesized_sqls, db_id2table_names):
used_tables_num2count = dict()
for sql_info in tqdm(synthesized_sqls):
table_names_in_db = db_id2table_names[sql_info["db_id"]]
sql = sql_info["sql"]
if sql.endswith(";"):
sql = sql[:-1]
sql_tokens = sql.strip().lower().split()
# print(table_names_in_db)
# print(sql_tokens)
used_tables = set()
for table_name in table_names_in_db:
if table_name.lower() in sql_tokens:
used_tables.add(table_name.lower())
used_tables_num = len(used_tables)
# print(used_tables)
# print(used_tables_num)
# print("------------------------------------------")
if used_tables_num in used_tables_num2count:
used_tables_num2count[used_tables_num] += 1
else:
used_tables_num2count[used_tables_num] = 1
print(used_tables_num2count)
def filter_executable_sqls(synthesized_sqls, db_dir):
executable_sqls = []
for sql_info in tqdm(synthesized_sqls):
db_path = os.path.join(db_dir, sql_info["db_id"], sql_info["db_id"] + ".sqlite")
query_plan, _ = execute_sql("EXPLAIN QUERY PLAN " + sql_info["sql"], db_path)
if query_plan is not None:
sql_info["query_plan"] = str(query_plan)
executable_sqls.append(sql_info)
return executable_sqls
def filter_select_sqls(synthesized_sqls):
'''
Keep only SELECT-type queries (statements starting with SELECT or WITH).
'''
select_sqls = []
for sql_info in tqdm(synthesized_sqls):
# remove comments
sql_wo_comments = re.sub(r'/\*.*?\*/', '', sql_info["sql"], flags=re.DOTALL)
sql_wo_comments = re.sub(r'--.*', '', sql_wo_comments)
sql_wo_comments = sql_wo_comments.strip()
if sql_wo_comments.lower().startswith("select") or \
sql_wo_comments.lower().startswith("with"):
select_sqls.append(sql_info)
return select_sqls
def dedup_using_query_plan(synthesized_sqls):
unique_plans = set()
deduped_sqls = []
for sql_info in tqdm(synthesized_sqls):
query_plan = sql_info["query_plan"]
if query_plan not in unique_plans:
unique_plans.add(query_plan)
deduped_sqls.append(sql_info)
return deduped_sqls
def obtain_sql_template(sql):
# Handles single and double quoted strings, numbers, NULL, TRUE, FALSE
pattern = r"""
(?<!\w)'(?:\\.|[^'])*' | # single quoted strings
(?<!\w)"(?:\\.|[^"])*" | # double quoted strings
(?<!\w)-?\b\d+(\.\d+)?([eE][-+]?\d+)?\b | # numbers with scientific notation
\bNULL\b | # NULL
\bTRUE\b | # TRUE
\bFALSE\b # FALSE
"""
# replace values with a special token <value>
template = re.sub(pattern, "<value>", sql, flags=re.IGNORECASE | re.VERBOSE)
template = template.lower().replace("\n", " ").strip()
# Replace multiple spaces with a single space
template = re.sub(r'\s+', ' ', template)
return template
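# Illustrative example (hypothetical query):
#   obtain_sql_template("SELECT name FROM users WHERE age > 18 AND city = 'NYC';")
#   -> "select name from users where age > <value> and city = <value>;"
# so queries that differ only in literal values share the same template and are deduplicated.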
def dedup_using_query_template(synthesized_sqls):
unique_templates = set()
deduped_sqls = []
for sql_info in tqdm(synthesized_sqls):
template = obtain_sql_template(sql_info["sql"])
if template not in unique_templates:
unique_templates.add(template)
deduped_sqls.append(sql_info)
return deduped_sqls
def parse_response(response):
pattern = r"```sql\s*(.*?)\s*```"
sql_blocks = re.findall(pattern, response, re.DOTALL)
if sql_blocks:
# Extract the last SQL query in the response text and remove extra whitespace characters
last_sql = sql_blocks[-1].strip()
return last_sql
else:
# print("No SQL blocks found.")
return ""
def obtain_db_id2table_names(results, db_dir):
db_ids = list(set([res["db_id"] for res in results]))
print("len(db_ids):", len(db_ids))
db_id2table_names = dict()
for db_id in db_ids:
results, _ = execute_sql(
"SELECT name FROM sqlite_master WHERE type='table';",
os.path.join(db_dir, db_id, db_id + ".sqlite")
)
table_names = [res[0] for res in results]
db_id2table_names[db_id] = table_names
return db_id2table_names
def load_json_file(file):
dataset = []
with open(file, 'r', encoding='utf-8') as f:
objects = ijson.items(f, 'item')
for obj in tqdm(objects):
dataset.append(obj)
return dataset
if __name__ == "__main__":
synthesized_sqls = []
db_dir = "../database_synthesis/synthetic_sqlite_databases"
llm_responses = load_json_file("./results/sql_synthesis.json")
for llm_response in tqdm(llm_responses):
sql = parse_response(llm_response["response"])
if sql == "":
continue
synthesized_sqls.append(
{
"db_id": llm_response["db_id"][:-3] if llm_response["db_id"].endswith(".db") else llm_response["db_id"],
"sql": sql,
"complexity": llm_response["prompt"].split("Ensure the SQL query matches the ")[1].split(" level, defined as follows:")[0]
}
)
print("original sql num:", len(synthesized_sqls))
# analyze_complexity(synthesized_sqls)
# analyze_advanced_functions(synthesized_sqls)
# remove non-SELECT sqls
synthesized_sqls = filter_select_sqls(synthesized_sqls)
print("sql num after removing non-SELECT sql queries:", len(synthesized_sqls))
# analyze_complexity(synthesized_sqls)
# analyze_advanced_functions(synthesized_sqls)
# remove sqls with syntax errors
synthesized_sqls = filter_executable_sqls(synthesized_sqls, db_dir)
print("sql num after removing syntax-error sqls:", len(synthesized_sqls))
# analyze_complexity(synthesized_sqls)
# analyze_advanced_functions(synthesized_sqls)
# # perform deduplication according to the query plan
# synthesized_sqls = dedup_using_query_plan(synthesized_sqls)
# print("sql num after deduplication (query plan level):", len(synthesized_sqls))
# print(synthesized_sqls[0].keys())
# # analyze_complexity(synthesized_sqls)
# # analyze_advanced_functions(synthesized_sqls)
# remove timeout sqls
no_timeout_synthesized_sqls = mp.Manager().list()
remove_timeout_sqls_parallel(synthesized_sqls, db_dir, 10, 2)
synthesized_sqls = list(no_timeout_synthesized_sqls)
print("sql num after removing timeout sqls:", len(synthesized_sqls))
print(synthesized_sqls[0].keys())
# analyze_complexity(synthesized_sqls)
analyze_column_count(synthesized_sqls)
# analyze_advanced_functions(synthesized_sqls)
# perform deduplication according to the query template
synthesized_sqls = dedup_using_query_template(synthesized_sqls)
print("sql num after deduplication (template level):", len(synthesized_sqls))
# analyze_complexity(synthesized_sqls)
analyze_column_count(synthesized_sqls)
# analyze_advanced_functions(synthesized_sqls)
# analyze the number of used tables
analyze_used_tables_num(
synthesized_sqls,
obtain_db_id2table_names(synthesized_sqls, db_dir)
)
with open("./results/synthetic_sqls.json", "w", encoding="utf-8") as f:
f.write(json.dumps(synthesized_sqls, indent=2, ensure_ascii=False))
\ No newline at end of file
**Task Overview**
Create an executable SQL query based on the provided information.
**Database Schema**
{schema_str}
{sql_function_prompt}
{db_value_prompt}
**SQL Query Complexity**
Ensure the SQL query matches the {complexity} level, defined as follows:
{criterion}
**Output Format Requirements**
Enclose the SQL query in a code block:
```sql
-- Your SQL query here
```
**SQL Query Requirements**
1. Use the syntax specific to the {db_engine} database engine.
2. Incorporate advanced functions if appropriate, but they are not mandatory.
3. Address real-world data analysis needs. Avoid trivial or nonsensical queries.
4. (Very important) Ensure the final SQL query selects {column_count} columns.
**Answer**
Let's proceed step by step.
import argparse
import json
import re
from tqdm import tqdm
import openai
def parse_response(response):
pattern = r"```sql\s*(.*?)\s*```"
sql_blocks = re.findall(pattern, response, re.DOTALL)
if sql_blocks:
# Extract the last SQL query in the response text and remove extra whitespace characters
last_sql = sql_blocks[-1].strip()
return last_sql
else:
print("No SQL blocks found.")
return ""
def llm_inference(model, base_url, prompts, db_ids):
"""
Generates responses using an LLM for given prompts.
Args:
        model: The LLM to use for generating responses.
        base_url (str): Base URL of the OpenAI-compatible API endpoint.
prompts (list of str): A list of prompts for the model.
db_ids (list of str): A list of database IDs corresponding to each prompt.
Returns:
list of dict: A list of dictionaries containing the prompt, db_id, and generated response.
"""
client = openai.OpenAI(
base_url=base_url,
api_key="EMPTY"
)
    # Query the OpenAI-compatible endpoint once per prompt; each response is the raw completion text.
responses = []
for prompt in prompts:
response = client.chat.completions.create(
model=model,
messages=[{"role":"user", "content": prompt}],
max_tokens=4196,
temperature=0.2
)
responses.append(response.choices[0].message.content.strip())
results = [
{
"prompt": prompt,
"db_id": db_id,
"response": response
}
for prompt, db_id, response in zip(prompts, db_ids, responses)
]
return results
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--model", type = str)
parser.add_argument("--base_url", type=str)
parser.add_argument("--nums", type=int, default=None)
opt = parser.parse_args()
print(opt)
    with open("./prompts/sql_synthesis_prompts.json", "r", encoding="utf-8") as f:
        input_dataset = json.load(f)[:opt.nums]
output_file = "./results/sql_synthesis.json"
db_ids = [data["db_id"] for data in input_dataset]
prompts = [data["prompt"] for data in input_dataset]
results = llm_inference(opt.model, opt.base_url, prompts, db_ids)
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(json.dumps(results, indent=2, ensure_ascii=False))
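# Example invocation (illustrative; script name, model name, and endpoint are placeholders):
#   python sql_synthesis.py --model <model_name> --base_url http://localhost:8000/v1 --nums 100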
Task Overview:
You are a data science expert. Below, you are provided with a database schema and a natural language question. Your task is to understand the schema and generate a valid SQL query to answer the question.
Database Engine:
SQLite
Database Schema:
CREATE TABLE cards (
id integer, -- unique id number identifying the cards, example: [41138, 1349]
artist text, -- example: ['Pete Venters', 'Volkan Baǵa']
asciiName text, -- example: ['El-Hajjaj', 'Junun Efreet']
availability text, -- example: ['mtgo,paper', 'paper']
borderColor text, -- example: ['black', 'white']
cardKingdomFoilId text, -- example: ['123094', '123095']
cardKingdomId text, -- example: ['122719', '122720']
colorIdentity text, -- example: ['W', 'B']
colorIndicator text, -- example: ['U', 'G']
colors text, -- example: ['W', 'B']
convertedManaCost real, -- example: [7.0, 5.0]
duelDeck text, -- example: ['a', 'b']
edhrecRank integer, -- rec Rank in edh, example: [15650, 12702]
faceConvertedManaCost real, -- example: [4.0, 5.0]
faceName text, -- example: ['Dusk', 'Dawn']
flavorName text, -- example: ['Godzilla, King of the Monsters', 'King Caesar, Ancient Guardian']
flavorText text, -- example: ['Every tear shed is a drop of immortality', 'The perfect antidote for a tightly packe']
frameEffects text, -- example: ['legendary', 'nyxtouched']
frameVersion text, -- example: ['2003', '1993']
hand text, -- example: ['1', '0']
hasAlternativeDeckLimit integer, -- example: [0, 1]
hasContentWarning integer, -- example: [0, 1]
hasFoil integer, -- example: [0, 1]
hasNonFoil integer, -- example: [1, 0]
isAlternative integer, -- example: [0, 1]
isFullArt integer, -- example: [0, 1]
isOnlineOnly integer, -- example: [0, 1]
isOversized integer, -- example: [0, 1]
isPromo integer, -- is Promotion, example: [0, 1]
isReprint integer, -- example: [1, 0]
isReserved integer, -- example: [0, 1]
isStarter integer, -- example: [0, 1]
isStorySpotlight integer, -- example: [0, 1]
isTextless integer, -- example: [0, 1]
isTimeshifted integer, -- example: [0, 1]
keywords text, -- example: ['First strike', 'Flying']
layout text, -- example: ['normal', 'aftermath']
leadershipSkills text, -- example: ["{'brawl': False, 'commander': True, 'oat", "{'brawl': False, 'commander': False, 'oa"]
life text, -- example: ['-5', '-1']
loyalty text, -- example: ['6', '3']
manaCost text, -- example: ['{5}{W}{W}', '{4}{W}']
mcmId text, -- example: ['16165', '16166']
mcmMetaId text, -- example: ['156', '176']
mtgArenaId text, -- example: ['74983', '74986']
mtgjsonV4Id text, -- example: ['ad41be73-582f-58ed-abd4-a88c1f616ac3', '9eb2e54c-a12b-5e88-a9c0-d8c84c52d59c']
mtgoFoilId text, -- example: ['27501', '26993']
mtgoId text, -- example: ['27500', '26992']
multiverseId text, -- example: ['130550', '129465']
name text, -- example: ["Ancestor's Chosen", 'Angel of Mercy']
number text, -- example: ['1', '2']
originalReleaseDate text, -- example: ['2012/12/1', '2006/12/1']
originalText text, -- example: ['First strike (This creature deals combat', "Flying (This creature can't be blocked e"]
originalType text, -- example: ['Creature - Human Cleric', 'Creature - Angel']
otherFaceIds text, -- example: ['87f0062a-8321-5c16-960e-a12ce1df5839', 'f9f10d34-071c-57a6-b58c-7553abad5c20']
power text, -- example: ['4', '3']
printings text, -- example: ['10E,JUD,UMA', '10E,8ED,9ED,DDC,DVD,IMA,INV,JMP,MB1,P02,']
promoTypes text, -- example: ['boxtopper,boosterfun', 'boosterfun']
purchaseUrls text, -- example: ["{'cardKingdom': 'https://mtgjson.com/lin"]
rarity text, -- example: ['uncommon', 'common']
scryfallId text, -- example: ['7a5cd03c-4227-4551-aa4b-7d119f0468b5', '8f7980d4-da43-4d6d-ad16-14b8a34ae91d']
scryfallIllustrationId text, -- example: ['be2f7173-c8b7-4172-a388-9b2c6b3c16e5', 'e4d6c53f-e936-4be8-8b70-47c2be863b20']
scryfallOracleId text, -- example: ['fc2ccab7-cab1-4463-b73d-898070136d74', 'a2daaf32-dbfe-4618-892e-0da24f63a44a']
setCode text, -- example: ['10E', '2ED']
side text, -- example: ['a', 'b']
subtypes text, -- example: ['Human,Cleric', 'Angel']
supertypes text, -- example: ['Legendary', 'Basic']
tcgplayerProductId text, -- example: ['15032', '15033']
text text, -- example: ['First strike (This creature deals combat', 'Flying\nWhen Angel of Mercy enters the ba']
toughness text, -- example: ['4', '3']
type text, -- example: ['Creature — Human Cleric', 'Creature — Angel']
types text, -- example: ['Creature', 'Instant']
uuid text, -- example: ['00010d56-fe38-5e35-8aed-518019aa36a5', '0001e0d0-2dcd-5640-aadc-a84765cf5fc9']
variations text, -- example: ['b7c19924-b4bf-56fc-aa73-f586e940bd42', '8fd4e2eb-3eb4-50ea-856b-ef638fa47f8a']
watermark text, -- example: ['set', 'set (HOU)', 'set (LGN)']
PRIMARY KEY (id)
);
CREATE TABLE foreign_data (
id integer, -- example: [1, 2]
flavorText text, -- example: ['„Es ist der Wille aller, und meine Hand,', '"La voluntad de todos, realizada por mi ']
`language` text, -- example: ['Italian', 'German', 'Spanish']
multiverseid integer, -- example: [148411, 150317]
name text, -- example: ['Ausgewählter der Ahnfrau', 'Elegido de la Antepasada']
text text, -- example: ['Erstschlag (Diese Kreatur fügt Kampfscha', 'Daña primero. (Esta criatura hace daño d']
type text, -- example: ['Kreatur — Mensch, Kleriker', 'Criatura — Clérigo humano']
uuid text, -- example: ['5f8287b1-5bb6-5f4c-ad17-316a40d5bb0c', '57aaebc1-850c-503d-9f6e-bb8d00d8bf7c']
PRIMARY KEY (id),
CONSTRAINT fk_foreign_data_uuid FOREIGN KEY (uuid) REFERENCES cards (uuid)
);
CREATE TABLE legalities (
id integer, -- example: [1, 2]
format text, -- example: ['commander', 'duel']
status text, -- example: ['Legal', 'Banned']
uuid text, -- example: ['5f8287b1-5bb6-5f4c-ad17-316a40d5bb0c', '57aaebc1-850c-503d-9f6e-bb8d00d8bf7c']
PRIMARY KEY (id),
CONSTRAINT fk_legalities_uuid FOREIGN KEY (uuid) REFERENCES cards (uuid)
);
CREATE TABLE sets (
id integer, -- example: [1, 2]
baseSetSize integer, -- example: [383, 302]
block text, -- example: ['Core Set', 'Mirrodin']
booster text, -- example: ["{'default': {'boosters': [{'contents': {"]
code text, -- example: ['10E', '2ED']
isFoilOnly integer, -- example: [0, 1]
isForeignOnly integer, -- example: [0, 1]
isNonFoilOnly integer, -- example: [0, 1]
isOnlineOnly integer, -- example: [0, 1]
isPartialPreview integer, -- example: [0, 1]
keyruneCode text, -- example: ['10E', '2ED']
mcmId integer, -- magic card market id, example: [74, 3204]
mcmIdExtras integer, -- magic card market ID Extras, example: [3209, 3459]
mcmName text, -- magic card market name, example: ['Tenth Edition', 'Double Masters']
mtgoCode text, -- magic the gathering online code, example: ['10E', '2XM']
name text, -- example: ['Tenth Edition', 'Unlimited Edition']
parentCode text, -- example: ['JMP', 'MH1']
releaseDate date, -- example: ['2007-07-13', '1993-12-01']
tcgplayerGroupId integer, -- example: [1, 115]
totalSetSize integer, -- example: [508, 302]
type text, -- example: ['core', 'masters']
PRIMARY KEY (id)
);
CREATE TABLE set_translations (
id integer, -- example: [1, 2]
`language` text, -- example: ['Italian', 'Chinese Simplified', 'Chinese Traditional']
setCode text, -- example: ['10E', '4ED']
translation text, -- example: ['核心系列第十版', 'Dixième édition']
PRIMARY KEY (id),
CONSTRAINT fk_set_translations_setcode FOREIGN KEY (setCode) REFERENCES sets (code)
);
CREATE TABLE rulings (
id integer, -- example: [1, 2]
`date` date, -- example: ['2007-07-15', '2007-02-01']
text text, -- example: ['You draw the card when Bandage resolves,', 'If you double a negative life total, you']
uuid text, -- example: ['6d268c95-c176-5766-9a46-c14f739aba1c', '56f4935b-f6c5-59b9-88bf-9bcce20247ce']
PRIMARY KEY (id),
CONSTRAINT fk_rulings_uuid FOREIGN KEY (uuid) REFERENCES cards (uuid)
);
This schema describes the database's structure, including tables, columns, primary keys, foreign keys, and any relevant relationships or constraints.
Question:
Italian translation refers to language = 'Italian'; have a translation means translation is not null; base set number of under 100 refers to baseSetSize < 100
Among the sets of cards that have an Italian translation, how many of them have a base set number of under 100?
Instructions:
- Make sure you only output the information that is asked in the question. If the question asks for a specific column, make sure to only include that column in the SELECT clause, nothing more.
- The generated query should return all of the information asked in the question without any missing or extra information.
- Before generating the final SQL query, please think through the steps of how to write the query.
Output Format:
In your answer, please enclose the generated SQL query in a code block:
```sql
-- Your SQL query
```
Take a deep breath and think step by step to find the correct SQL query.
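For illustration only, one query consistent with the schema and hints above (an example sketch, not necessarily the reference answer) is:
```sql
SELECT COUNT(DISTINCT T1.id)
FROM sets AS T1
JOIN set_translations AS T2 ON T1.code = T2.setCode
WHERE T2.language = 'Italian'
  AND T2.translation IS NOT NULL
  AND T1.baseSetSize < 100;
```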
Task Overview:
You are a data science expert. Below, you are provided with a database schema and a natural language question. Your task is to understand the schema and generate a valid SQL query to answer the question.
Database Engine:
SQLite
Database Schema:
CREATE TABLE continents (
ContId number, -- example: [1, 2]
Continent text, -- example: ['america', 'europe']
PRIMARY KEY (ContId)
);
CREATE TABLE countries (
CountryId number, -- example: [1, 2]
CountryName text, -- example: ['usa', 'germany']
Continent number, -- example: [1, 2]
PRIMARY KEY (CountryId),
CONSTRAINT fk_countries_continent FOREIGN KEY (Continent) REFERENCES continents (ContId)
);
CREATE TABLE car_makers (
Id number, -- example: [1, 2]
Maker text, -- example: ['amc', 'volkswagen']
FullName text, -- example: ['American Motor Company', 'Volkswagen']
Country text, -- example: ['1', '2']
PRIMARY KEY (Id),
CONSTRAINT fk_car_makers_country FOREIGN KEY (Country) REFERENCES countries (CountryId)
);
CREATE TABLE model_list (
ModelId number, -- example: [1, 2]
Maker number, -- example: [1, 2]
Model text, -- example: ['amc', 'audi']
PRIMARY KEY (ModelId),
CONSTRAINT fk_model_list_maker FOREIGN KEY (Maker) REFERENCES car_makers (Id)
);
CREATE TABLE car_names (
MakeId number, -- example: [1, 2]
Model text, -- example: ['chevrolet', 'buick']
Make text, -- example: ['chevrolet chevelle malibu', 'buick skylark 320']
PRIMARY KEY (MakeId),
CONSTRAINT fk_car_names_model FOREIGN KEY (Model) REFERENCES model_list (Model)
);
CREATE TABLE cars_data (
Id number, -- example: [1, 2]
MPG text, -- example: ['18', '15']
Cylinders number, -- example: [8, 4]
Edispl number, -- example: [307.0, 350.0]
Horsepower text, -- example: ['130', '165']
Weight number, -- example: [3504, 3693]
Accelerate number, -- example: [12.0, 11.5]
`Year` number, -- example: [1970, 1971]
PRIMARY KEY (Id),
CONSTRAINT fk_cars_data_id FOREIGN KEY (Id) REFERENCES car_names (MakeId)
);
This schema describes the database's structure, including tables, columns, primary keys, foreign keys, and any relevant relationships or constraints.
Question:
How many car makers are there in each continent? List the continent name and the count.
Instructions:
- Make sure you only output the information that is asked in the question. If the question asks for a specific column, make sure to only include that column in the SELECT clause, nothing more.
- The generated query should return all of the information asked in the question without any missing or extra information.
- Before generating the final SQL query, please think through the steps of how to write the query.
Output Format:
In your answer, please enclose the generated SQL query in a code block:
```sql
-- Your SQL query
```
Take a deep breath and think step by step to find the correct SQL query.
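For illustration only, one query consistent with the schema above (an example sketch, not necessarily the reference answer) is:
```sql
SELECT T1.Continent, COUNT(*)
FROM continents AS T1
JOIN countries AS T2 ON T1.ContId = T2.Continent
JOIN car_makers AS T3 ON T2.CountryId = T3.Country
GROUP BY T1.Continent;
```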
Task Overview:
You are a data science expert. Below, you are provided with a database schema and a natural language question. Your task is to understand the schema and generate a valid SQL query to answer the question.
Database Engine:
SQLite
Database Schema:
CREATE TABLE Player_Attributes (
id integer, -- example: [1, 2]
player_fifa_api_id integer, -- player federation international football association api id, example: [218353, 189615]
player_api_id integer, -- example: [505942, 155782]
`date` text, -- example: ['2016-02-18 00:00:00', '2015-11-19 00:00:00']
overall_rating integer, -- example: [67, 62]
potential integer, -- example: [71, 66]
preferred_foot text, -- example: ['right', 'left']
attacking_work_rate text, -- example: ['medium', 'high']
defensive_work_rate text, -- example: ['medium', 'high']
crossing integer, -- example: [49, 48]
finishing integer, -- example: [44, 43]
heading_accuracy integer, -- example: [71, 70]
short_passing integer, -- example: [61, 60]
volleys integer, -- example: [44, 43]
dribbling integer, -- example: [51, 50]
curve integer, -- example: [45, 44]
free_kick_accuracy integer, -- example: [39, 38]
long_passing integer, -- example: [64, 63]
ball_control integer, -- example: [49, 48]
acceleration integer, -- example: [60, 79]
sprint_speed integer, -- example: [64, 78]
agility integer, -- example: [59, 78]
reactions integer, -- example: [47, 46]
balance integer, -- example: [65, 90]
shot_power integer, -- example: [55, 54]
jumping integer, -- example: [58, 85]
stamina integer, -- example: [54, 79]
strength integer, -- example: [76, 56]
long_shots integer, -- example: [35, 34]
aggression integer, -- example: [71, 63]
interceptions integer, -- example: [70, 41]
positioning integer, -- example: [45, 44]
vision integer, -- example: [54, 53]
penalties integer, -- example: [48, 47]
marking integer, -- example: [65, 62]
standing_tackle integer, -- example: [69, 66]
sliding_tackle integer, -- example: [69, 66]
gk_diving integer, -- goalkeep diving, example: [6, 5]
gk_handling integer, -- goalkeep handling, example: [11, 10]
gk_kicking integer, -- goalkeep kicking, example: [10, 9]
gk_positioning integer, -- goalkeep positioning, example: [8, 7]
gk_reflexes integer, -- goalkeep reflexes, example: [8, 7]
PRIMARY KEY (id),
CONSTRAINT fk_player_attributes_player_fifa_api_id FOREIGN KEY (player_fifa_api_id) REFERENCES Player (player_fifa_api_id),
CONSTRAINT fk_player_attributes_player_api_id FOREIGN KEY (player_api_id) REFERENCES Player (player_api_id)
);
CREATE TABLE Player (
id integer, -- example: [3879, 401]
player_api_id integer, -- example: [2625, 2752]
player_name text, -- example: ['Aaron Mooy', 'Aaron Appindangoye', 'Aaron Cresswell']
player_fifa_api_id integer, -- player federation international football association api id, example: [2, 6]
birthday text, -- example: ['1992-02-29 00:00:00', '1989-12-15 00:00:00']
height integer, -- example: [182.88, 170.18]
weight integer, -- example: [187, 146]
PRIMARY KEY (id)
);
CREATE TABLE League (
id integer, -- example: [1, 1729]
country_id integer, -- example: [1, 1729]
name text, -- example: ['Belgium Jupiler League', 'England Premier League']
PRIMARY KEY (id),
CONSTRAINT fk_league_country_id FOREIGN KEY (country_id) REFERENCES Country (id)
);
CREATE TABLE Country (
id integer, -- example: [1, 1729]
name text, -- example: ['Belgium', 'England']
PRIMARY KEY (id)
);
CREATE TABLE Team (
id integer, -- example: [31446, 1513]
team_api_id integer, -- example: [1601, 1773]
team_fifa_api_id integer, -- team federation international football association api id, example: [673, 675]
team_long_name text, -- example: ['KRC Genk', 'Beerschot AC']
team_short_name text, -- example: ['GEN', 'BAC']
PRIMARY KEY (id)
);
CREATE TABLE Team_Attributes (
id integer, -- example: [1, 2]
team_fifa_api_id integer, -- team federation international football association api id, example: [434, 77]
team_api_id integer, -- example: [9930, 8485]
`date` text, -- example: ['2010-02-22 00:00:00', '2014-09-19 00:00:00']
buildUpPlaySpeed integer, -- example: [60, 52]
buildUpPlaySpeedClass text, -- example: ['Balanced', 'Fast']
buildUpPlayDribbling integer, -- example: [48, 41]
buildUpPlayDribblingClass text, -- example: ['Little', 'Normal']
buildUpPlayPassing integer, -- example: [50, 56]
buildUpPlayPassingClass text, -- example: ['Mixed', 'Long']
buildUpPlayPositioningClass text, -- example: ['Organised', 'Free Form']
chanceCreationPassing integer, -- example: [60, 54]
chanceCreationPassingClass text, -- example: ['Normal', 'Risky']
chanceCreationCrossing integer, -- example: [65, 63]
chanceCreationCrossingClass text, -- example: ['Normal', 'Lots']
chanceCreationShooting integer, -- example: [55, 64]
chanceCreationShootingClass text, -- example: ['Normal', 'Lots']
chanceCreationPositioningClass text, -- example: ['Organised', 'Free Form']
defencePressure integer, -- example: [50, 47]
defencePressureClass text, -- example: ['Medium', 'Deep']
defenceAggression integer, -- example: [55, 44]
defenceAggressionClass text, -- example: ['Press', 'Double']
defenceTeamWidth integer, -- example: [45, 54]
defenceTeamWidthClass text, -- example: ['Normal', 'Wide']
defenceDefenderLineClass text, -- example: ['Cover', 'Offside Trap']
PRIMARY KEY (id),
CONSTRAINT fk_team_attributes_team_fifa_api_id FOREIGN KEY (team_fifa_api_id) REFERENCES Team (team_fifa_api_id),
CONSTRAINT fk_team_attributes_team_api_id FOREIGN KEY (team_api_id) REFERENCES Team (team_api_id)
);
CREATE TABLE `Match` (
id integer, -- example: [4769, 4770]
country_id integer, -- example: [1, 1729]
league_id integer, -- example: [1, 1729]
season text, -- example: ['2008/2009', '2009/2010']
stage integer, -- example: [1, 10]
`date` text, -- example: ['2008-08-17 00:00:00', '2008-08-16 00:00:00']
match_api_id integer, -- example: [483129, 483130]
home_team_api_id integer, -- example: [9987, 10000]
away_team_api_id integer, -- example: [9993, 9994]
home_team_goal integer, -- example: [1, 0]
away_team_goal integer, -- example: [1, 0]
home_player_X1 integer, -- example: [1, 2]
home_player_X2 integer, -- example: [2, 4]
home_player_X3 integer, -- example: [4, 6]
home_player_X4 integer, -- example: [6, 8]
home_player_X5 integer, -- example: [8, 6]
home_player_X6 integer, -- example: [2, 6]
home_player_X7 integer, -- example: [4, 8]
home_player_X8 integer, -- example: [6, 2]
home_player_X9 integer, -- example: [8, 4]
home_player_X10 integer, -- example: [4, 6]
home_player_X11 integer, -- example: [6, 4]
away_player_X1 integer, -- example: [1, 2]
away_player_X2 integer, -- example: [2, 4]
away_player_X3 integer, -- example: [4, 6]
away_player_X4 integer, -- example: [6, 8]
away_player_X5 integer, -- example: [8, 6]
away_player_X6 integer, -- example: [2, 4]
away_player_X7 integer, -- example: [4, 6]
away_player_X8 integer, -- example: [6, 8]
away_player_X9 integer, -- example: [8, 2]
away_player_X10 integer, -- example: [4, 6]
away_player_X11 integer, -- example: [6, 4]
home_player_Y1 integer, -- example: [1, 3]
home_player_Y2 integer, -- example: [3, 0]
home_player_Y3 integer, -- example: [3, 5]
home_player_Y4 integer, -- example: [3, 5]
home_player_Y5 integer, -- example: [3, 7]
home_player_Y6 integer, -- example: [7, 3]
home_player_Y7 integer, -- example: [7, 6]
home_player_Y8 integer, -- example: [7, 8]
home_player_Y9 integer, -- example: [7, 10]
home_player_Y10 integer, -- example: [10, 7]
home_player_Y11 integer, -- example: [10, 11]
away_player_Y1 integer, -- example: [1, 3]
away_player_Y2 integer, -- example: [3]
away_player_Y3 integer, -- example: [3, 7]
away_player_Y4 integer, -- example: [3, 5]
away_player_Y5 integer, -- example: [3, 7]
away_player_Y6 integer, -- example: [7, 3]
away_player_Y7 integer, -- example: [7, 6]
away_player_Y8 integer, -- example: [7, 8]
away_player_Y9 integer, -- example: [7, 10]
away_player_Y10 integer, -- example: [10, 7]
away_player_Y11 integer, -- example: [10, 11]
home_player_1 integer, -- example: [39890, 38327]
home_player_2 integer, -- example: [67950, 39580]
home_player_3 integer, -- example: [38788, 67958]
home_player_4 integer, -- example: [38312, 67959]
home_player_5 integer, -- example: [26235, 37112]
home_player_6 integer, -- example: [36393, 46004]
home_player_7 integer, -- example: [148286, 164732]
home_player_8 integer, -- example: [67898, 39631]
home_player_9 integer, -- example: [26916, 164352]
home_player_10 integer, -- example: [38801, 38423]
home_player_11 integer, -- example: [94289, 26502]
away_player_1 integer, -- example: [34480, 37937]
away_player_2 integer, -- example: [38388, 38293]
away_player_3 integer, -- example: [26458, 148313]
away_player_4 integer, -- example: [13423, 104411]
away_player_5 integer, -- example: [38389, 148314]
away_player_6 integer, -- example: [38798, 37202]
away_player_7 integer, -- example: [30949, 43158]
away_player_8 integer, -- example: [38253, 9307]
away_player_9 integer, -- example: [106013, 42153]
away_player_10 integer, -- example: [38383, 32690]
away_player_11 integer, -- example: [46552, 38782]
goal text, -- example: ['<goal><value><comment>n</comment><stats>']
shoton text, -- example: ['<shoton><value><stats><blocked>1</blocke']
shotoff text, -- example: ['<shotoff><value><stats><shotoff>1</shoto']
foulcommit text, -- example: ['<foulcommit><value><stats><foulscommitte']
card text, -- example: ['<card><value><comment>y</comment><stats>', '<card />']
`cross` text, -- example: ['<cross><value><stats><crosses>1</crosses']
corner text, -- example: ['<corner><value><stats><corners>1</corner']
possession text, -- example: ['<possession><value><comment>56</comment>', '<possession><value><comment>65</comment>']
B365H real, -- example: [1.73, 1.95]
B365D real, -- example: [3.4, 3.2]
B365A real, -- example: [5.0, 3.6]
BWH real, -- example: [1.75, 1.8]
BWD real, -- example: [3.35, 3.3]
BWA real, -- example: [4.2, 3.95]
IWH real, -- example: [1.85, 1.9]
IWD real, -- example: [3.2, 3.1]
IWA real, -- example: [3.5, 2.3]
LBH real, -- example: [1.8, 1.9]
LBD real, -- example: [3.3, 3.2]
LBA real, -- example: [3.75, 3.5]
PSH real, -- example: [5.1, 2.48]
PSD real, -- example: [3.82, 3.52]
PSA real, -- example: [1.76, 2.96]
WHH real, -- example: [1.7, 1.83]
WHD real, -- example: [3.3, 3.25]
WHA real, -- example: [4.33, 3.6]
SJH real, -- example: [1.9, 1.95]
SJD real, -- example: [3.3, 4.0]
SJA real, -- example: [4.0, 3.8]
VCH real, -- example: [1.65, 2.0]
VCD real, -- example: [3.4, 3.25]
VCA real, -- example: [4.5, 3.25]
GBH real, -- example: [1.78, 1.85]
GBD real, -- example: [3.25, 3.2]
GBA real, -- example: [4.0, 3.75]
BSH real, -- example: [1.73, 1.91]
BSD real, -- example: [3.4, 3.25]
BSA real, -- example: [4.2, 3.6]
PRIMARY KEY (id),
CONSTRAINT fk_match_home_team_api_id FOREIGN KEY (home_team_api_id) REFERENCES Team (team_api_id),
CONSTRAINT fk_match_away_team_api_id FOREIGN KEY (away_team_api_id) REFERENCES Team (team_api_id),
CONSTRAINT fk_match_home_player_1 FOREIGN KEY (home_player_1) REFERENCES Player (player_api_id),
CONSTRAINT fk_match_home_player_2 FOREIGN KEY (home_player_2) REFERENCES Player (player_api_id),
CONSTRAINT fk_match_home_player_3 FOREIGN KEY (home_player_3) REFERENCES Player (player_api_id),
CONSTRAINT fk_match_home_player_4 FOREIGN KEY (home_player_4) REFERENCES Player (player_api_id),
CONSTRAINT fk_match_home_player_5 FOREIGN KEY (home_player_5) REFERENCES Player (player_api_id),
CONSTRAINT fk_match_home_player_6 FOREIGN KEY (home_player_6) REFERENCES Player (player_api_id),
CONSTRAINT fk_match_home_player_7 FOREIGN KEY (home_player_7) REFERENCES Player (player_api_id),
CONSTRAINT fk_match_home_player_8 FOREIGN KEY (home_player_8) REFERENCES Player (player_api_id),
CONSTRAINT fk_match_home_player_9 FOREIGN KEY (home_player_9) REFERENCES Player (player_api_id),
CONSTRAINT fk_match_home_player_10 FOREIGN KEY (home_player_10) REFERENCES Player (player_api_id),
CONSTRAINT fk_match_home_player_11 FOREIGN KEY (home_player_11) REFERENCES Player (player_api_id),
CONSTRAINT fk_match_away_player_1 FOREIGN KEY (away_player_1) REFERENCES Player (player_api_id),
CONSTRAINT fk_match_away_player_2 FOREIGN KEY (away_player_2) REFERENCES Player (player_api_id),
CONSTRAINT fk_match_away_player_3 FOREIGN KEY (away_player_3) REFERENCES Player (player_api_id),
CONSTRAINT fk_match_away_player_4 FOREIGN KEY (away_player_4) REFERENCES Player (player_api_id),
CONSTRAINT fk_match_away_player_5 FOREIGN KEY (away_player_5) REFERENCES Player (player_api_id),
CONSTRAINT fk_match_away_player_6 FOREIGN KEY (away_player_6) REFERENCES Player (player_api_id),
CONSTRAINT fk_match_away_player_7 FOREIGN KEY (away_player_7) REFERENCES Player (player_api_id),
CONSTRAINT fk_match_away_player_8 FOREIGN KEY (away_player_8) REFERENCES Player (player_api_id),
CONSTRAINT fk_match_away_player_9 FOREIGN KEY (away_player_9) REFERENCES Player (player_api_id),
CONSTRAINT fk_match_away_player_10 FOREIGN KEY (away_player_10) REFERENCES Player (player_api_id),
CONSTRAINT fk_match_away_player_11 FOREIGN KEY (away_player_11) REFERENCES Player (player_api_id)
);
This schema describes the database's structure, including tables, columns, primary keys, foreign keys, and any relevant relationships or constraints.
Question:
Aaron Mooy refers to player_name = 'Aaron Mooy'; on 2016/2/4 refers to date LIKE '2016-02-04%';
What was the overall rating for Aaron Mooy on 2016/2/4?
Instructions:
- Make sure you only output the information that is asked in the question. If the question asks for a specific column, make sure to only include that column in the SELECT clause, nothing more.
- The generated query should return all of the information asked in the question without any missing or extra information.
- Before generating the final SQL query, please think through the steps of how to write the query.
Output Format:
In your answer, please enclose the generated SQL query in a code block:
```sql
-- Your SQL query
```
Take a deep breath and think step by step to find the correct SQL query.
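For illustration only, one query consistent with the schema and hints above (an example sketch, not necessarily the reference answer) is:
```sql
SELECT T2.overall_rating
FROM Player AS T1
JOIN Player_Attributes AS T2 ON T1.player_api_id = T2.player_api_id
WHERE T1.player_name = 'Aaron Mooy'
  AND T2.`date` LIKE '2016-02-04%';
```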