# Dataset Preparation
# Number of questions requested per documentation snippet; substituted into
# the "{}" placeholder of QUESTION_GENERATION_SYSTEM_PROMPT below.
N = 3

# Glob-style pattern(s) for the reference documents.
# NOTE(review): the code consuming this list is not visible here -- presumably
# it is used to locate the API reference text files; confirm against caller.
API_REFERENCE_PATHS = [
    "**/*.txt",
]

# System prompt for the question-generation step. The single "{}" placeholder
# is filled with N, so the text must not contain any other literal braces.
QUESTION_GENERATION_SYSTEM_PROMPT = (
    "You are Lumo, a helpful AI assistant. "
    "Your task is to help a user understand everything about Solana, "
    "from fundamentals, to coding, or anything at all. "
    "Carefully examine the function documentation snippet and generate {} "
    "questions a medium to experienced Solana user could ask. "
    "Questions must be answerable from the information in the snippet. "
    "Do not assume anything about Solana that is not discussed in the "
    "snippet, make sure you include complete code contents in your answers "
    "when it might add value. "
    "If the snippet is too short or contains too little information, "
    "output an empty JSON array."
).format(N)

# System prompt for the question-answering step. The persona sentence is kept
# identical to the one in QUESTION_GENERATION_SYSTEM_PROMPT so both steps
# present the same assistant identity.
QUESTION_ANSWERING_SYSTEM_PROMPT = (
    "You are Lumo, a helpful AI assistant. "
    "Your task is to help a user understand everything about Solana, "
    "from fundamentals, to coding, or anything at all. "
    "Carefully examine the function documentation and generate an "
    "explanatory response based on the user's question which showcases "
    "usage and examples. "
    "Do not assume anything about Solana that is not discussed in the "
    "reference documentation snippet, make sure you include complete code "
    "contents in your answers when it might add value."
)
def chunk_text(text: str, chunk_size: int = 2000, overlap: int = 200) -> list:
    """Split text into overlapping chunks.

    Each chunk holds at most ``chunk_size`` characters, and every chunk after
    the first begins ``overlap`` characters before the end of the previous
    one. The final chunk may be shorter than ``chunk_size``.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared between consecutive chunks;
            must satisfy ``0 <= overlap < chunk_size``.

    Returns:
        The chunks in order of appearance; an empty list for empty ``text``.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        # overlap >= chunk_size would keep the window from ever advancing
        # (an endless loop); a negative overlap would silently skip text
        # between consecutive chunks.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    total = len(text)
    stride = chunk_size - overlap  # distance between consecutive chunk starts
    chunks = []
    for start in range(0, total, stride):
        end = start + chunk_size
        chunks.append(text[start:end])  # slicing clamps `end` to len(text)
        if end >= total:
            # This chunk already reaches the end of the text; stopping here
            # avoids emitting a trailing chunk made up only of overlap.
            break
    return chunks