@lap v0.3
# Machine-readable API spec for the Text Generation Inference HTTP API.
# Each @endpoint block describes one API call: HTTP method and path,
# required/optional request fields, the success (@returns) schema, and
# the possible error status codes (@errors).
@api Text Generation Inference
@version 3.3.6-dev0
@endpoints 12
@toc root(1), chat_tokenize(1), generate(1), generate_stream(1), health(1), info(1), invocations(1), metrics(1), tokenize(1), chat(1), completions(1), models(1)

@group root
@endpoint POST /
@required {inputs: str}
@optional {parameters: map{adapter_id: str, best_of: int, decoder_input_details: bool, details: bool, do_sample: bool, frequency_penalty: num(float), grammar: any, max_new_tokens: int(int32), repetition_penalty: num(float), return_full_text: bool, seed: int(int64), stop: [str], temperature: num(float), top_k: int(int32), top_n_tokens: int(int32), top_p: num(float), truncate: int, typical_p: num(float), watermark: bool}, stream: bool=false}
@returns(200)
@errors {422, 424, 429, 500}

@endgroup

@group chat_tokenize
@endpoint POST /chat_tokenize
@required {messages: [any]}
@optional {frequency_penalty: num(float), logit_bias: [num(float)], logprobs: bool, max_tokens: int(int32)=1024, model: str, n: int(int32), presence_penalty: num(float), response_format: any=null, seed: int(int64), stop: [str], stream: bool, stream_options: any, temperature: num(float), tool_choice: any=auto, tool_prompt: str, tools: [map{function!: map, type!: str}], top_logprobs: int(int32), top_p: num(float)}
@returns(200) {templated_text: str, tokenize_response: [map]}
@errors {404}

@endgroup

@group generate
@endpoint POST /generate
@required {inputs: str}
@optional {parameters: map{adapter_id: str, best_of: int, decoder_input_details: bool, details: bool, do_sample: bool, frequency_penalty: num(float), grammar: any, max_new_tokens: int(int32), repetition_penalty: num(float), return_full_text: bool, seed: int(int64), stop: [str], temperature: num(float), top_k: int(int32), top_n_tokens: int(int32), top_p: num(float), truncate: int, typical_p: num(float), watermark: bool}}
@returns(200) {details: any?, generated_text: str}
@errors {422, 424, 429, 500}

@endgroup

@group generate_stream
@endpoint POST /generate_stream
@required {inputs: str}
@optional {parameters: map{adapter_id: str, best_of: int, decoder_input_details: bool, details: bool, do_sample: bool, frequency_penalty: num(float), grammar: any, max_new_tokens: int(int32), repetition_penalty: num(float), return_full_text: bool, seed: int(int64), stop: [str], temperature: num(float), top_k: int(int32), top_n_tokens: int(int32), top_p: num(float), truncate: int, typical_p: num(float), watermark: bool}}
@returns(200)
@errors {422, 424, 429, 500}

@endgroup

@group health
@endpoint GET /health
@returns(200)
@errors {503}

@endgroup

@group info
@endpoint GET /info
@returns(200) {docker_label: str?, max_best_of: int, max_client_batch_size: int, max_concurrent_requests: int, max_input_tokens: int, max_stop_sequences: int, max_total_tokens: int, model_id: str, model_pipeline_tag: str?, model_sha: str?, router: str, sha: str?, validation_workers: int, version: str}

@endgroup

@group invocations
@endpoint POST /invocations
@returns(200)
@errors {422, 424, 429, 500}

@endgroup

@group metrics
@endpoint GET /metrics
@returns(200)

@endgroup

@group tokenize
@endpoint POST /tokenize
@required {inputs: str}
@optional {parameters: map{adapter_id: str, best_of: int, decoder_input_details: bool, details: bool, do_sample: bool, frequency_penalty: num(float), grammar: any, max_new_tokens: int(int32), repetition_penalty: num(float), return_full_text: bool, seed: int(int64), stop: [str], temperature: num(float), top_k: int(int32), top_n_tokens: int(int32), top_p: num(float), truncate: int, typical_p: num(float), watermark: bool}}
@returns(200)
@errors {404}

@endgroup

@group chat
@endpoint POST /v1/chat/completions
@required {messages: [any]}
@optional {frequency_penalty: num(float), logit_bias: [num(float)], logprobs: bool, max_tokens: int(int32)=1024, model: str, n: int(int32), presence_penalty: num(float), response_format: any=null, seed: int(int64), stop: [str], stream: bool, stream_options: any, temperature: num(float), tool_choice: any=auto, tool_prompt: str, tools: [map{function!: map, type!: str}], top_logprobs: int(int32), top_p: num(float)}
@returns(200) {choices: [map], created: int(int64), id: str, model: str, system_fingerprint: str, usage: map{completion_tokens: int(int32), prompt_tokens: int(int32), total_tokens: int(int32)}}
@errors {422, 424, 429, 500}

@endgroup

@group completions
@endpoint POST /v1/completions
@required {prompt: [str]}
@optional {frequency_penalty: num(float), max_tokens: int(int32)=1024, model: str, repetition_penalty: num(float), seed: int(int64), stop: [str], stream: bool, suffix: str, temperature: num(float), top_p: num(float)}
@returns(200) {choices: [map], created: int(int64), id: str, model: str, system_fingerprint: str, usage: map{completion_tokens: int(int32), prompt_tokens: int(int32), total_tokens: int(int32)}}
@errors {422, 424, 429, 500}

@endgroup

@group models
@endpoint GET /v1/models
@returns(200) {created: int(int64), id: str, object: str, owned_by: str}
@errors {404}

@endgroup

@end
