{"note":"OpenAPI conversion -- returning structured metadata","name":"together","description":"Together APIs","version":"2.0.0","base_url":"https://api.together.xyz/v1","endpoints":98,"raw":"@lap v0.3\n# Machine-readable API spec. Each @endpoint block is one API call.\n@api Together APIs\n@base https://api.together.xyz/v1\n@version 2.0.0\n@auth Bearer bearer\n@endpoints 98\n@hint download_for_search\n@toc deployments(17), voices(1), videos(2), chat(1), completions(1), embeddings(1), models(2), jobs(2), images(1), files(5), fine-tunes(10), finetune(1), rerank(1), audio(4), compute(11), clusters(1), endpoints(5), hardware(1), tci(2), batches(4), evaluation(5), realtime(1), queue(4), rl(15)\n\n@group deployments\n@endpoint GET /deployments\n@desc Get the list of deployments\n@returns(200) {data: [map], object: any} # List of deployments\n@errors {500: Internal server error}\n\n@endpoint POST /deployments\n@desc Create a new deployment\n@required {gpu_type: str # GPUType specifies the GPU hardware to use (e.g., \"h100-80gb\")., image: str # Image is the container image to deploy from registry.together.ai., name: str # Name is the unique identifier for your deployment. Must contain only alphanumeric characters, underscores, or hyphens (1-100 characters)}\n@optional {args: [str] # Args overrides the container's CMD. Provide as an array of arguments (e.g., [\"python\", \"app.py\"]), autoscaling: any # Autoscaling configuration. Example: {\"metric\": \"QueueBacklogPerWorker\", \"target\": 1.01} to scale based on queue backlog. Omit or set to null to disable autoscaling, command: [str] # Command overrides the container's ENTRYPOINT. 
Provide as an array (e.g., [\"/bin/sh\", \"-c\"]), cpu: num # CPU is the number of CPU cores to allocate per container instance (e.g., 0.1 = 100 milli cores), description: str # Description is an optional human-readable description of your deployment, environment_variables: [map{name!: str, value: str, value_from_secret: str}] # EnvironmentVariables is a list of environment variables to set in the container. Each must have a name and either a value or value_from_secret, gpu_count: int # GPUCount is the number of GPUs to allocate per container instance. Defaults to 0 if not specified, health_check_path: str # HealthCheckPath is the HTTP path for health checks (e.g., \"/health\"). If set, the platform will check this endpoint to determine container health, max_replicas: int # MaxReplicas is the maximum number of container instances that can be scaled up to. If not set, will be set to MinReplicas, memory: num # Memory is the amount of RAM to allocate per container instance in GiB (e.g., 0.5 = 512MiB), min_replicas: int # MinReplicas is the minimum number of container instances to run. Defaults to 1 if not specified, port: int # Port is the container port your application listens on (e.g., 8080 for web servers). Required if your application serves traffic, storage: int # Storage is the amount of ephemeral disk storage to allocate per container instance (e.g., 10 = 10GiB), termination_grace_period_seconds: int # TerminationGracePeriodSeconds is the time in seconds to wait for graceful shutdown before forcefully terminating the replica, volumes: [map{mount_path!: str, name!: str, version: int}] # Volumes is a list of volume mounts to attach to the container. 
Each mount must reference an existing volume by name}\n@returns(200) {args: [str], autoscaling: any, command: [str], cpu: num, created_at: str(date-time), description: str, desired_replicas: int, environment_variables: [map], gpu_count: int, gpu_type: str, health_check_path: str, id: str, image: str, max_replicas: int, memory: num, min_replicas: int, name: str, object: any, port: int, ready_replicas: int, replica_events: map, status: any, storage: int, updated_at: str(date-time), volumes: [map]} # Deployment created successfully\n@errors {400: Invalid request, 500: Internal server error}\n\n@endpoint DELETE /deployments/{id}\n@desc Delete a deployment\n@required {id: str}\n@returns(200) Deployment deleted successfully\n@errors {404: Deployment not found, 500: Internal server error}\n\n@endpoint GET /deployments/{id}\n@desc Get a deployment by ID or name\n@required {id: str}\n@returns(200) {args: [str], autoscaling: any, command: [str], cpu: num, created_at: str(date-time), description: str, desired_replicas: int, environment_variables: [map], gpu_count: int, gpu_type: str, health_check_path: str, id: str, image: str, max_replicas: int, memory: num, min_replicas: int, name: str, object: any, port: int, ready_replicas: int, replica_events: map, status: any, storage: int, updated_at: str(date-time), volumes: [map]} # Deployment details\n@errors {404: Deployment not found, 500: Internal server error}\n\n@endpoint PATCH /deployments/{id}\n@desc Update a deployment\n@required {id: str}\n@optional {args: [str] # Args overrides the container's CMD. Provide as an array of arguments (e.g., [\"python\", \"app.py\"]), autoscaling: any # Autoscaling configuration for the deployment. Set to {} to disable autoscaling, command: [str] # Command overrides the container's ENTRYPOINT. 
Provide as an array (e.g., [\"/bin/sh\", \"-c\"]), cpu: num # CPU is the number of CPU cores to allocate per container instance (e.g., 0.1 = 100 milli cores), description: str # Description is an optional human-readable description of your deployment, environment_variables: [map{name!: str, value: str, value_from_secret: str}] # EnvironmentVariables is a list of environment variables to set in the container. This will replace all existing environment variables, gpu_count: int # GPUCount is the number of GPUs to allocate per container instance, gpu_type: str # GPUType specifies the GPU hardware to use (e.g., \"h100-80gb\"), health_check_path: str # HealthCheckPath is the HTTP path for health checks (e.g., \"/health\"). Set to empty string to disable health checks, image: str # Image is the container image to deploy from registry.together.ai., max_replicas: int # MaxReplicas is the maximum number of replicas that can be scaled up to., memory: num # Memory is the amount of RAM to allocate per container instance in GiB (e.g., 0.5 = 512MiB), min_replicas: int # MinReplicas is the minimum number of replicas to run, name: str # Name is the new unique identifier for your deployment. Must contain only alphanumeric characters, underscores, or hyphens (1-100 characters), port: int # Port is the container port your application listens on (e.g., 8080 for web servers), storage: int # Storage is the amount of ephemeral disk storage to allocate per container instance (e.g., 10 = 10GiB), termination_grace_period_seconds: int # TerminationGracePeriodSeconds is the time in seconds to wait for graceful shutdown before forcefully terminating the replica, volumes: [map{mount_path!: str, name!: str, version: int}] # Volumes is a list of volume mounts to attach to the container. 
This will replace all existing volumes}\n@returns(200) {args: [str], autoscaling: any, command: [str], cpu: num, created_at: str(date-time), description: str, desired_replicas: int, environment_variables: [map], gpu_count: int, gpu_type: str, health_check_path: str, id: str, image: str, max_replicas: int, memory: num, min_replicas: int, name: str, object: any, port: int, ready_replicas: int, replica_events: map, status: any, storage: int, updated_at: str(date-time), volumes: [map]} # Deployment updated successfully\n@errors {400: Invalid request, 404: Deployment not found, 500: Internal server error}\n\n@endpoint GET /deployments/{id}/logs\n@desc Get logs for a deployment\n@required {id: str}\n@optional {replica_id: str}\n@returns(200) {lines: [str]} # Deployment logs\n@errors {404: Deployment not found, 500: Internal server error}\n\n@endpoint GET /deployments/secrets\n@desc Get the list of project secrets\n@returns(200) {data: [map], object: any} # List of secrets\n@errors {500: Internal server error}\n\n@endpoint POST /deployments/secrets\n@desc Create a new secret\n@required {name: str # Name is the unique identifier for the secret. Can contain alphanumeric characters, underscores, hyphens, forward slashes, and periods (1-100 characters), value: str # Value is the sensitive data to store securely (e.g., API keys, passwords, tokens). 
This value will be encrypted at rest}\n@optional {description: str # Description is an optional human-readable description of the secret's purpose (max 500 characters), project_id: str # ProjectID is ignored - the project is automatically determined from your authentication}\n@returns(200) {created_at: str, created_by: str, description: str, id: str, last_updated_by: str, name: str, object: any, updated_at: str} # Secret created successfully\n@errors {400: Invalid request, 500: Internal server error}\n\n@endpoint DELETE /deployments/secrets/{id}\n@desc Delete a secret\n@required {id: str}\n@returns(200) Secret deleted successfully\n@errors {404: Secret not found, 500: Internal server error}\n\n@endpoint GET /deployments/secrets/{id}\n@desc Get a secret by ID or name\n@required {id: str}\n@returns(200) {created_at: str, created_by: str, description: str, id: str, last_updated_by: str, name: str, object: any, updated_at: str} # Secret details\n@errors {404: Secret not found, 500: Internal server error}\n\n@endpoint PATCH /deployments/secrets/{id}\n@desc Update a secret\n@required {id: str}\n@optional {description: str # Description is an optional human-readable description of the secret's purpose (max 500 characters), name: str # Name is the new unique identifier for the secret. Can contain alphanumeric characters, underscores, hyphens, forward slashes, and periods (1-100 characters), project_id: str # ProjectID is ignored - the project is automatically determined from your authentication, value: str # Value is the new sensitive data to store securely. 
Updating this will replace the existing secret value}\n@returns(200) {created_at: str, created_by: str, description: str, id: str, last_updated_by: str, name: str, object: any, updated_at: str} # Secret updated successfully\n@errors {400: Invalid request, 404: Secret not found, 500: Internal server error}\n\n@endpoint GET /deployments/storage/{filename}\n@desc Download a file\n@required {filename: str}\n@errors {307: Redirect to signed download URL, 400: Invalid request, 404: File not found, 500: Internal error}\n\n@endpoint GET /deployments/storage/volumes\n@desc Get the list of project volumes\n@returns(200) {data: [map], object: any} # List of volumes\n@errors {500: Internal server error}\n\n@endpoint POST /deployments/storage/volumes\n@desc Create a new volume\n@required {content: any # Content specifies the content configuration for this volume, name: str # Name is the unique identifier for the volume within the project, type: any # Type is the volume type (currently only \"readOnly\" is supported)}\n@returns(200) {content: map{files: [map], source_prefix: str, type: str}, created_at: str, current_version: int, id: str, mounted_by: [str], name: str, object: str, type: str, updated_at: str, version_history: map} # Volume created successfully\n@errors {400: Invalid request, 500: Internal server error}\n\n@endpoint DELETE /deployments/storage/volumes/{id}\n@desc Delete a volume\n@required {id: str}\n@returns(200) Volume deleted successfully\n@errors {404: Volume not found, 500: Internal server error}\n\n@endpoint GET /deployments/storage/volumes/{id}\n@desc Get a volume by ID or name\n@required {id: str}\n@returns(200) {content: map{files: [map], source_prefix: str, type: str}, created_at: str, current_version: int, id: str, mounted_by: [str], name: str, object: str, type: str, updated_at: str, version_history: map} # Volume details\n@errors {404: Volume not found, 500: Internal server error}\n\n@endpoint PATCH /deployments/storage/volumes/{id}\n@desc Update a 
volume\n@required {id: str}\n@optional {content: any # Content specifies the new content that will be preloaded to this volume, name: str # Name is the new unique identifier for the volume within the project, type: any # Type is the new volume type (currently only \"readOnly\" is supported)}\n@returns(200) {content: map{files: [map], source_prefix: str, type: str}, created_at: str, current_version: int, id: str, mounted_by: [str], name: str, object: str, type: str, updated_at: str, version_history: map} # Volume updated successfully\n@errors {400: Invalid request, 404: Volume not found, 500: Internal server error}\n\n@endgroup\n\n@group voices\n@endpoint GET /voices\n@desc Fetch available voices for each model\n@returns(200) {data: [map]} # Success\n\n@endgroup\n\n@group videos\n@endpoint GET /videos/{id}\n@desc Fetch video metadata\n@required {id: str}\n@returns(200) {id: str, object: any, model: str, status: str, created_at: num, completed_at: num, size: str, seconds: str, error: map{code: str, message: str}, outputs: map{cost: int, video_url: str}} # Success\n@errors {400: Invalid request parameters., 404: Video ID not found.}\n\n@endpoint POST /videos\n@desc Create video\n@required {model: str # The model to be used for the video creation request.}\n@optional {prompt: str # Text prompt that describes the video to generate., height: int, width: int, resolution: str # Video resolution., ratio: str # Aspect ratio of the video., seconds: str # Clip duration in seconds., fps: int # Frames per second. Defaults to 24., steps: int # The number of denoising steps the model performs during video generation. More steps typically result in higher quality output but require longer processing time., seed: int # Seed to use in initializing the video generation.  Using the same seed allows deterministic video generation.  If not provided a random seed is generated for each request., guidance_scale: int # Controls how closely the video generation follows your prompt. 
Higher values make the model adhere more strictly to your text description, while lower values allow more creative freedom. guidance_scale affects both visual content and temporal consistency. Recommended range is 6.0-10.0 for most video models. Values above 12 may cause over-guidance artifacts or unnatural motion patterns., output_format: str(MP4/WEBM), output_quality: int # Compression quality. Defaults to 20., negative_prompt: str # Similar to prompt, but specifies what to avoid instead of what to include, generate_audio: bool # Whether to generate audio for the video., media: map{frame_images: [map], frame_videos: [map], reference_images: [str], reference_videos: [map], source_video: map, audio_inputs: [map]} # Contains all media inputs for video generation. Accepted fields depend on the model type., frame_images: [map{input_image!: str, frame: any}] # Deprecated: use media.frame_images instead. Array of images to guide video generation, similar to keyframes., reference_images: [str] # Deprecated: use media.reference_images instead. Unlike frame_images which constrain specific timeline positions, reference images guide the general appearance that should appear consistently across the video.}\n@returns(200) {id: str, object: any, model: str, status: str, created_at: num, completed_at: num, size: str, seconds: str, error: map{code: str, message: str}, outputs: map{cost: int, video_url: str}} # Success\n\n@endgroup\n\n@group chat\n@endpoint POST /chat/completions\n@desc Create chat completion\n@required {messages: [any] # A list of messages comprising the conversation so far., model: str # The name of the model to query.  [See all of Together AI's chat models](https://docs.together.ai/docs/serverless-models#chat-models)}\n@optional {max_tokens: int # The maximum number of tokens to generate., stop: [str] # A list of string sequences that will truncate (stop) inference text output. 
For example, \"\" will stop generation as soon as the model generates the given token., temperature: num(float) # A decimal number from 0-1 that determines the degree of randomness in the response. A temperature less than 1 favors more correctness and is appropriate for question answering or summarization. A value closer to 1 introduces more randomness in the output., top_p: num(float) # A percentage (also called the nucleus parameter) that's used to dynamically adjust the number of choices for each predicted token based on the cumulative probabilities. It specifies a probability threshold below which all less likely tokens are filtered out. This technique helps maintain diversity and generate more fluent and natural-sounding text., top_k: int(int32) # An integer that's used to limit the number of choices for the next predicted word or token. It specifies the maximum number of tokens to consider at each step, based on their probability of occurrence. This technique helps to speed up the generation process and can improve the quality of the generated text by focusing on the most likely options., context_length_exceeded_behavior: str(truncate/error)=error # Defined the behavior of the API when max_tokens exceed the maximum context length of the model. When set to 'error', API will return 400 with appropriate error message. When set to 'truncate', override the max_tokens with maximum context length of the model., repetition_penalty: num # A number that controls the diversity of generated text by reducing the likelihood of repeated sequences. Higher values decrease repetition., stream: bool # If true, stream tokens as Server-Sent Events as the model generates them instead of waiting for the full model response. The stream terminates with `data: [DONE]`. If false, return a single JSON object containing the results., logprobs: int # An integer between 0 and 20 of the top k tokens to return log probabilities for at each generation step, instead of just the sampled token. 
Log probabilities help assess model confidence in token predictions., echo: bool # If true, the response will contain the prompt. Can be used with `logprobs` to return prompt logprobs., n: int # The number of completions to generate for each prompt., min_p: num(float) # A number between 0 and 1 that can be used as an alternative to top_p and top-k., presence_penalty: num(float) # A number between -2.0 and 2.0 where a positive value increases the likelihood of a model talking about new topics., frequency_penalty: num(float) # A number between -2.0 and 2.0 where a positive value decreases the likelihood of repeating tokens that have already been mentioned., logit_bias: map # Adjusts the likelihood of specific tokens appearing in the generated output., seed: int # Seed value for reproducibility., function_call: any, response_format: any # An object specifying the format that the model must output.  Setting to `{ \"type\": \"json_schema\", \"json_schema\": {...} }` enables Structured Outputs which ensures the model will match your supplied JSON schema. Learn more in the [Structured Outputs guide](https://docs.together.ai/docs/json-mode).  Setting to `{ \"type\": \"json_object\" }` enables the older JSON mode, which ensures the message the model generates is valid JSON. Using `json_schema` is preferred for models that support it., tools: [map{type: str, function: map}] # A list of tools the model may call. Currently, only functions are supported as a tool. Use this to provide a list of functions the model may generate JSON inputs for., tool_choice: any # Controls which (if any) function is called by the model. By default uses `auto`, which lets the model pick between generating a message or calling a function., compliance: any, chat_template_kwargs: map # Additional configuration to pass to model engine., safety_model: str # The name of the moderation model used to validate tokens. 
Choose from the available moderation models found [here](https://docs.together.ai/docs/inference-models#moderation-models)., reasoning_effort: str(low/medium/high) # Controls the level of reasoning effort the model should apply when generating responses. Higher values may result in more thoughtful and detailed responses but may take longer to generate., reasoning: map{enabled: bool} # For models that support toggling reasoning functionality, this object can be used to control that functionality.}\n@returns(200) {id: str, choices: [map], usage: map?{prompt_tokens: int, completion_tokens: int, total_tokens: int}, created: int, model: str, prompt: [map], object: any, warnings: [map]} # 200\n@errors {400: BadRequest, 401: Unauthorized, 404: NotFound, 429: RateLimit, 503: Overloaded, 504: Timeout}\n\n@endgroup\n\n@group completions\n@endpoint POST /completions\n@desc Create completion\n@required {prompt: str # A string providing context for the model to complete., model: str # The name of the model to query.  [See all of Together AI's chat models](https://docs.together.ai/docs/serverless-models#chat-models)}\n@optional {max_tokens: int # The maximum number of tokens to generate., stop: [str] # A list of string sequences that will truncate (stop) inference text output. For example, \"\" will stop generation as soon as the model generates the given token., temperature: num(float) # A decimal number from 0-1 that determines the degree of randomness in the response. A temperature less than 1 favors more correctness and is appropriate for question answering or summarization. A value closer to 1 introduces more randomness in the output., top_p: num(float) # A percentage (also called the nucleus parameter) that's used to dynamically adjust the number of choices for each predicted token based on the cumulative probabilities. It specifies a probability threshold below which all less likely tokens are filtered out. 
This technique helps maintain diversity and generate more fluent and natural-sounding text., top_k: int(int32) # An integer that's used to limit the number of choices for the next predicted word or token. It specifies the maximum number of tokens to consider at each step, based on their probability of occurrence. This technique helps to speed up the generation process and can improve the quality of the generated text by focusing on the most likely options., repetition_penalty: num(float) # A number that controls the diversity of generated text by reducing the likelihood of repeated sequences. Higher values decrease repetition., stream: bool # If true, stream tokens as Server-Sent Events as the model generates them instead of waiting for the full model response. The stream terminates with `data: [DONE]`. If false, return a single JSON object containing the results., logprobs: int # An integer between 0 and 20 of the top k tokens to return log probabilities for at each generation step, instead of just the sampled token. Log probabilities help assess model confidence in token predictions., echo: bool # If true, the response will contain the prompt. Can be used with `logprobs` to return prompt logprobs., n: int # The number of completions to generate for each prompt., safety_model: str # The name of the moderation model used to validate tokens. 
Choose from the available moderation models found [here](https://docs.together.ai/docs/inference-models#moderation-models)., min_p: num(float) # A number between 0 and 1 that can be used as an alternative to top-p and top-k., presence_penalty: num(float) # A number between -2.0 and 2.0 where a positive value increases the likelihood of a model talking about new topics., frequency_penalty: num(float) # A number between -2.0 and 2.0 where a positive value decreases the likelihood of repeating tokens that have already been mentioned., logit_bias: map # Adjusts the likelihood of specific tokens appearing in the generated output., seed: int # Seed value for reproducibility.}\n@returns(200) {id: str, choices: [map], prompt: [map], usage: map?{prompt_tokens: int, completion_tokens: int, total_tokens: int}, created: int, model: str, object: any} # 200\n@errors {400: BadRequest, 401: Unauthorized, 404: NotFound, 429: RateLimit, 503: Overloaded, 504: Timeout}\n\n@endgroup\n\n@group embeddings\n@endpoint POST /embeddings\n@desc Create embedding\n@required {model: str # The name of the embedding model to use.  
[See all of Together AI's embedding models](https://docs.together.ai/docs/serverless-models#embedding-models), input: any}\n@returns(200) {object: any, model: str, data: [map]} # 200\n@errors {400: BadRequest, 401: Unauthorized, 404: NotFound, 429: RateLimit, 503: Overloaded, 504: Timeout}\n\n@endgroup\n\n@group models\n@endpoint GET /models\n@desc List all models\n@optional {dedicated: bool}\n@returns(200) 200\n@errors {400: BadRequest, 401: Unauthorized, 404: NotFound, 429: RateLimit, 504: Timeout}\n\n@endpoint POST /models\n@desc Upload a custom model or adapter\n@required {model_name: str # The name to give to your uploaded model, model_source: str # The source location of the model (Hugging Face repo or S3 path)}\n@optional {model_type: str(model/adapter)=model # Whether the model is a full model or an adapter, hf_token: str # Hugging Face token (if uploading from Hugging Face), description: str # A description of your model, base_model: str # The base model to use for an adapter if setting it to run against a serverless pool.  Only used for model_type `adapter`., lora_model: str # The lora pool to use for an adapter if setting it to run against, say, a dedicated pool.  Only used for model_type `adapter`.}\n@returns(200) {data: map{job_id: str, model_name: str, model_id: str, model_source: str}, message: str} # Model / adapter upload job created successfully\n\n@endgroup\n\n@group jobs\n@endpoint GET /jobs/{jobId}\n@desc Get job status\n@required {jobId: str}\n@returns(200) {type: str, job_id: str, status: str, status_updates: [map], args: map{description: str, modelName: str, modelSource: str}, created_at: str(date-time), updated_at: str(date-time)} # Job status retrieved successfully\n\n@endpoint GET /jobs\n@desc List all jobs\n@returns(200) {data: [map]} # Jobs retrieved successfully\n\n@endgroup\n\n@group images\n@endpoint POST /images/generations\n@desc Create image\n@required {prompt: str # A description of the desired images. 
Maximum length varies by model., model: str # The model to use for image generation.  [See all of Together AI's image models](https://docs.together.ai/docs/serverless-models#image-models)}\n@optional {steps: int=20 # Number of generation steps., image_url: str # URL of an image to use for image models that support it., seed: int # Seed used for generation. Can be used to reproduce image generations., n: int=1 # Number of image results to generate., height: int=1024 # Height of the image to generate in number of pixels., width: int=1024 # Width of the image to generate in number of pixels., negative_prompt: str # The prompt or prompts not to guide the image generation., response_format: str(base64/url) # Format of the image response. Can be either a base64 string or a URL., guidance_scale: num=3.5 # Adjusts the alignment of the generated image with the input prompt. Higher values (e.g., 8-10) make the output more faithful to the prompt, while lower values (e.g., 1-5) encourage more creative freedom., output_format: str(jpeg/png)=jpeg # The format of the image response. Can be either be `jpeg` or `png`. Defaults to `jpeg`., image_loras: [map{path!: str, scale!: num}] # An array of objects that define LoRAs (Low-Rank Adaptations) to influence the generated image., reference_images: [str] # An array of image URLs that guide the overall appearance and style of the generated image. 
These reference images influence the visual characteristics consistently across the generation., disable_safety_checker: bool # If true, disables the safety checker for image generation.}\n@returns(200) {id: str, model: str, object: any, data: [any]} # Image generated successfully\n\n@endgroup\n\n@group files\n@endpoint GET /files\n@desc List all files\n@returns(200) {data: [map]} # List of files\n\n@endpoint GET /files/{id}\n@desc Retrieve file metadata\n@required {id: str}\n@returns(200) {id: str, object: any, created_at: int, filename: str, bytes: int, purpose: str, Processed: bool, FileType: str} # File retrieved successfully\n\n@endpoint DELETE /files/{id}\n@desc Delete a file\n@required {id: str}\n@returns(200) {id: str, deleted: bool} # File deleted successfully\n\n@endpoint GET /files/{id}/content\n@desc Get file contents\n@required {id: str}\n@returns(200) File content retrieved successfully\n@errors {500: Internal Server Error}\n\n@endpoint POST /files/upload\n@desc Upload a file\n@returns(200) {id: str, object: any, created_at: int, filename: str, bytes: int, purpose: str, Processed: bool, FileType: str} # File uploaded successfully\n@errors {400: Bad Request, 401: Unauthorized, 500: Internal Server Error}\n\n@endgroup\n\n@group fine-tunes\n@endpoint POST /fine-tunes\n@desc Create job\n@required {training_file: str # File-ID of a training file uploaded to the Together API, model: str # Name of the base model to run fine-tune job on}\n@optional {validation_file: str # File-ID of a validation file uploaded to the Together API, packing: bool=true # Whether to use sequence packing for training., n_epochs: int=1 # Number of complete passes through the training dataset (higher values may improve results but increase cost and risk of overfitting), n_checkpoints: int=1 # Number of intermediate model versions saved during training for evaluation, n_evals: int=0 # Number of evaluations to be run on a given validation set during training, batch_size: any=max # 
Number of training examples processed together (larger batches use more memory but may train faster). Defaults to \"max\". We use training optimizations like packing, so the effective batch size may be different than the value you set., learning_rate: num(float)=0.00001 # Controls how quickly the model adapts to new information (too high may cause instability, too low may slow convergence), lr_scheduler: map{lr_scheduler_type!: str, lr_scheduler_args: any}, warmup_ratio: num(float)=0 # The percent of steps at the start of training to linearly increase the learning rate., max_grad_norm: num(float)=1 # Max gradient norm to be used for gradient clipping. Set to 0 to disable., weight_decay: num(float)=0 # Weight decay. Regularization parameter for the optimizer., random_seed: int # Random seed for reproducible training. When set, the same seed produces the same run (e.g. data shuffle, init). If omitted or null, the server applies its default seed (e.g. 42)., suffix: str # Suffix that will be added to your fine-tuned model name, wandb_api_key: str # Integration key for tracking experiments and model metrics on W&B platform, wandb_base_url: str # The base URL of a dedicated Weights & Biases instance., wandb_project_name: str # The Weights & Biases project for your run. If not specified, will use `together` as the project name., wandb_name: str # The Weights & Biases name for your run., wandb_entity: str # The Weights & Biases entity for your run., train_on_inputs: bool=auto # Whether to mask the user messages in conversational data or prompts in instruction data., training_method: map # The training method to use. 'sft' for Supervised Fine-Tuning or 'dpo' for Direct Preference Optimization., training_type: map=null # The training type to use. If not provided, the job will default to LoRA training type., multimodal_params: map{train_vision: bool}, from_checkpoint: str # The checkpoint identifier to continue training from a previous fine-tuning job. 
Format is `{$JOB_ID}` or `{$OUTPUT_MODEL_NAME}` or `{$JOB_ID}:{$STEP}` or `{$OUTPUT_MODEL_NAME}:{$STEP}`. The step value is optional; without it, the final checkpoint will be used., from_hf_model: str # The Hugging Face Hub repo to start training from. Should be as close as possible to the base model (specified by the `model` argument) in terms of architecture and size., hf_model_revision: str # The revision of the Hugging Face Hub model to continue training from. E.g., hf_model_revision=main (default, used if the argument is not provided) or hf_model_revision='607a30d783dfa663caf39e06633721c8d4cfcd7e' (specific commit)., hf_api_token: str # The API token for the Hugging Face Hub., hf_output_repo_name: str # The name of the Hugging Face repository to upload the fine-tuned model to.}\n@returns(200) {id: str, status: str, created_at: str(date-time), updated_at: str(date-time), started_at: str(date-time), user_id: str, owner_address: str, total_price: int, token_count: int, events: [map], training_file: str, validation_file: str, packing: bool, model: str, model_output_name: str, suffix: str, n_epochs: int, n_evals: int, n_checkpoints: int, batch_size: int, training_type: any, training_method: any, learning_rate: num(float), lr_scheduler: map{lr_scheduler_type: str, lr_scheduler_args: any}, warmup_ratio: num(float), max_grad_norm: num(float), weight_decay: num(float), random_seed: int?, wandb_project_name: str, wandb_name: str, from_checkpoint: str, from_hf_model: str, hf_model_revision: str, progress: map{estimate_available: bool, seconds_remaining: int}} # Fine-tuning job initiated successfully\n\n@endpoint GET /fine-tunes\n@desc List all jobs\n@returns(200) {data: [map]} # List of fine-tune jobs\n\n@endpoint POST /fine-tunes/estimate-price\n@desc Estimate price\n@required {training_file: str # File-ID of a training file uploaded to the Together API}\n@optional {validation_file: str # File-ID of a validation file uploaded to the Together API, model: str # Name of 
the base model to run fine-tune job on, n_epochs: int=1 # Number of complete passes through the training dataset (higher values may improve results but increase cost and risk of overfitting), n_evals: int=0 # Number of evaluations to be run on a given validation set during training, training_method: map # The training method to use. 'sft' for Supervised Fine-Tuning or 'dpo' for Direct Preference Optimization., training_type: map=null # The training type to use. If not provided, the job will default to LoRA training type., from_checkpoint: str # The checkpoint identifier to continue training from a previous fine-tuning job. Format is `{$JOB_ID}` or `{$OUTPUT_MODEL_NAME}` or `{$JOB_ID}:{$STEP}` or `{$OUTPUT_MODEL_NAME}:{$STEP}`. The step value is optional; without it, the final checkpoint will be used.}\n@returns(200) {estimated_total_price: num, allowed_to_proceed: bool, user_limit: num, estimated_train_token_count: num, estimated_eval_token_count: num} # Price estimated successfully\n@errors {500: Internal Server Error}\n\n@endpoint GET /fine-tunes/{id}\n@desc List job\n@required {id: str}\n@returns(200) {id: str(uuid), training_file: str, validation_file: str, model: str, model_output_name: str, model_output_path: str, trainingfile_numlines: int, trainingfile_size: int, created_at: str(date-time), updated_at: str(date-time), started_at: str(date-time), n_epochs: int, n_checkpoints: int, n_evals: int, batch_size: any, learning_rate: num, lr_scheduler: map{lr_scheduler_type: str, lr_scheduler_args: any}, warmup_ratio: num, max_grad_norm: num(float), weight_decay: num(float), eval_steps: int, train_on_inputs: any, training_method: map, training_type: map, multimodal_params: map{train_vision: bool}, status: str, job_id: str, events: [map], token_count: int, param_count: int, total_price: int, epochs_completed: int, queue_depth: int, wandb_project_name: str, wandb_url: str, from_checkpoint: str, from_hf_model: str, hf_model_revision: str, progress: 
map{estimate_available: bool, seconds_remaining: int}} # Fine-tune job details retrieved successfully\n\n@endpoint DELETE /fine-tunes/{id}\n@desc Delete a fine-tune job\n@required {id: str}\n@optional {force: bool=false}\n@returns(200) {message: str} # Fine-tune job deleted successfully\n@errors {404: Fine-tune job not found, 500: Internal server error}\n\n@endpoint GET /fine-tunes/{id}/events\n@desc List job events\n@required {id: str}\n@returns(200) {data: [map]} # List of fine-tune events\n\n@endpoint GET /fine-tunes/{id}/checkpoints\n@desc List checkpoints\n@required {id: str}\n@returns(200) {data: [map]} # List of fine-tune checkpoints\n\n@endgroup\n\n@group finetune\n@endpoint GET /finetune/download\n@desc Download model\n@required {ft_id: str}\n@optional {checkpoint_step: int, checkpoint: str(merged/adapter/model_output_path)}\n@returns(200) Successfully downloaded the fine-tuned model or checkpoint.\n@errors {400: Invalid request parameters., 404: Fine-tune ID not found.}\n\n@endgroup\n\n@group fine-tunes\n@endpoint POST /fine-tunes/{id}/cancel\n@desc Cancel job\n@required {id: str}\n@returns(200) {id: str, status: str, created_at: str(date-time), updated_at: str(date-time), started_at: str(date-time), user_id: str, owner_address: str, total_price: int, token_count: int, events: [map], training_file: str, validation_file: str, packing: bool, model: str, model_output_name: str, suffix: str, n_epochs: int, n_evals: int, n_checkpoints: int, batch_size: int, training_type: any, training_method: any, learning_rate: num(float), lr_scheduler: map{lr_scheduler_type: str, lr_scheduler_args: any}, warmup_ratio: num(float), max_grad_norm: num(float), weight_decay: num(float), random_seed: int?, wandb_project_name: str, wandb_name: str, from_checkpoint: str, from_hf_model: str, hf_model_revision: str, progress: map{estimate_available: bool, seconds_remaining: int}} # Successfully cancelled the fine-tuning job.\n@errors {400: Invalid request parameters., 404: Fine-tune 
ID not found.}\n\n@endpoint GET /fine-tunes/models/supported\n@desc List supported models\n@returns(200) {models: [str]} # List of supported models.\n\n@endpoint GET /fine-tunes/models/limits\n@desc Get model limits\n@required {model_name: str}\n@returns(200) {model_name: str, full_training: map{max_batch_size: int, max_batch_size_dpo: int, min_batch_size: int}, lora_training: map{max_batch_size: int, max_batch_size_dpo: int, min_batch_size: int, max_rank: int, target_modules: [str]}, max_num_epochs: int, max_num_evals: int, max_learning_rate: num, min_learning_rate: num, supports_vision: bool, supports_tools: bool, supports_reasoning: bool, merge_output_lora: bool} # Model limits.\n@errors {404: Model not found or not supported for fine-tuning.}\n\n@endgroup\n\n@group rerank\n@endpoint POST /rerank\n@desc Create a rerank request\n@required {model: str # The model to be used for the rerank request.  [See all of Together AI's rerank models](https://docs.together.ai/docs/serverless-models#rerank-models), query: str # The search query to be used for ranking., documents: any # List of documents, which can be either strings or objects.}\n@optional {top_n: int # The number of top results to return., return_documents: bool # Whether to return supplied documents with the response., rank_fields: [str] # List of keys in the JSON Object document to rank by. Defaults to use all supplied keys for ranking.}\n@returns(200) {object: any, id: str, model: str, results: [map], usage: map?{prompt_tokens: int, completion_tokens: int, total_tokens: int}} # 200\n@errors {400: BadRequest, 401: Unauthorized, 404: NotFound, 429: RateLimit, 503: Overloaded, 504: Timeout}\n\n@endgroup\n\n@group audio\n@endpoint POST /audio/speech\n@desc Create audio generation request\n@required {model: any # The name of the model to query.  
[See all of Together AI's audio models](https://docs.together.ai/docs/serverless-models#audio-models) The current supported tts models are: - cartesia/sonic - hexgrad/Kokoro-82M - canopylabs/orpheus-3b-0.1-ft, input: str # Input text to generate the audio for, voice: str # The voice to use for generating the audio. The voices supported are different for each model. For eg - for canopylabs/orpheus-3b-0.1-ft, one of the voices supported is tara, for hexgrad/Kokoro-82M, one of the voices supported is af_alloy and for cartesia/sonic, one of the voices supported is \"friendly sidekick\".   You can view the voices supported for each model using the /v1/voices endpoint sending the model name as the query parameter. [View all supported voices here](https://docs.together.ai/docs/text-to-speech#supported-voices).}\n@optional {response_format: str(mp3/wav/raw)=wav # The format of audio output. Supported formats are mp3, wav, raw if streaming is false. If streaming is true, the only supported format is raw., language: str(en/de/fr/es/hi/it/ja/ko/nl/pl/pt/ru/sv/tr/zh)=en # Language of input text., response_encoding: str(pcm_f32le/pcm_s16le/pcm_mulaw/pcm_alaw)=pcm_f32le # Audio encoding of response, sample_rate: int=44100 # Sampling rate to use for the output audio. The default sampling rate for canopylabs/orpheus-3b-0.1-ft and hexgrad/Kokoro-82M is 24000 and for cartesia/sonic is 44100., bit_rate: int(32000/64000/96000/128000/192000)=128000 # Bitrate of the MP3 audio output in bits per second. Only applicable when response_format is mp3. Higher values produce better audio quality at larger file sizes. Default is 128000. Currently supported on Cartesia models., stream: bool=false # If true, output is streamed for several characters at a time instead of waiting for the full response. The stream terminates with `data: [DONE]`. 
If false, return the encoded audio as octet stream}\n@returns(200) OK\n@errors {400: BadRequest, 429: RateLimit}\n\n@endpoint GET /audio/speech/websocket\n@desc Real-time text-to-speech via WebSocket\n@optional {model: str(hexgrad/Kokoro-82M/cartesia/sonic-english)=hexgrad/Kokoro-82M, voice: str, max_partial_length: int=250}\n@errors {101: Switching Protocols - WebSocket connection established successfully.  Error message format: ```json {   \"type\": \"conversation.item.tts.failed\",   \"error\": {     \"message\": \"Error description\",     \"type\": \"invalid_request_error\",     \"param\": null,     \"code\": \"error_code\"   } } ```}\n\n@endpoint POST /audio/transcriptions\n@desc Create audio transcription request\n@returns(200) OK\n@errors {400: BadRequest, 401: Unauthorized, 429: RateLimit}\n\n@endpoint POST /audio/translations\n@desc Create audio translation request\n@returns(200) OK\n@errors {400: BadRequest, 401: Unauthorized, 429: RateLimit}\n\n@endgroup\n\n@group compute\n@endpoint GET /compute/clusters\n@desc List all GPU clusters.\n@returns(200) {clusters: [map]} # OK\n\n@endpoint POST /compute/clusters\n@desc Create GPU Cluster\n@required {region: str # Region to create the GPU cluster in. Usable regions can be found from `client.clusters.list_regions()`, gpu_type: str(H100_SXM/H200_SXM/RTX_6000_PCI/L40_PCIE/B200_SXM/H100_SXM_INF) # Type of GPU to use in the cluster, num_gpus: int # Number of GPUs to allocate in the cluster. This must be multiple of 8. For example, 8, 16 or 24, cluster_name: str # Name of the GPU cluster., driver_version: str(CUDA_12_5_555/CUDA_12_6_560/CUDA_12_6_565/CUDA_12_8_570) # NVIDIA driver version to use in the cluster., billing_type: str(RESERVED/ON_DEMAND) # RESERVED billing types allow you to specify the duration of the cluster reservation via the duration_days field. 
ON_DEMAND billing types will give you ownership of the cluster until you delete it.}\n@optional {cluster_type: str(KUBERNETES/SLURM) # Type of cluster to create., duration_days: int # Duration in days to keep the cluster running., shared_volume: map{volume_name!: str, size_tib!: int, region!: str}, volume_id: str # ID of an existing volume to use with the cluster creation.}\n@returns(200) {cluster_id: str, cluster_type: any, region: str, gpu_type: any, cluster_name: str, duration_hours: int, driver_version: any, volumes: [map], status: any, control_plane_nodes: [map], gpu_worker_nodes: [map], kube_config: str, num_gpus: int} # OK\n\n@endpoint GET /compute/clusters/{cluster_id}\n@desc Get GPU cluster by cluster ID\n@required {cluster_id: str}\n@returns(200) {cluster_id: str, cluster_type: any, region: str, gpu_type: any, cluster_name: str, duration_hours: int, driver_version: any, volumes: [map], status: any, control_plane_nodes: [map], gpu_worker_nodes: [map], kube_config: str, num_gpus: int} # OK\n\n@endpoint PUT /compute/clusters/{cluster_id}\n@desc Update a GPU Cluster.\n@required {cluster_id: str}\n@optional {cluster_type: any(KUBERNETES/SLURM) # Type of cluster to update., num_gpus: int # Number of GPUs to allocate in the cluster. This must be a multiple of 8. 
For example, 8, 16 or 24}\n@returns(200) {cluster_id: str, cluster_type: any, region: str, gpu_type: any, cluster_name: str, duration_hours: int, driver_version: any, volumes: [map], status: any, control_plane_nodes: [map], gpu_worker_nodes: [map], kube_config: str, num_gpus: int} # OK\n\n@endpoint DELETE /compute/clusters/{cluster_id}\n@desc Delete GPU cluster by cluster ID\n@required {cluster_id: str}\n@returns(200) {cluster_id: str} # OK\n\n@endpoint GET /compute/regions\n@desc List regions and corresponding supported driver versions\n@returns(200) {regions: [map]} # OK\n\n@endpoint GET /compute/clusters/storage/volumes\n@desc List all shared volumes.\n@returns(200) {volumes: [map]} # OK\n\n@endpoint PUT /compute/clusters/storage/volumes\n@desc Update a shared volume.\n@optional {volume_id: str # ID of the volume to update., size_tib: int # Size of the volume in whole tebibytes (TiB).}\n@returns(200) {volume_id: str, volume_name: str, size_tib: int, status: str} # OK\n\n@endpoint POST /compute/clusters/storage/volumes\n@desc Create a shared volume.\n@required {volume_name: str # Customizable name of the volume to create., size_tib: int # Volume size in whole tebibytes (TiB)., region: str # Region name. 
Usable regions can be found from `client.clusters.list_regions()`}\n@returns(200) {volume_id: str, volume_name: str, size_tib: int, status: str} # OK\n\n@endpoint GET /compute/clusters/storage/volumes/{volume_id}\n@desc Get shared volume by volume Id.\n@required {volume_id: str}\n@returns(200) {volume_id: str, volume_name: str, size_tib: int, status: str} # OK\n\n@endpoint DELETE /compute/clusters/storage/volumes/{volume_id}\n@desc Delete shared volume by volume id.\n@required {volume_id: str}\n@returns(200) {success: bool} # OK\n\n@endgroup\n\n@group clusters\n@endpoint GET /clusters/availability-zones\n@desc List all available availability zones.\n@returns(200) {avzones: [str]} # Success\n\n@endgroup\n\n@group endpoints\n@endpoint GET /endpoints\n@desc List all endpoints, can be filtered by type\n@optional {type: str(dedicated/serverless), usage_type: str(on-demand/reserved), mine: bool}\n@returns(200) {object: any, data: [map]} # 200\n@errors {403: Unauthorized, 500: Internal error}\n\n@endpoint POST /endpoints\n@desc Create a dedicated endpoint, it will start automatically\n@required {model: str # The model to deploy on this endpoint, hardware: str # The hardware configuration to use for this endpoint, autoscaling: map{min_replicas!: int(int32), max_replicas!: int(int32)} # Configuration for automatic scaling of replicas based on demand.}\n@optional {display_name: str # A human-readable name for the endpoint, disable_prompt_cache: bool=false # This parameter is deprecated and no longer has any effect., disable_speculative_decoding: bool=false # Whether to disable speculative decoding for this endpoint, state: str(STARTED/STOPPED)=STARTED # The desired state of the endpoint, inactive_timeout: int # The number of minutes of inactivity after which the endpoint will be automatically stopped. 
Set to null, omit or set to 0 to disable automatic timeout., availability_zone: str # Create the endpoint in a specified availability zone (e.g., us-central-4b)}\n@returns(200) {object: any, id: str, name: str, display_name: str, model: str, hardware: str, type: str, owner: str, state: str, autoscaling: map{min_replicas: int(int32), max_replicas: int(int32)}, created_at: str(date-time)} # 200\n@errors {403: Unauthorized, 500: Internal error}\n\n@endpoint GET /endpoints/{endpointId}\n@desc Get endpoint by ID\n@required {endpointId: str}\n@returns(200) {object: any, id: str, name: str, display_name: str, model: str, hardware: str, type: str, owner: str, state: str, autoscaling: map{min_replicas: int(int32), max_replicas: int(int32)}, created_at: str(date-time)} # 200\n@errors {403: Unauthorized, 404: Not Found, 500: Internal error}\n\n@endpoint PATCH /endpoints/{endpointId}\n@desc Update endpoint, this can also be used to start or stop a dedicated endpoint\n@required {endpointId: str}\n@optional {display_name: str # A human-readable name for the endpoint, state: str(STARTED/STOPPED) # The desired state of the endpoint, autoscaling: map{min_replicas!: int(int32), max_replicas!: int(int32)} # Configuration for automatic scaling of replicas based on demand., inactive_timeout: int # The number of minutes of inactivity after which the endpoint will be automatically stopped. 
Set to 0 to disable automatic timeout.}\n@returns(200) {object: any, id: str, name: str, display_name: str, model: str, hardware: str, type: str, owner: str, state: str, autoscaling: map{min_replicas: int(int32), max_replicas: int(int32)}, created_at: str(date-time)} # 200\n@errors {403: Unauthorized, 404: Not Found, 500: Internal error}\n\n@endpoint DELETE /endpoints/{endpointId}\n@desc Delete endpoint\n@required {endpointId: str}\n@returns(204) No Content - Endpoint successfully deleted\n@errors {403: Unauthorized, 404: Not Found, 500: Internal error}\n\n@endgroup\n\n@group hardware\n@endpoint GET /hardware\n@desc List available hardware configurations\n@optional {model: str}\n@returns(200) {object: any, data: [map]} # List of available hardware configurations\n@errors {403: Unauthorized, 500: Internal error}\n\n@endgroup\n\n@group tci\n@endpoint POST /tci/execute\n@desc Executes the given code snippet and returns the output. Without a session_id, a new session will be created to run the code. If you do pass in a valid session_id, the code will be run in that session. This is useful for running multiple code snippets in the same environment, because dependencies and similar things are persisted\n@required {code: str # Code snippet to execute., language: any=python # Programming language for the code to execute. Currently only supports Python, but more will be added.}\n@optional {files: [map{content!: str, encoding!: str, name!: str}] # Files to upload to the session. If present, files will be uploaded before executing the given code., session_id: str # Identifier of the current session. Used to make follow-up calls. 
Requests will return an error if the session does not belong to the caller or has expired.}\n@returns(200) Execute Response\n\n@endpoint GET /tci/sessions\n@desc Lists all your currently active sessions.\n@returns(200) List Response\n\n@endgroup\n\n@group batches\n@endpoint GET /batches\n@desc List batch jobs\n@returns(200) OK\n@errors {401: Unauthorized, 500: Internal Server Error}\n\n@endpoint POST /batches\n@desc Create a batch job\n@required {endpoint: str # The endpoint to use for batch processing, input_file_id: str # ID of the uploaded input file containing batch requests}\n@optional {completion_window: str # Time window for batch completion (optional), priority: int # Priority for batch processing (optional), model_id: str # Model to use for processing batch requests}\n@returns(201) {job: map{id: str(uuid), user_id: str, input_file_id: str, file_size_bytes: int(int64), status: str, job_deadline: str(date-time), created_at: str(date-time), endpoint: str, progress: num(float64), model_id: str, output_file_id: str, error_file_id: str, error: str, completed_at: str(date-time)}, warning: str} # Job created (potentially with warnings)\n@errors {400: Bad Request, 401: Unauthorized, 429: Too Many Requests, 500: Internal Server Error}\n\n@endpoint GET /batches/{id}\n@desc Get a batch job\n@required {id: str}\n@returns(200) {id: str(uuid), user_id: str, input_file_id: str, file_size_bytes: int(int64), status: str, job_deadline: str(date-time), created_at: str(date-time), endpoint: str, progress: num(float64), model_id: str, output_file_id: str, error_file_id: str, error: str, completed_at: str(date-time)} # OK\n@errors {400: Bad Request, 401: Unauthorized, 403: Forbidden, 404: Not Found, 500: Internal Server Error}\n\n@endpoint POST /batches/{id}/cancel\n@desc Cancel a batch job\n@required {id: str}\n@returns(200) {id: str(uuid), user_id: str, input_file_id: str, file_size_bytes: int(int64), status: str, job_deadline: str(date-time), created_at: str(date-time), 
endpoint: str, progress: num(float64), model_id: str, output_file_id: str, error_file_id: str, error: str, completed_at: str(date-time)} # OK\n@errors {400: Bad Request, 401: Unauthorized, 403: Forbidden, 404: Not Found, 500: Internal Server Error}\n\n@endgroup\n\n@group evaluation\n@endpoint POST /evaluation\n@desc Create an evaluation job\n@required {type: str(classify/score/compare) # The type of evaluation to perform, parameters: any # Type-specific parameters for the evaluation}\n@returns(200) {workflow_id: str, status: str} # Evaluation job created successfully\n@errors {400: Invalid request format, 500: Failed to create evaluation job}\n\n@endpoint GET /evaluation\n@desc Get all evaluation jobs\n@optional {status: str, limit: int=10}\n@returns(200) evaluation jobs retrieved successfully\n@errors {400: Invalid request format, 500: Error retrieving jobs from manager}\n\n@endpoint GET /evaluation/model-list\n@desc Get model list\n@optional {model_source: str=all}\n@returns(200) {model_list: [str]} # Model list retrieved successfully\n@errors {400: Invalid request format, 500: Error retrieving model list}\n\n@endpoint GET /evaluation/{id}\n@desc Get evaluation job details\n@required {id: str}\n@returns(200) {workflow_id: str, type: str, owner_id: str, status: str, status_updates: [map], parameters: map, created_at: str(date-time), updated_at: str(date-time), results: any?} # Evaluation job details retrieved successfully\n@errors {404: Evaluation job not found, 500: Failed to get evaluation job}\n\n@endpoint GET /evaluation/{id}/status\n@desc Get evaluation job status and results\n@required {id: str}\n@returns(200) {status: str, results: any} # Evaluation job status and results retrieved successfully\n@errors {404: Evaluation job not found, 500: Failed to get evaluation job}\n\n@endgroup\n\n@group realtime\n@endpoint GET /realtime\n@desc Real-time audio transcription via WebSocket\n@required {model: str, input_audio_format: str=pcm_s16le_16000 # Audio format 
specification. Currently supports 16-bit PCM at 16kHz sample rate.}\n@errors {101: Switching Protocols - WebSocket connection established successfully.  Error message format: ```json {   \"type\": \"conversation.item.input_audio_transcription.failed\",   \"error\": {     \"message\": \"Error description\",     \"type\": \"invalid_request_error\",     \"param\": null,     \"code\": \"error_code\"   } } ```}\n\n@endgroup\n\n@group queue\n@endpoint POST /queue/cancel\n@desc Cancel a queued job\n@required {model: str # Model identifier the job was submitted to, request_id: str # The request ID returned from the submit endpoint}\n@returns(200) {status: str} # Successfully canceled\n@errors {400: Invalid request, 404: Request not found, 409: Job could not be canceled (already completed/failed), 500: Internal server error}\n\n@endpoint GET /queue/metrics\n@desc Get queue metrics\n@required {model: str}\n@returns(200) {messages_running: int, messages_waiting: int, total_jobs: int} # Queue metrics\n@errors {400: Invalid request, 401: Unauthorized, 500: Internal server error}\n\n@endpoint GET /queue/status\n@desc Get job status\n@required {request_id: str, model: str}\n@returns(200) {claimed_at: str(date-time), created_at: str(date-time), done_at: str(date-time), info: map, inputs: map, model: str, outputs: map, priority: int, request_id: str, retries: int, status: str, warnings: [str]} # Status information\n@errors {400: Invalid request, 401: Unauthorized, 404: Request not found, 500: Internal server error}\n\n@endpoint POST /queue/submit\n@desc Submit a queued job\n@required {model: str # Required model identifier, payload: map # Freeform model input. Passed unchanged to the model. Contents are model-specific.}\n@optional {info: map # Arbitrary JSON metadata stored with the job and returned in status responses. The model and system may add or update keys during processing., priority: int=0 # Job priority. Higher values are processed first (strict priority ordering). 
Jobs with equal priority are processed in submission order (FIFO).}\n@returns(200) {error: map{code: str, message: str, param: str, type: str}, requestId: str} # Successfully queued request\n@errors {400: Invalid request, 401: Unauthorized, 500: Internal server error}\n\n@endgroup\n\n@group rl\n@endpoint GET /rl/training-sessions\n@desc List training sessions\n@optional {status: str, limit: int(int32)=20, after: str}\n@returns(200) {data: [map], meta: map{limit: int(int32), has_more: bool, next_cursor: str}} # List of training sessions\n\n@endpoint POST /rl/training-sessions\n@desc Create training session\n@required {base_model: str # Base model to use for the training session}\n@optional {resume_from_checkpoint_id: str # Checkpoint ID to resume from, lora_config: map{rank: int(int64), alpha: int(int64), dropout: num(float)} # LoRA adapter configuration}\n@returns(200) {id: str, status: str, base_model: str, inference_checkpoints: [map], training_checkpoints: [map], resume_from_checkpoint_id: str, step: str(uint64), created_at: str(date-time), updated_at: str(date-time), lora_config: map{rank: int(int64), alpha: int(int64), dropout: num(float)}} # Training session details\n\n@endpoint GET /rl/training-sessions/{session_id}\n@desc Get training session\n@required {session_id: str}\n@returns(200) {id: str, status: str, base_model: str, inference_checkpoints: [map], training_checkpoints: [map], resume_from_checkpoint_id: str, step: str(uint64), created_at: str(date-time), updated_at: str(date-time), lora_config: map{rank: int(int64), alpha: int(int64), dropout: num(float)}} # Training session details\n\n@endpoint POST /rl/training-sessions/{session_id}/stop\n@desc Stop training session\n@required {session_id: str}\n@returns(200) {id: str, status: str, base_model: str, inference_checkpoints: [map], training_checkpoints: [map], resume_from_checkpoint_id: str, step: str(uint64), created_at: str(date-time), updated_at: str(date-time), lora_config: map{rank: int(int64), 
alpha: int(int64), dropout: num(float)}} # Training session details\n\n@endpoint GET /rl/training-sessions/{session_id}/operations/forward-backward/{operation_id}\n@desc Get forward-backward operation\n@required {session_id: str, operation_id: str}\n@returns(200) {id: str, status: str, output: map{loss: num(double), metrics: map}, error: map{code: str, message: str}} # Forward-backward operation details\n\n@endpoint GET /rl/training-sessions/{session_id}/operations/optim-step/{operation_id}\n@desc Get optim-step operation\n@required {session_id: str, operation_id: str}\n@returns(200) {id: str, status: str, output: map{step: str(uint64)}, error: map{code: str, message: str}}\n\n@endpoint GET /rl/training-sessions/{session_id}/operations/sample/{operation_id}\n@desc Get sample operation\n@required {session_id: str, operation_id: str}\n@returns(200) {id: str, status: str, output: map{rollouts: [map]}, error: map{code: str, message: str}}\n\n@endpoint POST /rl/training-sessions/{session_id}/operations/forward-backward\n@desc Forward-backward pass\n@required {session_id: str, samples: [map{model_input!: map, loss_inputs!: map}] # Batch of training samples to process, loss: map{type!: str, cross_entropy_params: map, grpo_params: map}}\n@returns(200) {id: str, status: str, output: map{loss: num(double), metrics: map}, error: map{code: str, message: str}}\n\n@endpoint POST /rl/training-sessions/{session_id}/operations/optim-step\n@desc Optimizer step\n@required {session_id: str}\n@optional {learning_rate: num(float)=0.0001 # Learning rate for this step., adamw_params: map{beta1: num(float), beta2: num(float), eps: num(float), weight_decay: num(float)} # AdamW optimizer parameters}\n@returns(200) {id: str, status: str, output: map{step: str(uint64)}, error: map{code: str, message: str}} # Optimizer step operation details\n\n@endpoint POST /rl/training-sessions/{session_id}/operations/sample\n@desc Sample\n@required {session_id: str, prompts: [map{chunks!: [map]}] # Input 
prompts as tokenized chunks}\n@optional {sampling_params: map{max_tokens: int(int32), temperature: num(float), top_p: num(float), top_k: int(int32), stop: [str], seed: str(int64)}, num_samples: int(int64)=1 # Number of completions to generate per prompt}\n@returns(200) {id: str, status: str, output: map{rollouts: [map]}, error: map{code: str, message: str}}\n\n@endpoint POST /rl/training-sessions/{session_id}/operations/inference-checkpoint\n@desc Create inference checkpoint\n@required {session_id: str}\n@returns(200) {id: str, status: str, output: map{model_name: str}, error: map{code: str, message: str}} # Inference checkpoint operation details\n\n@endpoint GET /rl/training-sessions/{session_id}/operations/inference-checkpoint/{operation_id}\n@desc Get inference checkpoint operation\n@required {session_id: str, operation_id: str}\n@returns(200) {id: str, status: str, output: map{model_name: str}, error: map{code: str, message: str}} # Inference checkpoint operation details\n\n@endpoint POST /rl/training-sessions/{session_id}/operations/training-checkpoint\n@desc Save training checkpoint\n@required {session_id: str}\n@returns(200) {id: str, status: str, output: map{checkpoint_id: str}, error: map{code: str, message: str}} # Save training checkpoint operation details\n\n@endpoint GET /rl/training-sessions/{session_id}/operations/training-checkpoint/{operation_id}\n@desc Get save training checkpoint operation\n@required {session_id: str, operation_id: str}\n@returns(200) {id: str, status: str, output: map{checkpoint_id: str}, error: map{code: str, message: str}} # Save training checkpoint operation details\n\n@endpoint GET /rl/checkpoints/{id}/download\n@desc Download checkpoint\n@required {id: str, variant: str}\n@returns(200) {data: [map]} # Checkpoint download URLs\n\n@endgroup\n\n@end\n"}