
Init

Open Sharat Patil requested to merge init into main
1 file changed  +1  −1
@@ -6,7 +6,7 @@ from api_writer.utils import load_config,load_templates
 import time
 model_id = os.environ['BASE_MODEL'] #"unsloth/tinyllama-bnb-4bit"#
-llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, quantization="bitsandbytes",load_format="bitsandbytes",enable_lora=True,enable_prefix_caching=True,max_lora_rank=64)
+llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, quantization="bitsandbytes",load_format="bitsandbytes",enable_lora=True,enable_prefix_caching=True,max_lora_rank=32,max_loras=1)
 gpu_stats = torch.cuda.get_device_properties(0)
 start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
 max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
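
The change lowers max_lora_rank from 64 to 32 and caps the engine at a single resident adapter (max_loras=1), so any LoRA served through this engine must have rank 32 or less. A minimal sketch of how the LoRA-enabled engine would then be exercised at generation time (the adapter name, path, and prompt below are hypothetical, not taken from this repository):

    from vllm import SamplingParams
    from vllm.lora.request import LoRARequest

    # Hypothetical adapter; its rank must not exceed max_lora_rank=32.
    lora = LoRARequest("api_writer_adapter", 1, "/path/to/adapter")

    params = SamplingParams(temperature=0.0, max_tokens=256)
    # With max_loras=1, only one adapter can be active in a batch at a time.
    outputs = llm.generate(["Write a Python client for the example API."], params, lora_request=lora)
    print(outputs[0].outputs[0].text)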