Running TGI - AssertionError: libcuda.so cannot found!
After converting the official TGI Docker image to an Apptainer image, I encountered this issue when trying to run TGI on a single node with 4 GPUs, sharding the NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO model.
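The container was launched roughly like this (a sketch; the image name and tag are placeholders for our setup):

# Build is assumed to have been something like:
# apptainer build tgi.sif docker://ghcr.io/huggingface/text-generation-inference:1.4
apptainer run --nv tgi.sif \
    --model-id NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO \
    --num-shard 4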
The error message is:
2024-03-21T14:10:43.340373Z ERROR text_generation_launcher: Method Warmup encountered an error.
Traceback (most recent call last):
File "/opt/conda/bin/text-generation-server", line 8, in <module>
sys.exit(app())
File "/opt/conda/lib/python3.10/site-packages/typer/main.py", line 311, in __call__
return get_command(self)(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/click/core.py", line 1157, in __call__
return self.main(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/typer/core.py", line 778, in main
return _main(
File "/opt/conda/lib/python3.10/site-packages/typer/core.py", line 216, in _main
rv = self.invoke(ctx)
File "/opt/conda/lib/python3.10/site-packages/click/core.py", line 1688, in invoke
return _process_result(sub_ctx.command.invoke(sub_ctx))
File "/opt/conda/lib/python3.10/site-packages/click/core.py", line 1434, in invoke
return ctx.invoke(self.callback, **ctx.params)
File "/opt/conda/lib/python3.10/site-packages/click/core.py", line 783, in invoke
return __callback(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/typer/main.py", line 683, in wrapper
return callback(**use_params) # type: ignore
File "/opt/conda/lib/python3.10/site-packages/text_generation_server/cli.py", line 89, in serve
server.serve(
File "/opt/conda/lib/python3.10/site-packages/text_generation_server/server.py", line 235, in serve
asyncio.run(
File "/opt/conda/lib/python3.10/asyncio/runners.py", line 44, in run
return loop.run_until_complete(main)
File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 636, in run_until_complete
self.run_forever()
File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 603, in run_forever
self._run_once()
File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 1909, in _run_once
handle._run()
File "/opt/conda/lib/python3.10/asyncio/events.py", line 80, in _run
self._context.run(self._callback, *self._args)
File "/opt/conda/lib/python3.10/site-packages/grpc_interceptor/server.py", line 165, in invoke_intercept_method
return await self.intercept(
> File "/opt/conda/lib/python3.10/site-packages/text_generation_server/interceptor.py", line 21, in intercept
return await response
File "/opt/conda/lib/python3.10/site-packages/opentelemetry/instrumentation/grpc/_aio_server.py", line 82, in _unary_interceptor
raise error
File "/opt/conda/lib/python3.10/site-packages/opentelemetry/instrumentation/grpc/_aio_server.py", line 73, in _unary_interceptor
return await behavior(request_or_iterator, context)
File "/opt/conda/lib/python3.10/site-packages/text_generation_server/server.py", line 95, in Warmup
max_supported_total_tokens = self.model.warmup(batch)
File "/opt/conda/lib/python3.10/site-packages/text_generation_server/models/flash_causal_lm.py", line 756, in warmup
_, batch, _ = self.generate_token(batch)
File "/opt/conda/lib/python3.10/contextlib.py", line 79, in inner
return func(*args, **kwds)
File "/opt/conda/lib/python3.10/site-packages/text_generation_server/models/flash_causal_lm.py", line 941, in generate_token
raise e
File "/opt/conda/lib/python3.10/site-packages/text_generation_server/models/flash_causal_lm.py", line 938, in generate_token
out, speculative_logits = self.forward(batch)
File "/opt/conda/lib/python3.10/site-packages/text_generation_server/models/flash_mistral.py", line 498, in forward
logits, speculative_logits = self.model.forward(
File "/opt/conda/lib/python3.10/site-packages/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py", line 847, in forward
hidden_states = self.model(
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py", line 789, in forward
hidden_states, residual = layer(
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py", line 735, in forward
moe_output = self.moe(normed_attn_res_output)
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py", line 586, in forward
return self.sparse_forward(x)
File "/opt/conda/lib/python3.10/site-packages/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py", line 485, in sparse_forward
x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, self.top_k)
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/function.py", line 539, in apply
return super().apply(*args, **kwargs) # type: ignore[misc]
File "/opt/conda/lib/python3.10/site-packages/stk/backend/autocast.py", line 28, in decorate_fwd
return fwd(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/megablocks/ops/padded_gather.py", line 14, in forward
return kernels.padded_gather(
File "/opt/conda/lib/python3.10/site-packages/megablocks/backend/kernels.py", line 123, in padded_gather
_padded_copy[(indices.shape[0],)](
File "/opt/conda/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 100, in run
timings = {config: self._bench(*args, config=config, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 100, in <dictcomp>
timings = {config: self._bench(*args, config=config, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 83, in _bench
return do_bench(kernel_call, warmup=self.warmup, rep=self.rep, quantiles=(0.5, 0.2, 0.8))
File "/opt/conda/lib/python3.10/site-packages/triton/testing.py", line 104, in do_bench
fn()
File "/opt/conda/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 81, in kernel_call
self.fn.run(*args, num_warps=config.num_warps, num_stages=config.num_stages, **current)
File "<string>", line 63, in _padded_copy
File "/opt/conda/lib/python3.10/site-packages/triton/compiler/compiler.py", line 425, in compile
so_path = make_stub(name, signature, constants)
File "/opt/conda/lib/python3.10/site-packages/triton/compiler/make_launcher.py", line 39, in make_stub
so = _build(name, src_path, tmpdir)
File "/opt/conda/lib/python3.10/site-packages/triton/common/build.py", line 61, in _build
cuda_lib_dirs = libcuda_dirs()
File "/opt/conda/lib/python3.10/site-packages/triton/common/build.py", line 30, in libcuda_dirs
assert any(os.path.exists(os.path.join(path, 'libcuda.so')) for path in dirs), msg
AssertionError: libcuda.so cannot found!
The issue seems to come from Triton not finding libcuda.so, which is usually provided by the NVIDIA Container Toolkit at a specific path; it is also present in the NVIDIA PyTorch container (see this issue on the TGI GitHub repo). Since we do not have the NVIDIA Container Toolkit for now, we run the container with Apptainer's --nv flag and have to find a workaround. The workaround described in the issue needs a writable filesystem when running the container (a sketch follows below); maybe there is an easier solution?
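A minimal sketch of that kind of workaround, assuming --nv binds the driver libraries to /.singularity.d/libs (Apptainer's default bind location) and that this Triton build searches the ldconfig cache; the symlink target directory is also an assumption:

# Check where --nv actually puts libcuda.so and whether the linker cache
# inside the container can see it:
apptainer exec --nv tgi.sif sh -c 'ldconfig -p | grep libcuda'
apptainer exec --nv tgi.sif find / -name 'libcuda.so*' -not -path '/proc/*' 2>/dev/null

# With a writable overlay (--writable-tmpfs), symlink libcuda.so into a
# directory ldconfig searches, refresh the cache, then launch TGI:
apptainer exec --nv --writable-tmpfs tgi.sif bash -c '
    ln -s /.singularity.d/libs/libcuda.so /usr/lib/x86_64-linux-gnu/libcuda.so &&
    ldconfig &&
    text-generation-launcher \
        --model-id NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO \
        --num-shard 4'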
Newer Triton versions consult the LD_LIBRARY_PATH environment variable when looking for libcuda.so, so upgrading Triton and pointing that variable at the bound driver libraries might be a way forward (sketch below). Let's also try to get the NVIDIA Container Toolkit onto our systems ...
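A sketch of that route, assuming a Triton version that honors LD_LIBRARY_PATH and the same /.singularity.d/libs bind location as above:

# --nv normally prepends the bound driver libraries to LD_LIBRARY_PATH
# already, so this mainly helps if the image or launcher resets it;
# the path is an assumption about our image.
apptainer exec --nv tgi.sif bash -c '
    export LD_LIBRARY_PATH="/.singularity.d/libs:${LD_LIBRARY_PATH}"
    text-generation-launcher \
        --model-id NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO \
        --num-shard 4'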