## Quick start
```shell
module load apptainer/1.2.2
module load image_pytorch  # I would prefer pytorch_sif
apptainer run $IMAGE_SIF example.py
```
- "module load" a container
- extend the base container
- run a Slurm job with the extended container
## Intro to containers
- what are containers
### Apptainer
- basic apptainer commands (see the sketch below)
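A minimal sketch of the everyday commands (image names are placeholders):
```shell
# download an image from a registry and convert it to a SIF file
apptainer pull docker://ubuntu:22.04

# open an interactive shell inside the container
apptainer shell ubuntu_22.04.sif

# run a single command inside the container
apptainer exec ubuntu_22.04.sif cat /etc/os-release

# execute the container's default runscript
apptainer run ubuntu_22.04.sif
```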
## Access the base containers
- `module load pytorch...`
- defines the env variable `$IMAGE_SIF` (see the sketch below)
- maybe provide `kernel.json` spec files for Jupyter Lab?
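For example (a sketch; the exact module name may differ):
```shell
module load pytorch
echo $IMAGE_SIF   # absolute path to the provided SIF image
apptainer exec --nv $IMAGE_SIF python -c "import torch; print(torch.__version__)"
```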
## Build your own containers
> :warning: Do not build on the login nodes!!!
- Divide into two parts:
### Our base images
- write your definition file on top of one of our base images
- locate the base image:
```shell
module load pytorch  # or image_pytorch
echo $IMAGE_SIF
```
or
```shell
module show image_pytorch
```
- copy and paste the definition file:
```
...
```
- build it:
```shell
apptainer build ...
```
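A concrete sketch of the build step (file names are placeholders; `--fakeroot` matches the build commands further below):
```shell
# on a compute node, not a login node
apptainer build --fakeroot my_image.sif my_image.def
```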
### Docker Hub
- NVIDIA containers for Raven (see the pull example below)
- AMD containers for Viper
NVIDIA distribution (Raven):
- TensorFlow release notes: https://docs.nvidia.com/deeplearning/frameworks/tensorflow-release-notes/index.html, published at https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tensorflow
- PyTorch release notes: https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/index.html, published at https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch

AMD distribution (Viper):
- Compatibility matrix: https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/docker-image-support-matrix.html
- PyTorch: https://hub.docker.com/r/rocm/pytorch
- TensorFlow: https://hub.docker.com/r/rocm/tensorflow
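For instance, a SIF can be built directly from the NGC PyTorch image used in the definition file below (a sketch; pick the tag from the release notes above):
```shell
apptainer build nvidia_pytorch.sif docker://nvcr.io/nvidia/pytorch:23.09-py3
```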
### Bind your folders and data
- bind all the folders/data/volumes you need (see the sketch below)
- make users aware of Python packages possibly installed in their user directory (`~/.local`), which can shadow the packages inside the container
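A sketch covering both points (paths and `train.py` are placeholders):
```shell
# bind the folders you need (comma-separated); $HOME is bound by default
apptainer exec --nv --bind /ptmp/$USER,/path/to/data:/data $IMAGE_SIF python train.py

# packages under ~/.local can shadow the container's packages;
# PYTHONNOUSERSITE=1 disables the user site-packages directory
apptainer exec --nv --env PYTHONNOUSERSITE=1 $IMAGE_SIF python train.py
```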
### Use your own Python modules
Depending on the use case (sketch below).

dev:
- set `PYTHONPATH`, or
- install in editable mode and always bind the package at the same location
- warning: be careful when working with compiled files

static:
- copy the files into the container
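A sketch of the development workflow (`my_package` is hypothetical):
```shell
# bind the package at a fixed location and expose it via PYTHONPATH
apptainer exec \
    --bind $HOME/my_package:/opt/my_package \
    --env PYTHONPATH=/opt/my_package \
    $IMAGE_SIF python -c "import my_package"
```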
## Use containers in RVS
You can use the [IPython kernel](https://ipython.readthedocs.io/en/stable/install/kernel_install.html) in your container with a generic Jupyter Lab installation by providing a `kernel.json` spec file.
> :warning: Make sure the `apptainer` module is loaded when running an RVS session!
> You can specify the default modules for your sessions during the initialization step.
### 1. Setting up the container
Make sure you install ipython and ipykernel in your container:
```shell
pip install ipython ipykernel
```
### 2. Setting up RVS
Load apptainer module when initializing your RVS session.
### 3. Creating the kernel
Create a kernel spec file
```bash
vim ~/.local/share/jupyter/kernels/my-kernel/kernel.json
```
that should look something like this:
```json
{
  "argv": [
    "apptainer",
    "exec",
    "--nv",
    "--bind",
    "{connection_file}:/tmp/connection_spec,/ptmp/<your user name>",
    "/absolute/path/to/your/container.sif",
    "python",
    "-m",
    "ipykernel_launcher",
    "-f",
    "/tmp/connection_spec"
  ],
  "display_name": "Name of your kernel",
  "language": "python",
  "metadata": {
    "debugger": true
  },
  "env": {
    "PYTHONPATH": "/add/custom/modules/here"
  }
}
```
The next time you request a Jupyter session, you can choose the generic Jupyter version and use your custom kernel.
Keep in mind that you are inside the container.
If you want to access files outside your home directory, you have to bind them explicitly in the kernel spec file when calling the apptainer command.
For example, in the kernel spec file above we bind your `ptmp` folder.
## Use containers in Slurm jobs
- `module load apptainer`
- `srun apptainer ...` (see the sketch below)
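A minimal batch script sketch (resources and the script name `train.py` are assumptions; adapt them to your project):
```shell
#!/bin/bash -l
#SBATCH --job-name=container-job
#SBATCH --nodes=1
#SBATCH --gres=gpu:1
#SBATCH --time=01:00:00

module load apptainer
module load image_pytorch

srun apptainer exec --nv $IMAGE_SIF python train.py
```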
### Distributed training with containers
- mention `torchrun` and `accelerate launch` (sketch below)
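For example, with `torchrun` (a sketch; the GPU count and `train.py` are assumptions):
```shell
srun apptainer exec --nv $IMAGE_SIF \
    torchrun --nproc_per_node=4 train.py
```
or with `accelerate launch`:
```shell
srun apptainer exec --nv $IMAGE_SIF \
    accelerate launch --num_processes=4 train.py
```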
## Port containers: Local <-> MPCDF
- recommend Docker for Windows/macOS users (sketch below)
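One possible workflow (a sketch; the host name and image names are assumptions):
```shell
# locally: build with Docker and export the image
docker build -t my_image:latest .
docker save my_image:latest -o my_image.tar

# copy the archive to the cluster
scp my_image.tar raven.mpcdf.mpg.de:

# on the cluster: convert the archive to a SIF file
apptainer build my_image.sif docker-archive://my_image.tar
```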
---
The training script referenced in the quick start (presumably `example.py`):
```python
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor

# Download training data from open datasets.
training_data = datasets.FashionMNIST(
    root="data",
    train=True,
    download=True,
    transform=ToTensor(),
)

batch_size = 64

# Create data loaders.
train_dataloader = DataLoader(training_data, batch_size=batch_size)

# Define model
class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28 * 28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10),
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

model = NeuralNetwork()

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 100 == 0:
            loss, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")

epochs = 5
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dataloader, model, loss_fn, optimizer)
print("Done!")
```
---
## Build
```shell
apptainer build --fakeroot nvidia_pytorch.sif nvidia_pytorch.def
apptainer build --build-arg baseimage=$IMAGE_SIF pytorch_accelerate.sif pytorch_accelerate.def
```
## Run
```shell
apptainer exec --nv nvidia_pytorch.sif python -c "import torch; print(torch.cuda.device_count())"
apptainer exec --nv pytorch_accelerate.sif python -c "import torch; print(torch.cuda.device_count())"
```
## Examples
---
`nvidia_pytorch.def`:
```
BootStrap: docker
From: nvcr.io/nvidia/pytorch:23.09-py3
```

`pytorch_accelerate.def`:
```
BootStrap: localimage
From: {{ baseimage }}

%post
    pip install accelerate

%environment
...
```
---
```python
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output


def train(args, model, device, train_loader, optimizer, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
            if args.dry_run:
                break


def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))


def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=14, metavar='N',
                        help='number of epochs to train (default: 14)')
    parser.add_argument('--lr', type=float, default=1.0, metavar='LR',
                        help='learning rate (default: 1.0)')
    parser.add_argument('--gamma', type=float, default=0.7, metavar='M',
                        help='Learning rate step gamma (default: 0.7)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--no-mps', action='store_true', default=False,
                        help='disables macOS GPU training')
    parser.add_argument('--dry-run', action='store_true', default=False,
                        help='quickly check a single pass')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    args = parser.parse_args()

    use_cuda = not args.no_cuda and torch.cuda.is_available()
    use_mps = not args.no_mps and torch.backends.mps.is_available()

    torch.manual_seed(args.seed)

    if use_cuda:
        device = torch.device("cuda")
    elif use_mps:
        device = torch.device("mps")
    else:
        device = torch.device("cpu")

    train_kwargs = {'batch_size': args.batch_size}
    test_kwargs = {'batch_size': args.test_batch_size}
    if use_cuda:
        cuda_kwargs = {'num_workers': 1,
                       'pin_memory': True,
                       'shuffle': True}
        train_kwargs.update(cuda_kwargs)
        test_kwargs.update(cuda_kwargs)

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])
    dataset1 = datasets.MNIST('data', train=True, download=True,
                              transform=transform)
    dataset2 = datasets.MNIST('data', train=False,
                              transform=transform)
    train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs)
    test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)

    model = Net().to(device)
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)

    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        test(model, device, test_loader)
        scheduler.step()

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")


if __name__ == '__main__':
    main()
```