Example 3: Multiple Models in a Vollo Program
Vollo supports putting multiple models on a single accelerator.
Multiple NNIRs can be compiled into a single program:
import torch
import torch.nn as nn
import torch.nn.functional as F
import vollo_torch
class MLP(nn.Module):
def __init__(self, input_size, output_size, hidden_size):
super().__init__()
self.fc1 = nn.Linear(input_size, hidden_size)
self.fc2 = nn.Linear(hidden_size, hidden_size)
self.out = nn.Linear(hidden_size, output_size)
def forward(self, x):
x = F.relu(self.fc1(x))
residual = x
x = F.relu(self.fc2(x)) + residual
return self.out(x)
# Instantiate an MLP
input_size = 784
output_size = 10
hidden_size = 128
mlp_model = MLP(input_size, output_size, hidden_size)
mlp_input = torch.randn(input_size)
(mlp_model, mlp_expected_output) = vollo_torch.fx.prepare_shape(mlp_model, mlp_input)
mlp_nnir = vollo_torch.fx.nnir.to_nnir(mlp_model)
class CNN(nn.Module):
def __init__(self, in_channels, out_channels, hidden_channels, kernel_size=3):
super().__init__()
self.conv1 = vollo_torch.nn.PaddedConv1d(in_channels, hidden_channels, kernel_size)
self.conv2 = vollo_torch.nn.PaddedConv1d(hidden_channels, out_channels, kernel_size)
def forward(self, x):
x = F.relu(self.conv1(x))
x = F.relu(self.conv2(x))
return x
# Instantiate a CNN
in_channels = 32
out_channels = 1
hidden_channels = 128
cnn_model = CNN(in_channels, out_channels, hidden_channels)
batch_size = 1
sequence_length = 5
cnn_input = torch.randn(batch_size, in_channels, sequence_length)
(cnn_model, cnn_expected_output) = vollo_torch.fx.prepare_shape(cnn_model, cnn_input)
cnn_nnir = vollo_torch.fx.nnir.to_nnir(cnn_model)
(cnn_nnir, output_axis) = cnn_nnir.streaming_transform(2)
# Compile the multi-model program
import vollo_compiler
program_builder = vollo_compiler.ProgramBuilder(vollo_compiler.Config.ia_420f_c6b32())
program_builder.add_nnir(mlp_nnir)
program_builder.add_nnir(cnn_nnir)
multi_model_program = program_builder.to_program()
The vollo_compiler.ProgramBuilder
allows you to create a multi-model program. Building a multi-model program may give an allocation error if
the models can't fit on the given Config
. Generally each individual model will only have a small latency overhead compared to running it as an individual program. This overhead comes from selecting which model to run.
A model_index
can be provided when running inferences on the accelerator or on the VM. The models appear in the order in which they were added to the ProgramBuilder
. For example on the VM:
vm = multi_model_program.to_vm()
mlp_vm_output = vm.run(mlp_input.detach().numpy(), model_index = 0)
torch.testing.assert_close(mlp_expected_output, torch.from_numpy(mlp_vm_output))
cnn_vm_outputs = []
for i in range(5):
cnn_vm_outputs.append(vm.run(cnn_input[:, :, i].detach().numpy(), model_index = 1))
torch.testing.assert_close(
cnn_expected_output,
torch.stack(
[torch.from_numpy(output) for output in cnn_vm_outputs],
axis=output_axis,
),
)