Quantized model C++ export#
[ ]:
%pip install requests numpy ipywidgets ipycanvas
[ ]:
# Aidge modules imports
import aidge_core
import aidge_backend_cpu
import aidge_onnx
import aidge_export_cpp
import aidge_quantization
from aidge_export_cpp.export_utils import (
    cpp_fuse_to_metaops,
    set_nodes_names,
    set_nodes_datatypes,
    exclude_unwanted_producers)
from aidge_core.export_utils import remove_optional_inputs
# For the MNIST dataset
from torchvision import transforms, datasets
import numpy as np
[ ]:
# Global variables
USE_CUDA = False
if USE_CUDA:
    import aidge_backend_cuda
BACKEND = "cuda" if USE_CUDA else "cpu"
MODEL_NAME = "lenet"
EXPORT_FOLDER = f"{MODEL_NAME}_export_int8"

## PTQ Variables
NB_TEST = 10                                # Number of samples used to measure accuracy
NB_CALIB = 20                               # Number of samples used for calibration
NB_BITS = 8                                 # Quantization bit width
TARGET_TYPE = aidge_core.dtype.int32        # Data type of the quantized tensors
OPTIM_SIGN = False                          # Whether to optimize the signedness of the quantized ranges
CLIPPING = aidge_quantization.Clipping.MSE  # Clipping strategy ('MAX' is the simpler alternative)
SINGLE_SHIFT = True                         # Implement rescaling as a single bit shift
[ ]:
def propagate(model, scheduler, tensor):
    """
    Propagate the given tensor through the model and return the
    output as a numpy array.
    """
    # Run the inference
    scheduler.forward(True, [tensor])
    # Gather the results
    output_node = model.get_output_nodes().pop()
    output_tensor = output_node.get_operator().get_output(0).clone()
    output_tensor.set_backend("cpu")
    return np.array(output_tensor)
Download the model#
[ ]:
file_url = "https://huggingface.co/EclipseAidge/LeNet/resolve/main/lenet_mnist.onnx?download=true"
file_path = MODEL_NAME + "_mnist.onnx"
aidge_core.utils.download_file(file_path, file_url)
Create the dataset used to quantize the model#
[ ]:
transform = transforms.ToTensor()
test_set = datasets.MNIST(root='./data', train=False, transform=transform, download=True)
tensors = []
labels = []
index = 0
for in_tensor, label in test_set:
    array = np.array(in_tensor)
    array = np.reshape(array, (1, 1, 28, 28))
    tensor = aidge_core.Tensor(array)
    tensor.set_backend(BACKEND)
    tensor.set_datatype(aidge_core.dtype.float32)
    tensors.append(tensor)
    labels.append(label)
    index += 1
    if index == max(NB_TEST, NB_CALIB):
        break
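As a quick sanity check (a sketch assuming the usual aidge_core Tensor dims() accessor), you can confirm how many samples were kept and their shape:
[ ]:
# Sanity check: number of prepared samples and their dimensions
print(f"Loaded {len(tensors)} samples of dims {tensors[0].dims()}")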
Load the model in Aidge and manipulate it#
[ ]:
model = aidge_onnx.load_onnx(file_path, verbose=False)
aidge_core.remove_flatten(model)   # Flatten is implicit in Aidge's FC operator
aidge_core.fuse_batchnorm(model)   # Fold BatchNorm into the surrounding weights
aidge_core.expand_metaops(model)   # Expand meta-operators into their basic operators
model.set_datatype(aidge_core.dtype.float32)
model.set_backend(BACKEND)
model.save("imported_model")
Checking accuracy:
[ ]:
scheduler = aidge_core.SequentialScheduler(model)
print('\n EXAMPLE INFERENCES :')
nb_valid = 0
base_values = []
for i in range(NB_TEST):
    output_array = propagate(model, scheduler, tensors[i])
    print(labels[i], ' VS ', np.argmax(output_array), ' -> ', np.max(output_array))
    base_values.append(np.max(output_array))
    if labels[i] == np.argmax(output_array):
        nb_valid += 1
accuracy = nb_valid / NB_TEST
print('\n MODEL ACCURACY = ', accuracy * 100, '%')
Quantize the model to int8#
[ ]:
aidge_quantization.quantize_network(
    network = model,
    nb_bits = NB_BITS,
    calibration_set = tensors[0:NB_CALIB],
    clipping_mode = CLIPPING,
    target_type = TARGET_TYPE,
    no_quant = False,
    optimize_signs = OPTIM_SIGN,
    single_shift = SINGLE_SHIFT,
    use_cuda = USE_CUDA,
    fold_graph = True)
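Optionally, the quantized graph can be saved for inspection, mirroring the save call used on the imported model above:
[ ]:
# Optional: dump the quantized graph, as done for "imported_model"
model.save("quantized_model")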
Once quantization is done, the graph only accepts integer inputs, so we need to rescale the dataset to the quantized input range (here [0, 127], since the inputs are non-negative and NB_BITS = 8). The tensors must also be cast to the same data type as TARGET_TYPE.
[ ]:
rescaling = 2**(NB_BITS-1) - 1
for i in range(NB_TEST):
    tensors[i].set_backend("cpu")
    array = np.array(tensors[i]) * rescaling
    array = np.round(array).astype(int)
    tensors[i] = aidge_core.Tensor(array)
    tensors[i].set_datatype(TARGET_TYPE)
    tensors[i].set_backend("cpu")

# Setting model to CPU for export
model.set_backend("cpu")
Each time the graph is changed, the scheduler has to be reset. Here, some Quantizer and Cast nodes have been added.
[ ]:
# get_ordered_inputs() returns (node, input index) pairs
input_node, _ = model.get_ordered_inputs()[0]
input_node.get_operator().set_input(0, tensors[0])
scheduler.reset_scheduling()
And now we can assess the new performance after quantization:
[ ]:
print('\n QUANTIZED EXAMPLE INFERENCES:')
nb_valid = 0
post_values = []
for i in range(NB_TEST):
    output_array = propagate(model, scheduler, tensors[i])
    print(labels[i], ' VS ', np.argmax(output_array), ' -> ', np.max(output_array))
    post_values.append(np.max(output_array))
    if labels[i] == np.argmax(output_array):
        nb_valid += 1
quant_accuracy = nb_valid / NB_TEST
print('\n BASE MODEL ACCURACY = ', accuracy * 100, '%')
print('\n QUANTIZED ACCURACY = ', quant_accuracy * 100, '%')
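The base_values and post_values collected above can also be compared side by side, to see how the raw top-output magnitudes change with quantization (the scales differ, since the quantized outputs are integers):
[ ]:
# Compare the top output value of each sample before and after quantization
for i, (b, p) in enumerate(zip(base_values, post_values)):
    print(f"sample {i}: float {b:.4f} -> int {p}")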
[ ]:
exclude_unwanted_producers(model)
# Fuse nodes
cpp_fuse_to_metaops(model)
remove_optional_inputs(model)
scheduler.reset_scheduling()
scheduler.generate_scheduling() # Scheduler needs to be generated as it has just been reset
set_nodes_names(scheduler)
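To check the result of the fusion, one quick option (a sketch assuming the usual aidge_core node accessors) is to list the nodes of the graph with their types:
[ ]:
# Optional: inspect the fused graph, node by node
for node in model.get_nodes():
    print(node.name(), "->", node.type())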
[ ]:
output_array = propagate(model, scheduler, tensors[0])
print("### Exported Sample ###")
print("Aidge prediction :", np.argmax(output_array), "(" + str(np.max(output_array)) + ")")
print("Label :", labels[0])
set_nodes_datatypes(model)
[ ]:
aidge_export_cpp.export(EXPORT_FOLDER,
                        model,
                        scheduler,
                        labels = aidge_core.Tensor(labels[0]),
                        dev_mode = False,
                        aidge_cmp = False)
[ ]:
from subprocess import CalledProcessError

print("\n### Compiling the export ###")
try:
    for std_line in aidge_core.utils.run_command(["make"], cwd=EXPORT_FOLDER):
        print(std_line, end="")
except CalledProcessError as e:
    raise RuntimeError("An error occurred, failed to build the export.") from e

print("\n### Running the export ###")
try:
    for std_line in aidge_core.utils.run_command(["./bin/run_export"], cwd=EXPORT_FOLDER):
        print(std_line, end="")
except CalledProcessError as e:
    raise RuntimeError("An error occurred, failed to run the export.") from e