Commit

code split into files
fanis-khafizov committed Jan 23, 2025
1 parent 81efc6d commit 8a7d670
Showing 6 changed files with 695 additions and 0 deletions.
296 changes: 296 additions & 0 deletions code/ResNet/compressors.py
@@ -0,0 +1,296 @@
import torch
from descent import gradient_descent, mirror_descent

class TopK:
"""
A class used to compress gradients by selecting the top-k values.
Attributes
----------
    k : float
        The fraction of gradient entries to keep (selected by absolute value).
Methods
-------
compress(name, param)
Compresses the gradient of the given parameter by selecting the top-k values.
"""
def __init__(self, k):
"""
        Initializes the compressor.
        Args:
            k (float): The fraction of gradient entries to keep.
"""
self.k = k

def update(self, *args, **kwargs):
"""
Placeholder for the update method.
"""
pass

def compress(self, name, param):
"""
Compresses the gradient tensor by retaining only the top-k absolute values.
Args:
name (str): The name of the parameter (not used in the current implementation).
param (torch.nn.Parameter): The parameter whose gradient tensor is to be compressed.
Returns:
torch.Tensor: The compressed gradient tensor with only the top-k absolute values retained.
"""
k = int(self.k * param.numel())
tensor = param.grad.view(-1) # Flatten the tensor to a vector
topk_values, topk_indices = tensor.abs().topk(k)
mask = torch.zeros_like(tensor, dtype=torch.bool)
mask.scatter_(0, topk_indices, True)
compressed_tensor = tensor * mask
compressed_tensor = compressed_tensor.view(param.grad.size()) # Reshape back to original size
return compressed_tensor


class RandK:
"""
A class used to represent a Random K Compressor.
Attributes
----------
k : float
The fraction of elements to keep in the tensor during compression.
Methods
-------
compress(name, param)
Compresses the gradient of the given parameter by randomly keeping a fraction of elements.
"""
def __init__(self, k):
"""
        Initializes the compressor.
        Args:
            k (float): The fraction of gradient entries to keep.
"""
self.k = k

def update(self, *args, **kwargs):
"""
Placeholder for the update method.
"""
pass

def compress(self, name, param):
"""
Compresses the gradient tensor by randomly masking elements.
Args:
name (str): The name of the parameter (not used in the current implementation).
param (torch.nn.Parameter): The parameter whose gradient tensor will be compressed.
Returns:
torch.Tensor: The compressed gradient tensor with a fraction of elements randomly masked.
"""
k = int(self.k * param.numel())
tensor = param.grad
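        # randperm ranks every position uniquely, so comparing against k keeps exactly k random entries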
        mask = torch.randperm(tensor.numel(), device=tensor.device) < k
mask = mask.view(tensor.size())
compressed_tensor = tensor * mask
return compressed_tensor


class ImpK_b:
"""
A class used to perform importance-based compression on model parameters.
Attributes
----------
model : torch.nn.Module
The neural network model whose parameters are to be compressed.
k : float
The fraction of parameters to retain after compression.
w : dict
A dictionary containing the importance weights for each parameter in the model.
mode : int, optional
The mode of compression (default is 0).
Methods
-------
update(X_train, y_train, criterion, lr, eta, num_steps)
Updates the importance weights using mirror descent.
compress(name, param)
Compresses the given parameter tensor based on the importance weights.
"""
def __init__(self, model, k, mode=0):
"""
Initializes the compressor with the given model, compression factor, and mode.
Args:
model (torch.nn.Module): The neural network model to be compressed.
            k (float): The fraction of gradient entries to keep.
mode (int, optional): The mode of compression. Defaults to 0.
"""
self.model = model
self.k = k
        self.w = {name: torch.ones_like(param) / param.numel()
                  for name, param in model.named_parameters()}
self.mode = mode

def update(self, X_train, y_train, criterion, lr, eta, num_steps):
"""
        Update the importance weights for each parameter using mirror descent on the probability simplex.
Parameters:
-----------
X_train : torch.Tensor
The input training data.
y_train : torch.Tensor
The target training labels.
criterion : torch.nn.Module
The loss function used to evaluate the model.
lr : float
The learning rate for the optimization.
eta : float
The step size parameter for mirror descent.
num_steps : int
The number of steps to perform in the mirror descent optimization.
Returns:
--------
None
"""
for name, param in self.model.named_parameters():
self.w[name] = mirror_descent(
model=self.model,
param_name=name,
impact=self.w[name],
lr=lr,
eta=eta,
lambda_value=0.1,
num_steps=num_steps,
X_train=X_train,
y_train=y_train,
criterion=criterion
)

def compress(self, name, param):
"""
Compresses the gradient tensor of a parameter based on the specified mode and weight.
Args:
name (str): The name of the parameter.
param (torch.nn.Parameter): The parameter whose gradient tensor is to be compressed.
Returns:
torch.Tensor: The compressed gradient tensor.
Notes:
- If mode is 0, the compression is based on the top-k elements of the weight tensor.
- If mode is 1, the compression is based on the top-k elements of the element-wise product of the gradient tensor and the weight tensor.
"""
k = int(self.k * param.numel())
if self.mode == 0:
tensor = param.grad
topk_indices = torch.argsort(self.w[name].flatten(), descending=True)[:k]
elif self.mode == 1:
tensor = param.grad * self.w[name] * (param.numel() / self.w[name].sum())
topk_indices = torch.argsort(tensor.abs().flatten(), descending=True)[:k]

mask = torch.zeros_like(tensor.flatten(), dtype=torch.bool)
mask[topk_indices] = True
mask = mask.view(tensor.size())

# Apply mask to tensor
compressed_tensor = tensor * mask
return compressed_tensor

class ImpK_c:
"""
A class used to perform importance-based compression on model parameters.
Attributes
----------
model : torch.nn.Module
The neural network model whose parameters are to be compressed.
k : float
The fraction of parameters to retain after compression.
w : dict
A dictionary containing the importance weights for each parameter in the model.
mode : int, optional
The mode of compression (default is 0).
Methods
-------
update(X_train, y_train, criterion, lr, eta, num_steps)
Updates the importance weights using gradient descent.
compress(name, param)
Compresses the given parameter tensor based on the importance weights.
"""
def __init__(self, model, k, mode=0):
"""
Initializes the compressor with the given model, compression factor, and mode.
Args:
model (torch.nn.Module): The neural network model to be compressed.
            k (float): The fraction of gradient entries to keep.
mode (int, optional): The mode of compression. Defaults to 0.
"""
self.model = model
self.k = k
        self.w = {name: torch.full_like(param, 0.5)
                  for name, param in model.named_parameters()}
self.mode = mode

def update(self, X_train, y_train, criterion, lr, eta, num_steps):
"""
        Update the importance weights for each parameter using projected gradient descent on the unit cube.
Parameters:
-----------
X_train : torch.Tensor
The input training data.
y_train : torch.Tensor
The target training labels.
criterion : torch.nn.Module
The loss function used to evaluate the model.
lr : float
The learning rate for the optimization.
eta : float
The step size parameter for gradient descent.
num_steps : int
The number of steps to perform in the gradient descent optimization.
Returns:
--------
None
"""
for name, param in self.model.named_parameters():
self.w[name] = gradient_descent(
model=self.model,
param_name=name,
impact=self.w[name],
lr=lr,
eta=eta,
num_steps=num_steps,
X_train=X_train,
y_train=y_train,
criterion=criterion
)

def compress(self, name, param):
"""
Compresses the gradient tensor of a parameter based on the specified mode and weight.
Args:
name (str): The name of the parameter.
param (torch.nn.Parameter): The parameter whose gradient tensor is to be compressed.
Returns:
torch.Tensor: The compressed gradient tensor.
Notes:
- If mode is 0, the compression is based on the top-k elements of the weight tensor.
- If mode is 1, the compression is based on the top-k elements of the element-wise product of the gradient tensor and the weight tensor.
"""
k = int(self.k * param.numel())
if self.mode == 0:
tensor = param.grad
topk_indices = torch.argsort(self.w[name].flatten(), descending=True)[:k]
elif self.mode == 1:
tensor = param.grad * self.w[name]
topk_indices = torch.argsort(tensor.abs().flatten(), descending=True)[:k]

mask = torch.zeros_like(tensor.flatten(), dtype=torch.bool)
mask[topk_indices] = True
mask = mask.view(tensor.size())

# Apply mask to tensor
compressed_tensor = tensor * mask
return compressed_tensor
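
As a quick illustration of how these compressors might be wired into a training step (a minimal sketch: the toy model, data, and the in-place gradient overwrite below are my assumptions, not code from this commit):

import torch
import torch.nn as nn

from compressors import TopK  # assumes compressors.py is on the import path

# Toy setup, purely illustrative.
model = nn.Linear(10, 2)
criterion = nn.CrossEntropyLoss()
X = torch.randn(32, 10)
y = torch.randint(0, 2, (32,))

compressor = TopK(k=0.1)  # keep the top 10% of gradient entries by magnitude

loss = criterion(model(X), y)
loss.backward()

# Overwrite each gradient with its sparsified version before an optimizer step.
with torch.no_grad():
    for name, param in model.named_parameters():
        if param.grad is not None:
            param.grad.copy_(compressor.compress(name, param))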
65 changes: 65 additions & 0 deletions code/ResNet/descent.py
@@ -0,0 +1,65 @@
import torch
from torch.func import functional_call

def mirror_descent(model, X_train, y_train, param_name, impact: torch.Tensor, lr, eta, lambda_value, num_steps, criterion):
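    """Tune the per-entry impact weights for `param_name` via entropic mirror descent on the probability simplex."""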
impact = impact.clone().detach().requires_grad_(True)
original_param = dict(model.named_parameters())[param_name]

outputs = model(X_train)
loss = criterion(outputs, y_train)
param_grad = torch.autograd.grad(loss, original_param, create_graph=True)[0]
new_params = {name: param.clone() for name, param in model.named_parameters()}

for _ in range(num_steps):
# Update parameter using impact
param_new = original_param - lr * impact * param_grad
# Create new parameter dictionary
new_params[param_name] = param_new
# Compute outputs with new parameters
outputs_new = functional_call(model, new_params, (X_train,))
# Compute new loss
loss_new = criterion(outputs_new, y_train)

# Compute gradient of new loss w.r.t. impact
grad_impact = torch.autograd.grad(loss_new, impact)[0]

with torch.no_grad():
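            # Entropic mirror-descent step (damped by 1/(1 + eta*lambda)), then renormalize onto the simplex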
impact_update = torch.pow(impact, 1/(1+eta*lambda_value)) * torch.exp(-(eta/(1+eta*lambda_value)) * (grad_impact))
impact = impact_update / impact_update.sum()

# Ensure impact requires grad for the next iteration
impact.requires_grad_(True)

return impact.detach()


def gradient_descent(model, X_train, y_train, param_name, impact: torch.Tensor, lr, eta, num_steps, criterion):
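    """Tune the per-entry impact weights for `param_name` via projected gradient descent on the unit cube [0, 1]."""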
impact = impact.clone().detach().requires_grad_(True)
original_param = dict(model.named_parameters())[param_name]

outputs = model(X_train)
loss = criterion(outputs, y_train)
param_grad = torch.autograd.grad(loss, original_param, create_graph=True)[0]
new_params = {name: param.clone() for name, param in model.named_parameters()}

for _ in range(num_steps):
# Update parameter using impact
param_new = original_param - lr * impact * param_grad
# Create new parameter dictionary
new_params[param_name] = param_new
# Compute outputs with new parameters
outputs_new = functional_call(model, new_params, (X_train,))
# Compute new loss
loss_new = criterion(outputs_new, y_train)

# Compute gradient of new loss w.r.t. impact
grad_impact = torch.autograd.grad(loss_new, impact)[0]

with torch.no_grad():
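            # Plain gradient step on the impact weights, then project onto [0, 1] by clipping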
impact -= eta * lr * grad_impact
impact = torch.clip(impact, 0, 1)

# Ensure impact requires grad for the next iteration
impact.requires_grad_(True)

return impact.detach()
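
For reference, in my own notation (not part of the commit), writing $w$ for `impact`, $g$ for `grad_impact`, $\eta$ for `eta`, and $\lambda$ for `lambda_value`, mirror_descent performs a damped entropic mirror step followed by renormalization onto the simplex,

$$w_i \leftarrow \frac{\tilde w_i}{\sum_j \tilde w_j}, \qquad \tilde w_i = w_i^{\,1/(1+\eta\lambda)} \exp\!\left(-\frac{\eta}{1+\eta\lambda}\, g_i\right),$$

while gradient_descent takes a plain gradient step on the impact weights and clips back onto the unit cube,

$$w \leftarrow \operatorname{clip}\bigl(w - \eta\,\mathrm{lr}\, g,\; 0,\; 1\bigr).$$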