- """
- Utility functions for torch.
- """
-
- import torch
- from torch import nn, optim
- from torch.optim.optimizer import Optimizer
-
### custom optimizer
class MyAdagrad(Optimizer):
    """My modification of the Adagrad optimizer that allows specifying an
    initial accumulator value. This mimics the behavior of the default Adagrad
    implementation in TensorFlow. The default PyTorch Adagrad uses 0 as the
    initial accumulator value.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-2)
        lr_decay (float, optional): learning rate decay (default: 0)
        init_accu_value (float, optional): initial accumulator value (default: 0.1)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
    """

    def __init__(self, params, lr=1e-2, lr_decay=0, init_accu_value=0.1,
                 weight_decay=0):
        defaults = dict(lr=lr, lr_decay=lr_decay, init_accu_value=init_accu_value,
                        weight_decay=weight_decay)
        super(MyAdagrad, self).__init__(params, defaults)

        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'] = 0
                # initialize the accumulator with a constant value, as in
                # TensorFlow's Adagrad
                state['sum'] = torch.full_like(p.data, init_accu_value)

    def share_memory(self):
        # move the accumulators into shared memory, e.g. for Hogwild-style
        # multiprocess training
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['sum'].share_memory_()

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue

                grad = p.grad.data
                state = self.state[p]

                state['step'] += 1

                if group['weight_decay'] != 0:
                    if p.grad.data.is_sparse:
                        raise RuntimeError("weight_decay option is not compatible with sparse gradients")
                    grad = grad.add(p.data, alpha=group['weight_decay'])

                # decayed learning rate, as in the reference Adagrad implementation
                clr = group['lr'] / (1 + (state['step'] - 1) * group['lr_decay'])

                if p.grad.data.is_sparse:
                    grad = grad.coalesce()  # the update is non-linear so indices must be unique
                    grad_indices = grad._indices()
                    grad_values = grad._values()
                    size = grad.size()

                    def make_sparse(values):
                        if grad_indices.numel() == 0 or values.numel() == 0:
                            return torch.empty_like(grad)
                        return torch.sparse_coo_tensor(grad_indices, values, size)

                    state['sum'].add_(make_sparse(grad_values.pow(2)))
                    std = state['sum'].sparse_mask(grad)
                    std_values = std._values().sqrt_().add_(1e-10)
                    p.data.add_(make_sparse(grad_values / std_values), alpha=-clr)
                else:
                    state['sum'].addcmul_(grad, grad, value=1)
                    std = state['sum'].sqrt().add_(1e-10)
                    p.data.addcdiv_(grad, std, value=-clr)

        return loss

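# A minimal sketch exercising MyAdagrad on a toy least-squares problem; the
# data and hyperparameters here are made up for illustration.
def _demo_myadagrad():
    torch.manual_seed(0)
    w = nn.Parameter(torch.randn(3))
    target = torch.tensor([1.0, -2.0, 0.5])
    optimizer = MyAdagrad([w], lr=0.1, init_accu_value=0.1)
    for _ in range(200):
        optimizer.zero_grad()
        loss = ((w - target) ** 2).sum()
        loss.backward()
        optimizer.step()
    return w.detach()  # should be close to target
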
### torch specific functions
def get_optimizer(name, parameters, lr, l2=0):
    if name == 'sgd':
        return torch.optim.SGD(parameters, lr=lr, weight_decay=l2)
    elif name in ['adagrad', 'myadagrad']:
        # use my own adagrad to allow for init accumulator value
        return MyAdagrad(parameters, lr=lr, init_accu_value=0.1, weight_decay=l2)
    elif name == 'adam':
        return torch.optim.Adam(parameters, weight_decay=l2)  # use default lr
    elif name == 'adamax':
        return torch.optim.Adamax(parameters, weight_decay=l2)  # use default lr
    elif name == 'adadelta':
        return torch.optim.Adadelta(parameters, lr=lr, weight_decay=l2)
    else:
        raise ValueError("Unsupported optimizer: {}".format(name))


def change_lr(optimizer, new_lr):
    for param_group in optimizer.param_groups:
        param_group['lr'] = new_lr

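# Usage sketch for the two helpers above; the Linear model is a placeholder.
def _demo_optimizer_helpers():
    model = nn.Linear(8, 2)
    optimizer = get_optimizer('sgd', model.parameters(), lr=0.1, l2=1e-5)
    change_lr(optimizer, 0.01)
    assert all(g['lr'] == 0.01 for g in optimizer.param_groups)
    return optimizer
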
def flatten_indices(seq_lens, width):
    """Map per-sequence lengths to flat indices in a (batch x width) layout."""
    flat = []
    for i, l in enumerate(seq_lens):
        for j in range(l):
            flat.append(i * width + j)
    return flat

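# Illustrative check: for a padded batch of 2 sequences with max width 4 and
# lengths [2, 3], the non-pad positions in the flattened (2 x 4) layout are
# [0, 1] and [4, 5, 6].
def _demo_flatten_indices():
    assert flatten_indices([2, 3], 4) == [0, 1, 4, 5, 6]
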
def set_cuda(var, cuda):
    if cuda:
        return var.cuda()
    return var


def keep_partial_grad(grad, topk):
    """
    Keep only the gradients of the first topk rows; zero out the rest in place.
    """
    assert topk < grad.size(0)
    grad.data[topk:].zero_()
    return grad

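# Sketch: zero out all but the first topk rows of an embedding gradient, e.g.
# to fine-tune only the most frequent vocabulary entries (assumption: rows are
# sorted by frequency).
def _demo_keep_partial_grad():
    emb = nn.Parameter(torch.randn(5, 3))
    emb.sum().backward()
    keep_partial_grad(emb.grad, 2)
    assert emb.grad[2:].abs().sum().item() == 0.0
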
### model IO
def save(model, optimizer, opt, filename):
    params = {
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'config': opt
    }
    try:
        torch.save(params, filename)
    except Exception:
        print("[ Warning: model saving failed. ]")


def load(model, optimizer, filename):
    try:
        dump = torch.load(filename)
    except Exception:
        # re-raise: without a loaded dump the code below would fail anyway
        print("[ Fail: model loading failed. ]")
        raise
    if model is not None:
        model.load_state_dict(dump['model'])
    if optimizer is not None:
        optimizer.load_state_dict(dump['optimizer'])
    opt = dump['config']
    return model, optimizer, opt


def load_config(filename):
    try:
        dump = torch.load(filename)
    except Exception:
        print("[ Fail: model loading failed. ]")
        raise
    return dump['config']
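

# Round-trip sketch for the model IO helpers above; 'checkpoint.pt' is a
# placeholder path and the Linear model is a stand-in.
def _demo_save_load():
    model = nn.Linear(4, 2)
    optimizer = get_optimizer('sgd', model.parameters(), lr=0.1)
    save(model, optimizer, {'lr': 0.1}, 'checkpoint.pt')
    model2 = nn.Linear(4, 2)
    optimizer2 = get_optimizer('sgd', model2.parameters(), lr=0.1)
    model2, optimizer2, opt = load(model2, optimizer2, 'checkpoint.pt')
    assert opt == load_config('checkpoint.pt')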