In [None]:
!gdown 18AzsP1DOBECBL3vjzUhokro40mtu4Ggn # vit_s_k710_dl_from_giant.pth

Downloading...
From: https://drive.google.com/uc?id=18AzsP1DOBECBL3vjzUhokro40mtu4Ggn
To: /content/vit_s_k710_dl_from_giant.pth
100% 44.3M/44.3M [00:00<00:00, 76.7MB/s]


In [None]:
!pip install decord deepspeed einops timm==0.4.12 tensorboardX mpi4py transformers



In [None]:
%cd /content
!git clone https://github.com/OpenGVLab/VideoMAEv2

/content
fatal: destination path 'VideoMAEv2' already exists and is not an empty directory.


In [None]:
import os  # needed here: the notebook's main import cell only runs after this one

# Map class name -> label string, read from classInd.txt
# (each non-empty line is "<label> <ClassName>").
class_to_label = {}

with open("/content/ucfTrainTestlist/classInd.txt", 'r') as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank/trailing lines instead of failing on unpack
        label, class_name = line.split()
        class_to_label[class_name] = label

modes = ['train', 'test']

# Write one "<absolute video path> <label>" line per listed video into
# /content/UCF101/train.csv (from trainlist*) and /content/UCF101/val.csv (from testlist*).
# NOTE(review): list01-03 look like alternative splits of the same videos;
# concatenating them probably duplicates entries and lets a video appear in
# both train.csv and val.csv - confirm this is intended.
for mode in modes:
    files = [
        f"/content/ucfTrainTestlist/{mode}list01.txt",
        f"/content/ucfTrainTestlist/{mode}list02.txt",
        f"/content/ucfTrainTestlist/{mode}list03.txt",
    ]

    output_file_path = f"/content/UCF101/{'val' if mode == 'test' else mode}.csv"

    with open(output_file_path, 'w') as outfile:
        for file_path in files:
            with open(file_path, 'r') as infile:
                for line in infile:
                    line_text = line.strip()
                    if not line_text:
                        continue  # skip empty lines rather than crash or emit a bogus row
                    if mode == 'train':
                        # Train lists already carry the label after the path.
                        video_path, label = line_text.split()
                    else:
                        # Test lists hold only the path; the class is its directory name.
                        video_path = line_text
                        class_name = os.path.dirname(line_text)
                        label = class_to_label[class_name]
                    outfile.write(f"/content/UCF101/{video_path} {label}\n")

In [None]:
%cd /content/VideoMAEv2
from dataset.datasets import VideoClsDataset

from models.modeling_finetune import VisionTransformer, _cfg
from utils import (
    load_state_dict,
)
from optim_factory import (
    LayerDecayValueAssigner,
    get_parameter_groups,
)
%cd /content/

/content/VideoMAEv2
/content


In [None]:
import os
import sys
import json
import warnings
import math
import argparse
import logging
import random
import gc
import tqdm

from collections import OrderedDict

import numpy as np
import pandas as pd
import deepspeed
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.backends.cudnn as cudnn
import torch.utils.checkpoint as cp
from torch.utils.data import Dataset
from torch.utils.data._utils.collate import default_collate
from torchvision import transforms
from timm.models import create_model
from timm.models.layers import trunc_normal_
from timm.models.registry import register_model
from timm.loss import SoftTargetCrossEntropy
from functools import partial
from datetime import datetime

from transformers import AutoTokenizer, CLIPModel

[2023-11-05 21:20:09,304] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [None]:
class EnhancedVisionTransformer(VisionTransformer):
    """VisionTransformer subclass that exposes the block outputs per time step.

    ``get_embeddings`` applies patch embedding, the positional embedding (when
    present), positional dropout and every transformer block. No further
    normalisation or classification head is applied in this method.
    """

    def get_embeddings(self, x):
        """Return the block outputs reshaped to ``(B, T // tubelet_size, -1)``.

        ``x`` is unpacked as a 5-D tensor ``(B, _, T, H, W)``.
        """
        batch, _, frames, _, _ = x.shape

        tokens = self.patch_embed(x)

        if self.pos_embed is not None:
            positions = self.pos_embed.expand(batch, -1, -1).type_as(tokens).to(tokens.device)
            tokens = tokens + positions.clone().detach()
        tokens = self.pos_drop(tokens)

        for block in self.blocks:
            # Optional activation checkpointing, controlled by ``with_cp``.
            tokens = cp.checkpoint(block, tokens) if self.with_cp else block(tokens)

        batch, num_patches, embed_dim = tokens.shape

        # Group tokens by temporal index, then flatten each group.
        # assumes the token sequence is ordered time-major - TODO confirm against patch_embed
        steps = frames // self.tubelet_size
        tokens = tokens.view(batch, steps, num_patches // steps, embed_dim)
        return tokens.reshape(batch, steps, -1)

@register_model
def vit_small_patch16_224(pretrained=False, **kwargs):
    """Build an ``EnhancedVisionTransformer`` with the small/patch-16 settings.

    ``pretrained`` is accepted but not used by this factory; any extra keyword
    arguments are forwarded to the model constructor.
    """
    small_config = dict(
        patch_size=16,
        embed_dim=384,
        depth=12,
        num_heads=6,
        mlp_ratio=4,
        qkv_bias=True,
        norm_layer=partial(nn.LayerNorm, eps=1e-6),
    )
    model = EnhancedVisionTransformer(**small_config, **kwargs)
    model.default_cfg = _cfg()
    return model

In [None]:
def add_space_before_uppercase(s):
    """Insert a space before every uppercase character except the first one.

    Example: ``"ApplyEyeMakeup"`` -> ``"Apply Eye Makeup"``.

    Args:
        s: Input string (e.g. a CamelCase class name).

    Returns:
        The spaced string. An empty input is returned unchanged (it previously
        raised ``IndexError`` because the first character was read directly).
    """
    return ''.join(
        ' ' + char if position > 0 and char.isupper() else char
        for position, char in enumerate(s)
    )

class EnhancedVideoClsDataset(VideoClsDataset):
    """VideoClsDataset whose items additionally carry a readable class name."""

    def __getitem__(self, index):
        """Return the parent's item tuple with a spaced class name appended.

        The class name is the second ``_``-separated field of the file stem of
        ``self.dataset_samples[index]`` (assumed UCF101-style names such as
        ``v_<ClassName>_gXX_cYY.avi`` - confirm for other datasets).
        """
        base_item = super().__getitem__(index)

        file_name = self.dataset_samples[index].split('/')[-1]
        stem = file_name.split('.')[0]
        class_token = stem.split('_')[1]

        # NOTE(review): the name is always derived from the requested `index`;
        # if the parent ever returns data for a different sample (e.g. a
        # fallback when a video cannot be read) the name would not match -
        # confirm against VideoClsDataset.
        return (*base_item, add_space_before_uppercase(class_token))

In [None]:
# Run configuration, collected in one Namespace that is passed to the dataset
# and model constructors below.
args = argparse.Namespace(
    # Model and checkpoint
    model='vit_small_patch16_224',
    finetune='/content/vit_s_k710_dl_from_giant.pth',
    tubelet_size=2,
    with_checkpoint=True,
    drop=0.0,
    attn_drop_rate=0.0,
    drop_path=0.35,

    # Data
    data_set='UCF101',
    nb_classes=101,
    data_path='/content/UCF101/',
    data_root='',
    input_size=224,
    short_side_size=224,
    num_frames=16,
    sampling_rate=10,
    num_sample=2,
    num_segments=1,

    # Loading
    batch_size=6,
    num_workers=2,
    pin_mem=True,

    # Optimisation
    opt='adamw',
    opt_eps=1e-8,
    opt_betas=[0.9, 0.999],
    lr=1e-3,
    min_lr=1e-6,
    clip_grad=None,  # 5.0
    layer_decay=0.92,  # 0.9
    weight_decay=0.06,  # 0.05
    epochs=5,
    start_epoch=0,

    # Augmentation
    aa='rand-m7-n4-mstd0.5-inc1',
    train_interpolation='bicubic',
    reprob=0.25,
    remode='pixel',
    recount=1,
)

In [None]:
# Use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Seed the Python, NumPy and PyTorch random number generators with the same value.
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)

# Turn on cuDNN benchmark mode.
cudnn.benchmark = True

In [None]:
def multiple_samples_collate(batch):
    """Collate items whose first three fields are per-item lists.

    Each item is a 5-tuple ``(inputs, labels, video_idx, extra_data,
    video_file_name)``. The first three fields are iterables that get
    concatenated across the batch before ``default_collate``; the last two
    are collated one entry per item.

    Returns:
        Tuple ``(inputs, labels, video_idx, extra_data, video_file_names)``.
    """
    input_groups, label_groups, index_groups, extra_data, video_file_names = zip(*batch)

    def _flatten(groups):
        # Concatenate the per-item lists in batch order.
        flat = []
        for group in groups:
            flat.extend(group)
        return flat

    return (
        default_collate(_flatten(input_groups)),
        default_collate(_flatten(label_groups)),
        default_collate(_flatten(index_groups)),
        default_collate(extra_data),
        default_collate(video_file_names),
    )

In [None]:
# Training dataset built from <data_path>/train.csv; clip geometry and
# sampling settings come from `args`.
train_dataset = EnhancedVideoClsDataset(
    mode='train',
    anno_path=os.path.join(args.data_path, 'train.csv'),
    data_root=args.data_root,
    clip_len=args.num_frames,
    frame_sample_rate=args.sampling_rate,
    num_segment=1,
    crop_size=args.input_size,
    short_side_size=args.short_side_size,
    args=args,
)

# Loader settings: shuffled, incomplete last batch dropped, workers kept alive
# between iterations, and the custom collate that flattens per-item lists.
train_loader_options = dict(
    batch_size=args.batch_size,
    shuffle=True,
    drop_last=True,
    num_workers=args.num_workers,
    persistent_workers=True,
    pin_memory=args.pin_mem,
    collate_fn=partial(multiple_samples_collate),
)
data_loader_train = torch.utils.data.DataLoader(train_dataset, **train_loader_options)

In [None]:
class FrameScorePredictor(nn.Module):
    """Backbone plus a linear head that emits one score per temporal step.

    The backbone is created through ``create_model_fn`` and must expose
    ``embed_dim`` and ``get_embeddings(x)``. Its embeddings are flattened to
    ``(batch, base_embedding_dim * embedding_frame_number)`` and mapped by a
    single linear layer to ``embedding_frame_number`` scores.

    Args:
        create_model_fn: Factory called as ``create_model_fn(args.model, **options)``.
        args: Namespace providing ``model``, ``input_size``, ``num_frames``,
            ``num_segments``, ``tubelet_size``, ``drop``, ``drop_path``,
            ``attn_drop_rate`` and ``with_checkpoint``. An optional
            ``patch_size`` attribute overrides the default of 16.
    """

    def __init__(self, create_model_fn, args):
        super().__init__()

        self.base_model = create_model_fn(
            args.model,
            img_size=args.input_size,
            pretrained=False,
            all_frames=args.num_frames * args.num_segments,
            tubelet_size=args.tubelet_size,
            drop_rate=args.drop,
            drop_path_rate=args.drop_path,
            attn_drop_rate=args.attn_drop_rate,
            drop_block_rate=None,
            with_cp=args.with_checkpoint,
        )

        # Spatial tokens per time step: (input_size // patch_size) ** 2.
        # This replaces the former hard-coded 14 ** 2, which only held for
        # 224-pixel inputs with 16-pixel patches. The default patch size of 16
        # matches the `vit_small_patch16_224` factory used in this notebook.
        patch_size = getattr(args, 'patch_size', 16)
        patches_per_side = args.input_size // patch_size
        self.base_embedding_dim = self.base_model.embed_dim * patches_per_side ** 2
        self.embedding_frame_number = args.num_frames // args.tubelet_size

        self.score_predictor = nn.Sequential(
            nn.Linear(self.base_embedding_dim * self.embedding_frame_number, self.embedding_frame_number)
        )

    def forward(self, x):
        """Return scores of shape ``(batch, embedding_frame_number)`` for ``x``."""
        embeddings = self.base_model.get_embeddings(x)

        # Flatten all temporal steps into a single feature vector per sample.
        embeddings = embeddings.reshape(x.shape[0], self.base_embedding_dim * self.embedding_frame_number)

        return self.score_predictor(embeddings)

    def load_base_model_state_dict(self, checkpoint_model):
        """Load ``checkpoint_model`` into the backbone via the imported ``load_state_dict`` helper."""
        load_state_dict(self.base_model, checkpoint_model)

In [None]:
# Build the score predictor around the backbone named in `args.model`.
model = FrameScorePredictor(create_model, args=args)

# NOTE(review): torch.load unpickles the file - only load checkpoints from a trusted source.
checkpoint = torch.load(args.finetune, map_location='cpu')

# The weights are stored under 'model' when that key exists, otherwise under 'module'.
state_key = 'model' if 'model' in checkpoint else 'module'
checkpoint_model = checkpoint[state_key]

model.load_base_model_state_dict(checkpoint_model)
model = model.to(device)

# Release the CPU copies of the checkpoint before training starts.
del checkpoint, checkpoint_model

gc.collect()

size mismatch for head.weight: copying a param with shape torch.Size([710, 384]) from checkpoint, the shape in current model is torch.Size([1000, 384]).
size mismatch for head.bias: copying a param with shape torch.Size([710]) from checkpoint, the shape in current model is torch.Size([1000]).


124

In [None]:
class PairwiseLoss(torch.nn.Module):
    """Pairwise ranking penalty between predicted scores and reference labels.

    A pair ``(i, j)`` is selected when ``labels[i] - labels[j] > margin``. For
    each selected pair the penalty is ``max(0, scores[j] - scores[i])``, i.e.
    the amount by which the lower-labelled element out-scores the
    higher-labelled one. The result is the mean penalty over selected pairs,
    or a zero sum when no pair is selected.

    The previous version clamped ``scores[i] - scores[j]`` instead, which
    penalised correctly ordered pairs and rewarded inverted ones.

    NOTE(review): pairs are formed along dim 0 (``unsqueeze(1) - unsqueeze(0)``),
    so for 2-D inputs they span the first dimension - confirm this is the
    intended axis before using it on batched frame scores.
    """

    def __init__(self, margin=0.2):
        super().__init__()
        self.margin = margin

    def forward(self, scores, labels):
        """Return the mean ordering violation over the selected pairs."""
        # pairwise_diff[i, j] = scores[i] - scores[j]
        pairwise_diff = scores.unsqueeze(1) - scores.unsqueeze(0)

        # Pairs where element i should rank above element j.
        positive_mask = (labels.unsqueeze(1) - labels.unsqueeze(0)) > self.margin

        # A violation occurs when scores[i] < scores[j] for such a pair.
        violations = torch.clamp(-pairwise_diff[positive_mask], min=0)
        loss = violations.sum()

        num_positive_pairs = positive_mask.sum()
        return loss / num_positive_pairs if num_positive_pairs > 0 else loss


In [None]:
# The tokenizer and the CLIP model are loaded from the same checkpoint id.
clip_checkpoint_name = "openai/clip-vit-base-patch32"
tokenizer = AutoTokenizer.from_pretrained(clip_checkpoint_name)
clip_model = CLIPModel.from_pretrained(clip_checkpoint_name)
clip_model = clip_model.to(device)

In [None]:
# Spaced class names in the iteration order of `class_to_label`.
all_classes = list(map(add_space_before_uppercase, class_to_label))

# Encode every class name once with the CLIP text encoder (no gradients needed).
all_classes_inputs = tokenizer(all_classes, padding=True, truncation=True, return_tensors="pt").to(device)
with torch.no_grad():
    text_embeddings = clip_model.get_text_features(**all_classes_inputs)

# Name -> embedding lookup, plus the embeddings stacked in the same order.
all_classes_clip_embedding = dict(zip(all_classes, text_embeddings))
all_classes_clip_embedding_list = torch.stack([*all_classes_clip_embedding.values()]).to(device)

In [None]:
def compute_accuracies(frame_features, target_index, class_embeddings=None):
    """Score one feature vector against class embeddings and report top-1/top-5 hits.

    Args:
        frame_features: 1-D feature vector for a single frame.
        target_index: Index of the correct class within ``class_embeddings``.
        class_embeddings: Matrix with one class embedding per row. Defaults to
            the notebook-level ``all_classes_clip_embedding_list``.

    Returns:
        ``(top1, top5)`` as 0/1 integers.

    NOTE(review): similarities are raw dot products; neither side is
    L2-normalised here - confirm that this is intended rather than cosine
    similarity.
    """
    if class_embeddings is None:
        class_embeddings = all_classes_clip_embedding_list

    similarities = torch.matmul(frame_features, class_embeddings.T)

    # topk raises when k exceeds the number of classes, so clamp it.
    top_k = min(5, similarities.shape[-1])
    top5_indices = torch.topk(similarities, top_k, largest=True).indices

    top1 = int(top5_indices[0] == target_index)
    top5 = int(target_index in top5_indices)

    return top1, top5

In [None]:
# Single training pass over `data_loader_train`.
#
# For every clip, CLIP image features of one frame per tubelet are compared
# with the CLIP text feature of the clip's class name; the frame with the
# highest similarity becomes the positive target and the model is trained with
# BCE-with-logits to score that frame highest. Frame-level accuracy is tracked
# for three frame-selection strategies.
#
# Alternative objectives (MSE on the raw similarities, margin ranking) were
# tried earlier; only the BCE objective is active.
# NOTE(review): this loop makes one pass only; `args.epochs` is not consulted here.
# NOTE(review): `samples` come from the video dataset pipeline; whether their
# normalisation matches what CLIP expects is not visible here - confirm.
embedding_frame_number = args.num_frames // args.tubelet_size
ranking_loss_scale = embedding_frame_number**3-embedding_frame_number

optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

gc.collect()
torch.cuda.empty_cache()

model.train()

# Running sums, reset every `print_step` steps (only the BCE sum is updated).
bce_loss_sum = 0.0
mse_loss_sum = 0.0
rank_loss_sum = 0.0
combined_loss_sum = 0.0
print_step = 10

strategies_items = ['random', 'best_pseudo', 'best_predicted']

acc1_sum = {strategy_name: 0 for strategy_name in strategies_items}
acc5_sum = {strategy_name: 0 for strategy_name in strategies_items}
acc_counter = {strategy_name: 0 for strategy_name in strategies_items}

for step, (samples, targets, _, _, classes) in enumerate(data_loader_train):
    optimizer.zero_grad()

    samples = samples.to(device)
    targets = targets.to(device)

    # CLIP only provides fixed pseudo-labels (they are consumed through argmax,
    # so no gradient can flow back); running it without autograd avoids
    # building and holding an unused graph.
    with torch.no_grad():
        inputs = tokenizer(classes, padding=True, return_tensors="pt").to(device)
        text_features = clip_model.get_text_features(**inputs)

        all_frames_features = []
        pseudo_labels = []

        for i in range(samples.shape[0]):
            # (C, T, H, W) -> (T, C, H, W), then keep the last frame of each
            # tubelet so there is one frame per predicted score
            # (identical to [1::2] when tubelet_size == 2).
            video_frames = samples[i].permute(1, 0, 2, 3)
            video_frames = video_frames[args.tubelet_size - 1::args.tubelet_size]
            frames_features = clip_model.get_image_features(pixel_values=video_frames)
            all_frames_features.append(frames_features)
            # `classes` holds one name per video while `samples` holds
            # `num_sample` clips per video, hence the integer division.
            similarity = torch.matmul(frames_features, text_features[i // args.num_sample])
            pseudo_labels.append(similarity)
        pseudo_labels = torch.stack(pseudo_labels).to(device)

    predicted_scores = model(samples)

    # One-hot target marking the frame with the highest pseudo-label.
    best_frame_targets = torch.zeros_like(pseudo_labels)
    best_frame_indices = pseudo_labels.argmax(dim=1)
    row_indices = torch.arange(pseudo_labels.size(0), device=pseudo_labels.device)
    best_frame_targets[row_indices, best_frame_indices] = 1
    bce_loss = F.binary_cross_entropy_with_logits(predicted_scores, best_frame_targets)

    bce_loss.backward()
    optimizer.step()

    bce_loss_sum += bce_loss.item()

    # Frame index chosen per clip by each strategy.
    strategies = {
        'random': torch.randint(low=0, high=embedding_frame_number, size=(samples.size(0),)).to(device),
        'best_pseudo': best_frame_indices,
        'best_predicted': predicted_scores.argmax(dim=1)
    }

    for strategy_name, frame_indices in strategies.items():
        for frames_features, frame_indice, target in zip(all_frames_features, frame_indices, targets):
            frame_features = frames_features[frame_indice]
            # Labels in the CSV are 1-based; class embeddings are 0-indexed.
            acc1, acc5 = compute_accuracies(frame_features, target - 1)

            acc1_sum[strategy_name] += acc1
            acc5_sum[strategy_name] += acc5
            acc_counter[strategy_name] += 1

    if (step + 1) % print_step == 0:
        print(
          f'Step {step + 1}/{len(data_loader_train)}\n'
          f'Average BCE Loss: {bce_loss_sum / print_step}\n'
        )

        for strategy_name in strategies_items:
          # The sums count hits, so scale to a percentage to match the "%" in the message.
          acc1_avg = 100.0 * acc1_sum[strategy_name] / acc_counter[strategy_name]
          acc5_avg = 100.0 * acc5_sum[strategy_name] / acc_counter[strategy_name]

          print(f"{strategy_name} Average Frame Acc@1: {acc1_avg:.2f}%, Acc@5: {acc5_avg:.2f}%")

        bce_loss_sum = 0.0

        acc1_sum = {strategy_name: 0 for strategy_name in strategies_items}
        acc5_sum = {strategy_name: 0 for strategy_name in strategies_items}
        acc_counter = {strategy_name: 0 for strategy_name in strategies_items}

        print('------------------')
        print()

    gc.collect()
    torch.cuda.empty_cache()