模型:
timm/vit_small_patch14_dinov2.lvd142m
一种视觉转换器(ViT)图像特征模型。使用自监督的DINOv2方法在LVD-142M上进行了预训练。
from urllib.request import urlopen from PIL import Image import timm img = Image.open(urlopen( 'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/beignets-task-guide.png' )) model = timm.create_model('vit_small_patch14_dinov2.lvd142m', pretrained=True) model = model.eval() # get model specific transforms (normalization, resize) data_config = timm.data.resolve_model_data_config(model) transforms = timm.data.create_transform(**data_config, is_training=False) output = model(transforms(img).unsqueeze(0)) # unsqueeze single image into batch of 1 top5_probabilities, top5_class_indices = torch.topk(output.softmax(dim=1) * 100, k=5)
from urllib.request import urlopen from PIL import Image import timm img = Image.open(urlopen( 'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/beignets-task-guide.png' )) model = timm.create_model( 'vit_small_patch14_dinov2.lvd142m', pretrained=True, num_classes=0, # remove classifier nn.Linear ) model = model.eval() # get model specific transforms (normalization, resize) data_config = timm.data.resolve_model_data_config(model) transforms = timm.data.create_transform(**data_config, is_training=False) output = model(transforms(img).unsqueeze(0)) # output is (batch_size, num_features) shaped tensor # or equivalently (without needing to set num_classes=0) output = model.forward_features(transforms(img).unsqueeze(0)) # output is unpooled, a (1, 1370, 384) shaped tensor output = model.forward_head(output, pre_logits=True) # output is a (1, num_features) shaped tensor
在timm上探索此模型的数据集和运行时指标: model results
@misc{oquab2023dinov2, title={DINOv2: Learning Robust Visual Features without Supervision}, author={Oquab, Maxime and Darcet, Timothée and Moutakanni, Theo and Vo, Huy V. and Szafraniec, Marc and Khalidov, Vasil and Fernandez, Pierre and Haziza, Daniel and Massa, Francisco and El-Nouby, Alaaeldin and Howes, Russell and Huang, Po-Yao and Xu, Hu and Sharma, Vasu and Li, Shang-Wen and Galuba, Wojciech and Rabbat, Mike and Assran, Mido and Ballas, Nicolas and Synnaeve, Gabriel and Misra, Ishan and Jegou, Herve and Mairal, Julien and Labatut, Patrick and Joulin, Armand and Bojanowski, Piotr}, journal={arXiv:2304.07193}, year={2023} }
@article{dosovitskiy2020vit, title={An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale}, author={Dosovitskiy, Alexey and Beyer, Lucas and Kolesnikov, Alexander and Weissenborn, Dirk and Zhai, Xiaohua and Unterthiner, Thomas and Dehghani, Mostafa and Minderer, Matthias and Heigold, Georg and Gelly, Sylvain and Uszkoreit, Jakob and Houlsby, Neil}, journal={ICLR}, year={2021} }
@misc{rw2019timm, author = {Ross Wightman}, title = {PyTorch Image Models}, year = {2019}, publisher = {GitHub}, journal = {GitHub repository}, doi = {10.5281/zenodo.4414861}, howpublished = {\url{https://github.com/huggingface/pytorch-image-models}} }