Back to Sentence Transformers

Zero-Shot Image Classification

examples/sentence_transformer/applications/image-search/Image_Classification.ipynb

5.6.0 · 10.8 KB
Original Source

Zero-Shot Image Classification

This example shows how SentenceTransformers can be used to map images and texts to the same vector space.

We can use this to perform zero-shot image classification by providing the names for the labels.

The images in this example are from Unsplash.

python
import torch.nn.functional as F
from IPython.display import Image as IPImage
from IPython.display import display

from sentence_transformers import SentenceTransformer

Monolingual examples

As the model, we use the OpenAI CLIP model, which was trained on a large set of images and image alt texts.

python
# Load the original CLIP checkpoint; it is used for image embeddings and English text embeddings
clip_checkpoint = "sentence-transformers/clip-ViT-B-32"
clip = SentenceTransformer(clip_checkpoint)
python
# Base location of the example images in the sentence-transformers repository
url = "https://raw.githubusercontent.com/huggingface/sentence-transformers/main/examples/sentence_transformer/applications/image-search/"
# File names of the images we want to classify
img_names = [
    "eiffel-tower-day.jpg",
    "eiffel-tower-night.jpg",
    "two_dogs_in_snow.jpg",
    "cat.jpg",
]
# Full URL of every image, in the same order as img_names
image_urls = [f"{url}{name}" for name in img_names]

# Compute the embeddings for these images with the CLIP model. The image URLs are
# handed to encode() directly, and convert_to_tensor=True requests tensor output.
img_emb = clip.encode(image_urls, convert_to_tensor=True)
python
# Then, we define our labels as text. Here, we use 4 labels
labels = ["dog", "cat", "Paris at night", "Paris"]

# And compute the text embeddings for these labels
clip_emb = clip.encode(labels, convert_to_tensor=True)

# Now, we compute the cosine similarity between the images and the labels.
# The result has one row per image and one column per label.
cos_scores = clip.similarity(img_emb, clip_emb)

# Convert the scores to confidence percentages via softmax over the label axis. We scale
# by the model's logit scale (~100 for CLIP) so the probabilities are sharp instead of
# near-uniform.
confidence_scores = F.softmax(cos_scores * 100, dim=-1)

# Finally we output each image with its labels, sorted by confidence.
# Iterating the URLs and score rows in lockstep avoids manual index bookkeeping.
for image_url, scores in zip(image_urls, confidence_scores):
    display(IPImage(url=image_url, width=200))

    print(f"{'Label':<20} {'Confidence':>10}")
    # Highest confidence first
    sorted_results = sorted(zip(labels, scores.tolist()), key=lambda x: -x[1])
    for label, score in sorted_results:
        print(f"{label:<20} {score:.1%}")

    # The first entry of the sorted list is the predicted label
    best_label, best_score = sorted_results[0]
    print(f"\nPredicted: {best_label} ({best_score:.1%})")
    print("-" * 30 + "\n")

Multilingual examples

The original CLIP model only works for English; hence, we used Multilingual Knowledge Distillation to make this model work with 50+ languages.

For this, we must load the clip-ViT-B-32-multilingual-v1 model to encode our labels. We can define our labels in 50+ languages, and we can even mix languages within the same set of labels.

python
# The CLIP model and the image list are set up again here, so that this cell can be
# run on its own without first running the monolingual example.

# Image encoder: the original CLIP model
clip = SentenceTransformer("sentence-transformers/clip-ViT-B-32")
# Label encoder: the multilingual text model
multi_model = SentenceTransformer("sentence-transformers/clip-ViT-B-32-multilingual-v1")

# The images in our repository which we want to classify
url = "https://raw.githubusercontent.com/huggingface/sentence-transformers/main/examples/sentence_transformer/applications/image-search/"
img_names = ["eiffel-tower-day.jpg", "eiffel-tower-night.jpg", "two_dogs_in_snow.jpg", "cat.jpg"]
image_urls = [url + filename for filename in img_names]

# Our 4 labels as text, each written in a different language
labels = [
    "Hund",  # German: dog
    "gato",  # Spanish: cat
    "巴黎晚上",  # Chinese: Paris at night
    "Париж",  # Russian: Paris
]

# Labels are embedded with the multilingual model, images with the original CLIP model
txt_emb = multi_model.encode(labels, convert_to_tensor=True)
img_emb = clip.encode(image_urls, convert_to_tensor=True)

# Cosine similarity between every image and every label
cos_scores = multi_model.similarity(img_emb, txt_emb)
# Multiplying by CLIP's logit scale (~100) sharpens the softmax into clear probabilities.
confidence = F.softmax(cos_scores * 100, dim=-1)

# Show each image followed by its labels, most confident first
for image_url, probabilities in zip(image_urls, confidence):
    display(IPImage(url=image_url, width=200))
    print(f"{'Label':<20} {'Confidence':>10}")
    ranking = sorted(zip(labels, probabilities.tolist()), key=lambda pair: -pair[1])
    for name, probability in ranking:
        print(f"{name:<20} {probability:.1%}")
    top_name, top_probability = ranking[0]
    print(f"\nPredicted: {top_name} ({top_probability:.1%})\n")
    print("-" * 30 + "\n")

We can also try a more recent multilingual model: MetaCLIP 2 (mid 2025).
It's a 390M-parameter model (the smallest version of the family), whereas clip-ViT-B-32-multilingual-v1 is a 223M-parameter one.
As an alternative, you can check the models with the zero-shot-image-classification pipeline tag on the 🤗 Hub.

python
# A single MetaCLIP 2 model encodes both the images and the label texts
model_name = "facebook/metaclip-2-worldwide-s16-384"
multi_model = SentenceTransformer(model_name)

# The images in our repository which we want to classify
url = "https://raw.githubusercontent.com/huggingface/sentence-transformers/main/examples/sentence_transformer/applications/image-search/"
img_names = ["eiffel-tower-day.jpg", "eiffel-tower-night.jpg", "two_dogs_in_snow.jpg", "cat.jpg"]
image_urls = [url + filename for filename in img_names]

# Our 4 labels as text, each written in a different language
labels = [
    "Hund",  # German: dog
    "gato",  # Spanish: cat
    "巴黎晚上",  # Chinese: Paris at night
    "Париж",  # Russian: Paris
]

# Embed the label texts
txt_emb = multi_model.encode(labels, convert_to_tensor=True)

# Embed the images
img_emb = multi_model.encode(image_urls, convert_to_tensor=True)

# Cosine similarity between every image and every label
cos_scores = multi_model.similarity(img_emb, txt_emb)
# MetaCLIP 2 uses a smaller logit scale than CLIP (~exp(2.66)=14 instead of ~100),
# so we multiply by ~10 rather than 100 before the softmax.
confidence = F.softmax(cos_scores * 10, dim=-1)

# Show each image followed by its labels, most confident first
for image_url, probabilities in zip(image_urls, confidence):
    display(IPImage(url=image_url, width=200))
    print(f"{'Label':<20} {'Confidence':>10}")
    ranking = sorted(zip(labels, probabilities.tolist()), key=lambda pair: -pair[1])
    for name, probability in ranking:
        print(f"{name:<20} {probability:.1%}")
    top_name, top_probability = ranking[0]
    print(f"\nPredicted: {top_name} ({top_probability:.1%})\n")
    print("-" * 30 + "\n")

Trimmed models

Trimmed models are models in which tokens have been cut from the vocabulary of an original model to create a new monolingual or n-lingual model.
The advantage of this technique is that we obtain a smaller model which retains the same performance for the preserved languages (but loses its performance in the non-preserved languages).
To learn more about this technique, you can check out this blog post.

Let's try a trimmed version of one of the models from the previous section: for example, a French version of sentence-transformers/clip-ViT-B-32-multilingual-v1, which goes from 223M parameters to 144M, a reduction of 35.53%.

Note that we illustrate this with French, but you can find any language obtainable from the original sentence-transformers/clip-ViT-B-32-multilingual-v1 by consulting this Hugging Face collection.

Beyond clip-ViT-B-32-multilingual-v1, other trimmed visual embedding models can be found via this Space (99 different languages available).

python
# The CLIP model and the image list are set up again here, so that this cell can be
# run on its own without first running the earlier examples.

# Image encoder: the original CLIP model
clip = SentenceTransformer("sentence-transformers/clip-ViT-B-32")
# Label encoder: the French trimmed version of the multilingual text model
multi_model = SentenceTransformer("alphaedge-ai/clip-ViT-B-32-multilingual-v1-fr-32768")

# The images in our repository which we want to classify
url = "https://raw.githubusercontent.com/huggingface/sentence-transformers/main/examples/sentence_transformer/applications/image-search/"
img_names = ["eiffel-tower-day.jpg", "eiffel-tower-night.jpg", "two_dogs_in_snow.jpg", "cat.jpg"]
image_urls = [url + filename for filename in img_names]

# Our 4 labels as text, all in French
labels = [
    "chien",  # dog
    "chat",  # cat
    "Paris de nuit",  # Paris at night
    "Paris",  # Paris
]

# Labels are embedded with the trimmed model, images with the original CLIP model
txt_emb = multi_model.encode(labels, convert_to_tensor=True)
img_emb = clip.encode(image_urls, convert_to_tensor=True)

# Cosine similarity between every image and every label
cos_scores = multi_model.similarity(img_emb, txt_emb)
# Multiplying by CLIP's logit scale (~100) sharpens the softmax into clear probabilities.
confidence = F.softmax(cos_scores * 100, dim=-1)

# Show each image followed by its labels, most confident first
for image_url, probabilities in zip(image_urls, confidence):
    display(IPImage(url=image_url, width=200))
    print(f"{'Label':<20} {'Confidence':>10}")
    ranking = sorted(zip(labels, probabilities.tolist()), key=lambda pair: -pair[1])
    for name, probability in ranking:
        print(f"{name:<20} {probability:.1%}")
    top_name, top_probability = ranking[0]
    print(f"\nPredicted: {top_name} ({top_probability:.1%})\n")
    print("-" * 30 + "\n")

For MetaCLIP 2, trimming the model to keep only French reduces it from 390M parameters to 50M, lowering memory costs while keeping inference speed identical.

python
# A single model, the French trimmed MetaCLIP 2, encodes both the images and the label texts
model_name = "alphaedge-ai/metaclip-2-worldwide-s16-384-fra-32768"
multi_model = SentenceTransformer(model_name)

# The images in our repository which we want to classify
url = "https://raw.githubusercontent.com/huggingface/sentence-transformers/main/examples/sentence_transformer/applications/image-search/"
img_names = ["eiffel-tower-day.jpg", "eiffel-tower-night.jpg", "two_dogs_in_snow.jpg", "cat.jpg"]
image_urls = [url + filename for filename in img_names]

# Our 4 labels as text, all in French
labels = [
    "chien",  # dog
    "chat",  # cat
    "Paris de nuit",  # Paris at night
    "Paris",  # Paris
]

# Embed the label texts
txt_emb = multi_model.encode(labels, convert_to_tensor=True)

# Embed the images
img_emb = multi_model.encode(image_urls, convert_to_tensor=True)

# Cosine similarity between every image and every label
cos_scores = multi_model.similarity(img_emb, txt_emb)
# MetaCLIP 2 uses a smaller logit scale than CLIP (~exp(2.66)=14 instead of ~100),
# so we multiply by ~10 rather than 100 before the softmax.
confidence = F.softmax(cos_scores * 10, dim=-1)

# Show each image followed by its labels, most confident first
for image_url, probabilities in zip(image_urls, confidence):
    display(IPImage(url=image_url, width=200))
    print(f"{'Label':<20} {'Confidence':>10}")
    ranking = sorted(zip(labels, probabilities.tolist()), key=lambda pair: -pair[1])
    for name, probability in ranking:
        print(f"{name:<20} {probability:.1%}")
    top_name, top_probability = ranking[0]
    print(f"\nPredicted: {top_name} ({top_probability:.1%})\n")
    print("-" * 30 + "\n")