
Joint Image & Text Embeddings


This example shows how SentenceTransformer can be used to map images and texts to the same vector space.

As the model, we use the OpenAI CLIP model, which was trained on a large set of (image, text) pairs.

As a source of photos, we use the Unsplash Dataset Lite, which contains about 25k images. See the Unsplash license for details on how the images may be used.

Note: 25k images is a rather small collection. If you search for very specific terms, the chances are high that no such photo exists in the collection.
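
To illustrate the idea, here is a minimal sketch that encodes one text and one image with the same CLIP model and compares them with cosine similarity (the file name two_dogs_in_snow.jpg is only a placeholder):

python
from PIL import Image

from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("sentence-transformers/clip-ViT-B-32")

# Encode a text and an image into the same vector space
text_emb = model.encode("Two dogs playing in the snow", convert_to_tensor=True)
img_emb = model.encode(Image.open("two_dogs_in_snow.jpg"), convert_to_tensor=True)

# Cosine similarity between the two embeddings; higher means a better match
print(util.cos_sim(text_emb, img_emb))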

python
import glob
import os
import pickle
import zipfile

import torch
from IPython.display import Image as IPImage
from IPython.display import display
from PIL import Image
from tqdm.autonotebook import tqdm

from sentence_transformers import SentenceTransformer, util

torch.set_num_threads(4)


# First, we load the CLIP model
model = SentenceTransformer("sentence-transformers/clip-ViT-B-32")
python
# Next, we get about 25k images from Unsplash
img_folder = "photos/"
if not os.path.exists(img_folder) or len(os.listdir(img_folder)) == 0:
    os.makedirs(img_folder, exist_ok=True)

    photo_filename = "unsplash-25k-photos.zip"
    if not os.path.exists(photo_filename):  # Download the dataset if it does not exist yet
        util.http_get("http://sbert.net/datasets/" + photo_filename, photo_filename)

    # Extract all images
    with zipfile.ZipFile(photo_filename, "r") as zf:
        for member in tqdm(zf.infolist(), desc="Extracting"):
            zf.extract(member, img_folder)
python
# Now, we need to compute the embeddings
# To speed things up, we distribute pre-computed embeddings
# Otherwise you can also encode the images yourself.
# To encode an image, you can use the following code:
# from PIL import Image
# img_emb = model.encode(Image.open(filepath))

use_precomputed_embeddings = True

if use_precomputed_embeddings:
    emb_filename = "unsplash-25k-photos-embeddings.pkl"
    if not os.path.exists(emb_filename):  # Download the precomputed embeddings if they do not exist yet
        util.http_get("http://sbert.net/datasets/" + emb_filename, emb_filename)

    with open(emb_filename, "rb") as fIn:
        img_names, img_emb = pickle.load(fIn)
    print("Images:", len(img_names))
else:
    # Collect the image file paths and keep only the file names, so that they
    # match the format of the precomputed embeddings above
    img_filepaths = list(glob.glob(os.path.join(img_folder, "*.jpg")))
    img_names = [os.path.basename(filepath) for filepath in img_filepaths]
    print("Images:", len(img_names))
    img_emb = model.encode(
        [Image.open(filepath) for filepath in img_filepaths],
        batch_size=128,
        convert_to_tensor=True,
        show_progress_bar=True,
    )
python
# Next, we define a search function.
def search(query, k=3):
    # First, we encode the query (which can either be an image or a text string)
    query_emb = model.encode([query], convert_to_tensor=True, show_progress_bar=False)

    # Then, we use the util.semantic_search function, which computes the cosine-similarity
    # between the query embedding and all image embeddings.
    # It then returns the top_k highest ranked images, which we output
    hits = util.semantic_search(query_emb, img_emb, top_k=k)[0]

    print("Query:")
    display(query)
    for hit in hits:
        print(img_names[hit["corpus_id"]])
        display(IPImage(os.path.join(img_folder, img_names[hit["corpus_id"]]), width=200))
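
For reference, util.semantic_search with its default scoring is roughly equivalent to computing the cosine similarities yourself and taking the top-k entries. A minimal sketch, assuming query_emb and img_emb as defined above:

python
# Roughly what util.semantic_search does with its default cosine scoring
cos_scores = util.cos_sim(query_emb, img_emb)[0]
top_results = torch.topk(cos_scores, k=3)
for score, idx in zip(top_results.values, top_results.indices):
    print(img_names[int(idx)], f"(score: {float(score):.4f})")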
python
search("Two dogs playing in the snow")
python
search("A sunset on the beach")
python
search("London")
python
search("A dog in a park")
python
search("A beach with palm trees")

Image-to-Image Search

You can also use this method for image-to-image search: pass Image.open('path/to/image.jpg') to the search function and it will return the most similar images.

python
search(Image.open(os.path.join(img_folder, "lyStEjlKNSw.jpg")), k=5)