examples/sentence_transformer/applications/image-search/Image_Search.ipynb
This example shows how SentenceTransformer can be used to map images and texts to the same vector space.
As the model, we use the OpenAI CLIP model, which was trained on a large set of images and their alt texts.
As a source for photos, we use the Unsplash Dataset Lite, which contains about 25k images. See the Unsplash license for how these images may be used.
Note: 25k images is a rather small collection. If you search for very specific terms, chances are high that no matching photo exists in the collection.
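To illustrate the core idea before walking through the full example, here is a minimal sketch: a text and an image are encoded into the same vector space, so they can be compared directly with cosine similarity. The image path "my_photo.jpg" is just a placeholder, not part of the dataset used below.
# Minimal sketch: compare one text query against one image.
from PIL import Image
from sentence_transformers import SentenceTransformer, util

clip = SentenceTransformer("sentence-transformers/clip-ViT-B-32")
text_emb = clip.encode("Two dogs playing in the snow", convert_to_tensor=True)
image_emb = clip.encode(Image.open("my_photo.jpg"), convert_to_tensor=True)  # placeholder path
print("Cosine similarity:", util.cos_sim(text_emb, image_emb).item())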
import glob
import os
import pickle
import zipfile
import torch
from IPython.display import Image as IPImage
from IPython.display import display
from PIL import Image
from tqdm.autonotebook import tqdm
from sentence_transformers import SentenceTransformer, util
torch.set_num_threads(4)
# First, we load the CLIP model
model = SentenceTransformer("sentence-transformers/clip-ViT-B-32")
# Next, we get about 25k images from Unsplash
img_folder = "photos/"
if not os.path.exists(img_folder) or len(os.listdir(img_folder)) == 0:
    os.makedirs(img_folder, exist_ok=True)

    photo_filename = "unsplash-25k-photos.zip"
    if not os.path.exists(photo_filename):  # Download dataset if it does not exist
        util.http_get("http://sbert.net/datasets/" + photo_filename, photo_filename)

    # Extract all images
    with zipfile.ZipFile(photo_filename, "r") as zf:
        for member in tqdm(zf.infolist(), desc="Extracting"):
            zf.extract(member, img_folder)
# Now, we need to compute the embeddings
# To speed things up, we distribute pre-computed embeddings
# Otherwise you can also encode the images yourself.
# To encode an image, you can use the following code:
# from PIL import Image
# img_emb = model.encode(Image.open(filepath))
use_precomputed_embeddings = True

if use_precomputed_embeddings:
    emb_filename = "unsplash-25k-photos-embeddings.pkl"
    if not os.path.exists(emb_filename):  # Download embeddings if they do not exist
        util.http_get("http://sbert.net/datasets/" + emb_filename, emb_filename)

    with open(emb_filename, "rb") as fIn:
        img_names, img_emb = pickle.load(fIn)
    print("Images:", len(img_names))
else:
    # Collect the extracted image file names (without the folder prefix,
    # so they match the format of the precomputed embeddings above)
    img_names = [os.path.basename(p) for p in glob.glob(os.path.join(img_folder, "*.jpg"))]
    print("Images:", len(img_names))

    img_emb = model.encode(
        [Image.open(os.path.join(img_folder, name)) for name in img_names],
        batch_size=128,
        convert_to_tensor=True,
        show_progress_bar=True,
    )
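If you encode the images yourself, it can be worth caching the result so later runs can skip the slow encoding step. A minimal sketch, mirroring the pickle format used above (the file name "my-photo-embeddings.pkl" is just an example):
# Cache the self-computed embeddings in the same (img_names, img_emb) format
cache_file = "my-photo-embeddings.pkl"  # example file name
with open(cache_file, "wb") as fOut:
    pickle.dump((img_names, img_emb.cpu()), fOut)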
# Next, we define a search function.
def search(query, k=3):
    # First, we encode the query (which can either be an image or a text string)
    query_emb = model.encode([query], convert_to_tensor=True, show_progress_bar=False)

    # Then, we use the util.semantic_search function, which computes the cosine-similarity
    # between the query embedding and all image embeddings.
    # It then returns the top_k highest ranked images, which we output
    hits = util.semantic_search(query_emb, img_emb, top_k=k)[0]

    print("Query:")
    display(query)
    for hit in hits:
        print(img_names[hit["corpus_id"]])
        display(IPImage(os.path.join(img_folder, img_names[hit["corpus_id"]]), width=200))
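Each hit returned by util.semantic_search is a dict with a corpus_id and a cosine-similarity score, so you can also inspect how strong each match is. For example:
# Print the similarity score alongside each matched file name
query_emb = model.encode(["A sunset on the beach"], convert_to_tensor=True, show_progress_bar=False)
for hit in util.semantic_search(query_emb, img_emb, top_k=3)[0]:
    print(f"{img_names[hit['corpus_id']]}  (score: {hit['score']:.3f})")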
search("Two dogs playing in the snow")
search("A sunset on the beach")
search("London")
search("A dog in a park")
search("A beach with palm trees")
You can also use the search method for image-to-image search.
To do this, pass Image.open('path/to/image.jpg') as the query.
It will then return images similar to the given one.
search(Image.open(os.path.join(img_folder, "lyStEjlKNSw.jpg")), k=5)