examples/research_projects/gligen/demo.ipynb
%load_ext autoreload
%autoreload 2
from diffusers import StableDiffusionGLIGENPipeline
from transformers import CLIPTextModel, CLIPTokenizer
import diffusers
from diffusers import (
AutoencoderKL,
DDPMScheduler,
EulerDiscreteScheduler,
UNet2DConditionModel,
)
# Load the GLIGEN (grounded text-box generation) components individually from a
# local snapshot of the checkpoint. Original Hub id kept for reference:
# pretrained_model_name_or_path = 'masterful/gligen-1-4-generation-text-box'
# NOTE(review): hard-coded absolute path — machine-specific; replace with the Hub id above to reproduce elsewhere.
pretrained_model_name_or_path = "/root/data/zhizhonghuang/checkpoints/models--masterful--gligen-1-4-generation-text-box/snapshots/d2820dc1e9ba6ca082051ce79cfd3eb468ae2c83"
tokenizer = CLIPTokenizer.from_pretrained(pretrained_model_name_or_path, subfolder="tokenizer")
noise_scheduler = DDPMScheduler.from_pretrained(pretrained_model_name_or_path, subfolder="scheduler")
text_encoder = CLIPTextModel.from_pretrained(pretrained_model_name_or_path, subfolder="text_encoder")
vae = AutoencoderKL.from_pretrained(pretrained_model_name_or_path, subfolder="vae")
# The checkpoint's own UNet is intentionally not used (see retrained UNet below):
# unet = UNet2DConditionModel.from_pretrained(
# pretrained_model_name_or_path, subfolder="unet"
# )
# Rebuild the sampling scheduler as EulerDiscrete from the DDPM training config
# (same noise schedule parameters, different sampler).
noise_scheduler = EulerDiscreteScheduler.from_config(noise_scheduler.config)
# UNet comes from a separate checkpoint — presumably a COCO-retrained GLIGEN UNet
# (inferred from the path name; confirm with the checkpoint owner).
unet = UNet2DConditionModel.from_pretrained("/root/data/zhizhonghuang/ckpt/GLIGEN_Text_Retrain_COCO")
# Assemble the GLIGEN pipeline from the individually loaded components and move
# it to the GPU.
# Components are passed by keyword so the call is robust against signature-order
# changes, and requires_safety_checker=False suppresses the warning diffusers
# emits when safety_checker=None is combined with the default
# requires_safety_checker=True.
pipe = StableDiffusionGLIGENPipeline(
    vae=vae,
    text_encoder=text_encoder,
    tokenizer=tokenizer,
    unet=unet,
    scheduler=noise_scheduler,
    safety_checker=None,
    feature_extractor=None,
    requires_safety_checker=False,
)
pipe = pipe.to("cuda")
import numpy as np
# prompt = 'A realistic image of landscape scene depicting a green car parking on the left of a blue truck, with a red air balloon and a bird in the sky'
# gen_boxes = [('a green car', [21, 281, 211, 159]), ('a blue truck', [269, 283, 209, 160]), ('a red air balloon', [66, 8, 145, 135]), ('a bird', [296, 42, 143, 100])]
# prompt = 'A realistic top-down view of a wooden table with two apples on it'
# gen_boxes = [('a wooden table', [20, 148, 472, 216]), ('an apple', [150, 226, 100, 100]), ('an apple', [280, 226, 100, 100])]
# prompt = 'A realistic scene of three skiers standing in a line on the snow near a palm tree'
# gen_boxes = [('a skier', [5, 152, 139, 168]), ('a skier', [278, 192, 121, 158]), ('a skier', [148, 173, 124, 155]), ('a palm tree', [404, 105, 103, 251])]
# Grounding spec: one caption plus per-phrase boxes given as (x, y, w, h) in
# pixel coordinates of a 512x512 canvas.
prompt = "An oil painting of a pink dolphin jumping on the left of a steam boat on the sea"
gen_boxes = [("a steam boat", [232, 225, 257, 149]), ("a jumping pink dolphin", [21, 249, 189, 123])]

# Convert each (x, y, w, h) pixel box to normalized (x0, y0, x1, y1) in [0, 1],
# the corner format the GLIGEN pipeline expects.
boxes = [[x / 512, y / 512, (x + w) / 512, (y + h) / 512] for _, (x, y, w, h) in gen_boxes]
# One grounding phrase per box, in matching order.
gligen_phrases = [phrase for phrase, _ in gen_boxes]
# Run grounded text-to-image generation: each phrase in gligen_phrases is tied
# to its normalized box in gligen_boxes. gligen_scheduled_sampling_beta=1.0
# presumably applies the grounding tokens over the full sampling schedule —
# confirm against the diffusers GLIGEN docs.
generation = pipe(
    prompt=prompt,
    negative_prompt="artifacts, blurry, smooth texture, bad quality, distortions, unrealistic, distorted image, bad proportions, duplicate",
    gligen_phrases=gligen_phrases,
    gligen_boxes=boxes,
    gligen_scheduled_sampling_beta=1.0,
    num_inference_steps=50,
    num_images_per_prompt=16,
    output_type="pil",
)
images = generation.images
# Lay the 16 samples out as a 4-column grid; the expression value is the cell's
# displayed output.
diffusers.utils.make_image_grid(images, 4, len(images) // 4)