pip install fiftyone openai replicate
DALL-E2 和 BLIP 之间的 AI 电话游戏中的图像进展
import replicate
class Text2Image(object):
"""Wrapper for a Text2Image model."""
def __init__(self):
self.name = None
self.model_name = None
def generate_image(self, text):
response = replicate.run(self.model_name, input={"prompt": text})
if type(response) == list:
response = response[0]
return response
class StableDiffusion(Text2Image):
"""Wrapper for a StableDiffusion model."""
def __init__(self):
self.name = "stable-diffusion"
self.model_name = "stability-ai/stable-diffusion:27b93a2413e7f36cd83da926f3656280b2931564ff050bf9575f1fdf9bcd7478"
import openai
class DALLE2(Text2Image):
"""Wrapper for a DALL-E 2 model."""
def __init__(self):
self.name = "dalle-2"
def generate_image(self, text):
response = openai.Image.create(
prompt=text,
n=1,
size="512x512"
)
return response['data'][0]['url']
class Image2Text(object):
"""Wrapper for an Image2Text model."""
def __init__(self):
self.name = None
self.model_name = None
self.task_description = "Write a detailed description of this image."
def generate_text(self, image_url):
response = replicate.run(
self.model_name,
input={
"image": image_url,
"prompt": self.task_description,
}
)
return response
class BLIP(Image2Text):
"""Wrapper for a BLIP model."""
def __init__(self):
super().__init__()
self.name = "blip"
self.model_name = "salesforce/blip:2e1dddc8621f72155f24cf2e0adbde548458d3cab9f00c0139eea840d0ac4746"
DALL-E2与mPLUG-Owl在AI电话游戏中的图像进展。
I'm playing a game of telephone using text-to-image and image-to-text AI models.
I want to evaluate these models based on their ability to retain complex semantic
information over the course of long conversations. Your job is to give me 10 text
prompts that I can use to run these games of telephone. You must give me one 3
easy, 3 medium, 3 hard, and 1 ultra-hard prompt
Easy:
"A red apple sitting on a wooden table with sunlight streaming in from a window."
Medium:
"An astronaut floating in the International Space Station, looking out at Earth through the window, with a space capsule docked in the background."
Hard:
"A bustling marketplace in an ancient Middle Eastern city. Traders haggling over spices and silks, camels carrying goods, the sun setting behind a mosque with a crescent moon visible."
Impossible:
"A panoramic scene of an advanced alien civilization on a distant exoplanet. Interstellar vehicles flying in an indigo sky above towering crystalline structures. Aliens with varying physical features are interacting, engaging in activities like exchanging energy orbs, communicating through light patterns, and tending to exotic, bio-luminescent flora. The planet’s twin moons are visible in the horizon over a glistening alien ocean."
class Prompt(object):
def __init__(self, text, level):
self.text = text
self.level = level
levels = ["easy", "medium", "hard", "impossible"]
level_prompts = [easy_texts, medium_texts, hard_texts, impossible_texts]
def get_prompts():
prompts = []
for level, texts in zip(levels, level_prompts):
for text in texts:
prompts.append(Prompt(text, level))
return prompts
VQGAN-CLIP与MiniGPT-4在AI电话游戏中的图像进展
import os
import hashlib
import fiftyone as fo
from fiftyone import ViewField as F
class TelephoneLine(object):
"""Class for playing telephone with AI."""
def __init__(self, t2i, i2t):
self.t2i = t2i
self.i2t = i2t
self.name = f"{t2i.name}_{i2t.name}"
self.conversations = {}
def get_conversation_name(self, text):
full_name = f"{self.name}{text}"
hashed_name = hashlib.md5(full_name.encode())
return hashed_name.hexdigest()[:6]
def play(self, prompt, nturns = 10):
"""Play a game of telephone."""
print(f"Connecting {self.t2i.name} <-> {self.i2t.name} with prompt: {prompt.text[:20]}...")
texts = [prompt.text]
image_urls = []
for _ in range(nturns):
image_url = self.t2i.generate_image(texts[-1])
text = self.i2t.generate_text(image_url)
texts.append(text)
image_urls.append(image_url)
conversation_name = self.get_conversation_name(prompt.text)
self.conversations[conversation_name] = {
"texts": texts,
"image_urls": image_urls,
"level": prompt.level
}
def save_conversations_to_dataset(self, dataset):
"""Save conversations to a dataset."""
for conversation_name in self.conversations.keys():
conversation = self.conversations[conversation_name]
prompt = conversation["texts"][0]
level = conversation["level"]
image_urls = conversation["image_urls"]
texts = conversation["texts"]
for i in range(len(image_urls)):
filename = f"{conversation_name}_{i}.jpg"
filepath = os.path.join(IMAGES_DIR, filename)
download_image(image_urls[i], filepath)
sample = fo.Sample(
filepath = filepath,
conversation_name = conversation_name,
prompt = prompt,
level = level,
t2i_model = self.t2i.name,
i2t_model = self.i2t.name,
step_number = i,
text_before = texts[i],
text_after = texts[i+1]
)
dataset.add_sample(sample)
AI电话游戏中稳定扩散与CLIP前缀字幕之间的图像进展
## Image2Text models
mplug_owl = MPLUGOwl()
blip = BLIP()
clip_prefix = CLIPPrefix()
mini_gpt4 = MiniGPT4()
image2text_models = [mplug_owl, blip, clip_prefix, mini_gpt4]
## Text2Image models
vqgan_clip = VQGANCLIP()
sd = StableDiffusion()
dalle2 = DALLE2()
text2image_models = [sd, dalle2, vqgan_clip]
combos = [(t2i, i2t) for t2i in text2image_models for i2t in image2text_models]
lines = [TelephoneLine(*combo) for combo in combos]
prompts = get_prompts()
import fiftyone as fo
dataset = fo.Dataset(name = 'telephone', persistent=True)
dataset.add_sample_field("conversation_name", fo.StringField)
dataset.add_sample_field("prompt", fo.StringField)
dataset.add_sample_field("level", fo.StringField)
dataset.add_sample_field("t2i_model", fo.StringField)
dataset.add_sample_field("i2t_model", fo.StringField)
dataset.add_sample_field("step_number", fo.IntField)
dataset.add_sample_field("text_before", fo.StringField)
dataset.add_sample_field("text_after", fo.StringField)
from tqdm import tqdm
for line in tqdm(lines):
for prompt in prompts:
line.play(prompt, nturns = 10)
line.save_conversations_to_dataset(dataset)
session = fo.launch_app(dataset)
from scipy.spatial.distance import cosine as cosine_distance
MODEL_NAME = "daanelson/imagebind:0383f62e173dc821ec52663ed22a076d9c970549c209666ac3db181618b7a304"
def embed_text(text):
response = replicate.run(
MODEL_NAME,
input={
"text_input": text,
"modality": "text"
}
)
return np.array(response)
import hashlib
def hash_prompt(prompt):
return hashlib.md5(prompt.encode()).hexdigest()[:6]
### Embed initial prompts
prompt_embeddings = {}
dataset.add_sample_field("prompt_hash", fo.StringField)
## Group samples by initial prompt
## Add hash to all samples in group
prompt_groups = dataset.group_by("prompt")
for pg in prompt_groups.iter_dynamic_groups():
prompt = pg.first().prompt
hash = hash_prompt(prompt)
prompt_embeddings[hash] = embed_text(prompt)
view = pg.set_field("prompt_hash", hash)
view.save("prompt_hash")
dataset.add_sample_field("text_after_dist", fo.FloatField)
prompt_groups = dataset.group_by("conversation_name")
for cg in conversation_groups.iter_dynamic_groups(progress=True):
hash = cg.first().prompt_hash
prompt_embedding = prompt_embeddings[hash]
ordered_samples = cg.sort_by("step_number")
for sample in ordered_samples.iter_samples(autosave=True):
text_embedding = embed_text(sample.text_after)
sample["text_embedding"] = text_embedding
sample.text_after_dist = cosine_distance(
prompt_embedding,
text_embedding
)
AI 电话可提供简单的提示,具有成对的文本到图像和图像到文本模型
AI电话为中等难度提示,具有文本到图像和图像到文本模型对
AI 电话用于硬提示,具有成对的文本到图像和图像到文本模型
AI 电话用于“不可能”提示,具有文本到图像和图像到文本模型对