TimescaleDB
For an end-to-end guide, follow this link: https://www.timescale.com/blog/building-a-reverse-video-search-system-with-mixpeek-postgresql/
Video
We’ll be using vuse-generic-v1 to embed a video in 1-second chunks, building a collection of 768-dimensional embeddings.
You’ll need a TimescaleDB database with the vector extension (pgvector) installed. We’ll create a hypertable called video_embeddings with a vector column of 768 dimensions and a time column for efficient time-series operations.
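If the extensions aren’t enabled in your database yet, a one-time setup along these lines should do it (a minimal sketch; on hosted Timescale the timescaledb extension is typically enabled already):
import psycopg2

conn = psycopg2.connect("YOUR_TIMESCALEDB_CONNECTION_STRING")
conn.autocommit = True
cur = conn.cursor()
# pgvector provides the vector column type used below
cur.execute("CREATE EXTENSION IF NOT EXISTS vector;")
# TimescaleDB provides hypertables; it may already be installed by your provider
cur.execute("CREATE EXTENSION IF NOT EXISTS timescaledb;")
cur.close()
conn.close()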
Ingest
from mixpeek import Mixpeek
import numpy as np
import psycopg2
from psycopg2.extras import execute_values
from pgvector.psycopg2 import register_vector

# Initialize the Mixpeek client with your API key
mixpeek = Mixpeek("YOUR_API_KEY")

# Connect to TimescaleDB and register the pgvector adapter so numpy
# arrays are sent as vector values rather than Postgres arrays
conn = psycopg2.connect("YOUR_TIMESCALEDB_CONNECTION_STRING")
register_vector(conn)
cur = conn.cursor()
# Create table and hypertable if not exists
cur.execute("""
CREATE TABLE IF NOT EXISTS video_embeddings (
id SERIAL PRIMARY KEY,
embedding vector(768),
start_time TIMESTAMPTZ,
end_time TIMESTAMPTZ
);
""")
cur.execute("SELECT create_hypertable('video_embeddings', 'start_time', if_not_exists => TRUE);")
# Process video chunks
processed_chunks = mixpeek.tools.video.process(
    video_source="https://mixpeek-public-demo.s3.us-east-2.amazonaws.com/starter/Jurassic+Park+(2).mp4",
    chunk_interval=1,  # 1 second intervals
    resolution=[720, 1280]
)
embeddings_data = []
for chunk in processed_chunks:
    print(f"Processing video chunk: {chunk['start_time']}")
    # Embed each chunk
    embed_response = mixpeek.embed.video(
        model_id="vuse-generic-v1",
        input=chunk['base64_chunk'],
        input_type="base64"
    )
    embeddings_data.append((
        np.array(embed_response['embedding']),
        chunk["start_time"],
        chunk["end_time"]
    ))
# Insert embeddings into TimescaleDB
execute_values(cur, """
    INSERT INTO video_embeddings (embedding, start_time, end_time)
    VALUES %s
""", embeddings_data)
conn.commit()
cur.close()
conn.close()
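Optionally, once the data is loaded, you can add an approximate-nearest-neighbor index so the similarity queries below don’t scan every row. A minimal sketch using pgvector’s ivfflat index (the lists value is an illustrative starting point, not a tuned recommendation):
conn = psycopg2.connect("YOUR_TIMESCALEDB_CONNECTION_STRING")
cur = conn.cursor()
# vector_l2_ops matches the l2_distance queries used in this guide
cur.execute("""
    CREATE INDEX IF NOT EXISTS video_embeddings_embedding_idx
    ON video_embeddings
    USING ivfflat (embedding vector_l2_ops)
    WITH (lists = 100);
""")
conn.commit()
cur.close()
conn.close()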
Text Query
query = "two boys inside a car"
embed_response = mixpeek.embed.video(
    model_id="vuse-generic-v1",
    input=query,
    input_type="text"
)
conn = psycopg2.connect("YOUR_TIMESCALEDB_CONNECTION_STRING")
register_vector(conn)
cur = conn.cursor()
cur.execute("""
SELECT id, start_time, end_time, l2_distance(embedding, %s) AS distance
FROM video_embeddings
ORDER BY distance
LIMIT 10
""", (embed_response['embedding'],))
results = cur.fetchall()
for result in results:
    print(result)
cur.close()
conn.close()
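Because video_embeddings is a hypertable, you can also combine vector similarity with a time filter, which TimescaleDB serves efficiently by excluding chunks outside the window. A sketch reusing the text-query embedding above, assuming the stored start_time values are real timestamps (the one-day window is illustrative):
conn = psycopg2.connect("YOUR_TIMESCALEDB_CONNECTION_STRING")
register_vector(conn)
cur = conn.cursor()
# The WHERE clause lets TimescaleDB prune hypertable chunks before ranking
cur.execute("""
    SELECT id, start_time, end_time, l2_distance(embedding, %s) AS distance
    FROM video_embeddings
    WHERE start_time >= NOW() - INTERVAL '1 day'
    ORDER BY distance
    LIMIT 10
""", (np.array(embed_response['embedding']),))
for result in cur.fetchall():
    print(result)
cur.close()
conn.close()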
Video Query
# We'll use a cartoon version of Jurassic Park as the query clip
file_url = "https://mixpeek-public-demo.s3.us-east-2.amazonaws.com/timescaledb/rabbit-jurassic.mp4"
embed_response = mixpeek.embed.video(
    model_id="vuse-generic-v1",
    input=file_url,
    input_type="url"
)
conn = psycopg2.connect("YOUR_TIMESCALEDB_CONNECTION_STRING")
register_vector(conn)
cur = conn.cursor()
cur.execute("""
SELECT id, start_time, end_time, l2_distance(embedding, %s) AS distance
FROM video_embeddings
ORDER BY distance
LIMIT 10
""", (embed_response['embedding'],))
results = cur.fetchall()
for result in results:
    print(result)
cur.close()
conn.close()
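Each hit is a 1-second chunk, so neighboring chunks of the same scene often match together. A small post-processing sketch (merge_chunks is a hypothetical helper, not part of the Mixpeek SDK) that folds adjacent result rows into contiguous segments:
from datetime import timedelta

def merge_chunks(rows, gap=timedelta(seconds=1)):
    """Merge (id, start_time, end_time, distance) rows into contiguous segments."""
    segments = []
    for _, start, end, dist in sorted(rows, key=lambda r: r[1]):
        if segments and start - segments[-1][1] <= gap:
            # Extend the previous segment, keeping its best (smallest) distance
            prev_start, prev_end, prev_dist = segments[-1]
            segments[-1] = (prev_start, max(prev_end, end), min(prev_dist, dist))
        else:
            segments.append((start, end, dist))
    return segments

for start, end, dist in merge_chunks(results):
    print(f"{start} -> {end} (best distance: {dist:.4f})")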
Image
We’ll be using openai-clip-vit-base-patch32 to build a collection of 512-dimensional image embeddings.
You’ll need a TimescaleDB hypertable with a vector column of 512 dimensions for the openai-clip-vit-base-patch32 model embeddings. We’re calling it image_embeddings.
Ingest
from mixpeek import Mixpeek
import numpy as np
import psycopg2
from psycopg2.extras import execute_values
from pgvector.psycopg2 import register_vector

# Initialize the Mixpeek client with your API key
mixpeek = Mixpeek("YOUR_API_KEY")

# Connect to TimescaleDB and register the pgvector adapter
conn = psycopg2.connect("YOUR_TIMESCALEDB_CONNECTION_STRING")
register_vector(conn)
cur = conn.cursor()
# Create table and hypertable if not exists
cur.execute("""
CREATE TABLE IF NOT EXISTS image_embeddings (
id SERIAL PRIMARY KEY,
embedding vector(512),
image_url TEXT,
created_at TIMESTAMPTZ DEFAULT NOW()
);
""")
cur.execute("SELECT create_hypertable('image_embeddings', 'created_at', if_not_exists => TRUE);")
# List of image URLs to process
image_urls = [
    "https://mixpeek-public-demo.s3.us-east-2.amazonaws.com/timescaledb/image1.jpg",
    "https://mixpeek-public-demo.s3.us-east-2.amazonaws.com/timescaledb/image2.jpg",
    "https://mixpeek-public-demo.s3.us-east-2.amazonaws.com/timescaledb/image3.jpg",
    "https://mixpeek-public-demo.s3.us-east-2.amazonaws.com/timescaledb/image4.jpg"
]
embeddings_data = []
for url in image_urls:
    print(f"Processing image: {url}")
    # Embed each image
    embed_response = mixpeek.embed.image(
        model_id="openai-clip-vit-base-patch32",
        input=url,
        input_type="url"
    )
    embeddings_data.append((
        np.array(embed_response['embedding']),
        url
    ))
# Insert embeddings into TimescaleDB
execute_values(cur, """
    INSERT INTO image_embeddings (embedding, image_url)
    VALUES %s
""", embeddings_data)
conn.commit()
cur.close()
conn.close()
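As with the video table, an ANN index keeps the image lookups fast as the collection grows. This sketch uses pgvector’s hnsw index (available in pgvector 0.5.0+); vector_l2_ops matches the l2_distance queries below, though CLIP embeddings are also commonly compared with cosine distance (vector_cosine_ops and the <=> operator):
conn = psycopg2.connect("YOUR_TIMESCALEDB_CONNECTION_STRING")
cur = conn.cursor()
cur.execute("""
    CREATE INDEX IF NOT EXISTS image_embeddings_embedding_idx
    ON image_embeddings
    USING hnsw (embedding vector_l2_ops);
""")
conn.commit()
cur.close()
conn.close()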
Text Query
query = "a cat sitting on a windowsill"
embed_response = mixpeek.embed.image(
    model_id="openai-clip-vit-base-patch32",
    input=query,
    input_type="text"
)
conn = psycopg2.connect("YOUR_TIMESCALEDB_CONNECTION_STRING")
register_vector(conn)
cur = conn.cursor()
cur.execute("""
SELECT id, image_url, l2_distance(embedding, %s) AS distance
FROM image_embeddings
ORDER BY distance
LIMIT 5
""", (embed_response['embedding'],))
results = cur.fetchall()
for result in results:
    print(result)
cur.close()
conn.close()
Image Query
# Use an image from the same bucket as a query
query_image = "https://mixpeek-public-demo.s3.us-east-2.amazonaws.com/timescaledb/query_image.jpg"
embed_response = mixpeek.embed.image(
    model_id="openai-clip-vit-base-patch32",
    input=query_image,
    input_type="url"
)
conn = psycopg2.connect("YOUR_TIMESCALEDB_CONNECTION_STRING")
register_vector(conn)
cur = conn.cursor()
cur.execute("""
SELECT id, image_url, l2_distance(embedding, %s) AS distance
FROM image_embeddings
ORDER BY distance
LIMIT 5
""", (embed_response['embedding'],))
results = cur.fetchall()
for result in results:
    print(result)
cur.close()
conn.close()