# pvv-chan/image.py

import cv2
import os
import numpy as np
import base64
import face_recognition
from PIL import Image
from io import BytesIO
from langchain_community.chat_models import ChatOllama
from langchain_core.messages import HumanMessage
from langchain_core.output_parsers import StrOutputParser

# Camera index; 0 is usually the built-in webcam.
camera = int(os.environ.get("IMAGE_DESCRIPTION_CAMERA", "0"))
ollama_url = os.environ.get("OLLAMA_HOST", "http://localhost:11434")
image_model = os.environ.get("IMAGE_DESCRIPTION_MODEL", "llava")


def capture():
    """
    Capture an image from the webcam.

    :return: Captured frame as an RGB array, or None if the capture failed
    """
    cap = cv2.VideoCapture(camera)
    ret, frame = cap.read()
    cap.release()
    return cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) if ret else None


def encode_image(image):
    """
    Encode the given image to base64.

    :param image: Image to encode (e.g. an RGB numpy array)
    :return: Base64-encoded JPEG as bytes
    """
    pil_image = Image.fromarray(image)
    buffered = BytesIO()
    pil_image.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue())


def resize_image(encoded_image, size=(672, 672)):
    """
    Resize the given image to the specified size.

    :param encoded_image: Base64-encoded image to resize
    :param size: New size for the image
    :return: Resized image as raw JPEG bytes (not base64)
    """
    image_data = base64.b64decode(encoded_image)
    pil_image = Image.open(BytesIO(image_data))
    resized_image = pil_image.resize(size)
    buffered = BytesIO()
    resized_image.save(buffered, format="JPEG")
    return buffered.getvalue()


def capture_encoded_image():
    """
    Capture an image from the webcam, resize it, and encode it to base64.

    :return: Base64-encoded image as a UTF-8 string, or None if the capture failed
    """
    image = capture()
    if image is None:
        return None
    encoded_image = encode_image(image)
    resized_image = resize_image(encoded_image)
    return base64.b64encode(resized_image).decode("utf-8")


def get_image_from_encoded(encoded_image):
    """
    Decode a base64-encoded image into an OpenCV (BGR) image.

    :param encoded_image: Base64-encoded image
    :return: Decoded image as a BGR numpy array
    """
    image_data = base64.b64decode(encoded_image)
    return cv2.imdecode(np.frombuffer(image_data, np.uint8), cv2.IMREAD_COLOR)


def get_faces(image):
    """
    Detect faces in the given image and derive an ID for each face from its
    face_recognition encoding. Note that encodings vary slightly between
    captures, so these IDs are only stable for identical crops.

    :param image: BGR image to detect faces in
    :return: List of dictionaries containing the ID and location of each face
    """
    # Fall back to OpenCV's bundled cascade directory if no path is configured.
    cascade_path = os.environ.get(
        "HARA_CASCADE_FACE_PATH",
        os.path.join(cv2.data.haarcascades, "haarcascade_frontalface_default.xml"),
    )
    face_cascade = cv2.CascadeClassifier(cascade_path)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    faces = face_cascade.detectMultiScale(gray, 1.3, 5)
    face_data = []
    for (x, y, w, h) in faces:
        # face_recognition expects RGB input, while OpenCV delivers BGR.
        face_image = cv2.cvtColor(image[y:y + h, x:x + w], cv2.COLOR_BGR2RGB)
        face_encoding = face_recognition.face_encodings(face_image)
        if face_encoding:
            face_id = hash(tuple(face_encoding[0]))
            face_data.append({
                "id": face_id,
                "location": (x, y, w, h),
            })
    return face_data


def describe(
    temperature=0,
    prompt=(
        "Briefly explain this image like it is your eyes. Use fewer words if "
        "possible. What is visible, and where are items located? Describe the "
        "people in the scene in some more detail. Refer to the camera as you."
    ),
):
    """
    Capture an image from the webcam and describe it with the vision model.

    :param temperature: Sampling temperature for the model
    :param prompt: Instruction sent alongside the image
    :return: The model's description of the captured image
    """
    image_b64 = capture_encoded_image()
    if image_b64 is None:
        raise RuntimeError("Could not capture an image from the webcam")
    llava = ChatOllama(model=image_model, temperature=temperature, base_url=ollama_url)

    def prompt_func(data):
        # Build a multimodal message: the image part first, then the text prompt.
        image_part = {
            "type": "image_url",
            "image_url": f"data:image/jpeg;base64,{data['image']}",
        }
        text_part = {"type": "text", "text": data["text"]}
        return [HumanMessage(content=[image_part, text_part])]

    chain = prompt_func | llava | StrOutputParser()
    return chain.invoke({"text": prompt, "image": image_b64})


if __name__ == "__main__":
    # print(capture_encoded_image())
    print(describe())
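
    # A minimal usage sketch for get_faces, kept commented out like the line
    # above. It assumes a working webcam and a resolvable Haar cascade file;
    # capture() returns RGB while get_faces expects BGR, hence the conversion.
    #
    # frame = capture()
    # if frame is not None:
    #     print(get_faces(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)))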