import cv2
import os
import numpy as np
import base64
import face_recognition
from PIL import Image
from io import BytesIO
from langchain_community.chat_models import ChatOllama
from langchain_core.messages import HumanMessage
from langchain_core.output_parsers import StrOutputParser

camera = int(os.environ.get("IMAGE_DESCRIPTION_CAMERA", "0"))  # Define your camera here, 0 is usually the built-in webcam
ollamaUrl = os.environ.get("OLLAMA_HOST", "http://localhost:11434")
imagemodel = os.environ.get("IMAGE_DESCRIPTION_MODEL", "llava")
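
# Example environment overrides (values below are illustrative, not additional
# defaults): all three settings above can be changed without editing this file,
# e.g.
#   IMAGE_DESCRIPTION_CAMERA=1 OLLAMA_HOST=http://localhost:11434 \
#   IMAGE_DESCRIPTION_MODEL=llava python <this_script>.py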


def capture():
    """
    Capture an image from the webcam.
    :return: Captured image
    """
    cap = cv2.VideoCapture(camera)
    ret, frame = cap.read()
    cap.release()
    return cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) if ret else None


def encode_image(image):
    """
    Encode the given image to base64.
    :param image: Image to encode
    :return: Base64 encoded image
    """
    pil_image = Image.fromarray(image)
    buffered = BytesIO()
    pil_image.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue())


def resize_image(encoded_image, size=(672, 672)):
    """
    Resize the given image to the specified size.
    :param encoded_image: Base64 encoded image to resize
    :param size: New size for the image
    :return: Resized image
    """
    image_data = base64.b64decode(encoded_image)
    pil_image = Image.open(BytesIO(image_data))
    resized_image = pil_image.resize(size)
    buffered = BytesIO()
    resized_image.save(buffered, format="JPEG")
    return buffered.getvalue()


def capture_encoded_image():
    """
    Capture an image from the webcam, resize it, and encode it to base64.
    :return: Base64 encoded image
    """
    image = capture()
    if image is not None:
        encoded_image = encode_image(image)
        resized_image = resize_image(encoded_image)
        return base64.b64encode(resized_image).decode("utf-8")
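

# A small debugging sketch, not part of the original module: write the resized
# JPEG that capture_encoded_image() would hand to the model out to disk so it
# can be inspected. The function name and default path are arbitrary.
def save_snapshot(path="snapshot.jpg"):
    encoded = capture_encoded_image()
    if encoded is not None:
        with open(path, "wb") as f:
            f.write(base64.b64decode(encoded))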


def get_image_from_encoded(encoded_image):
    """
    Get an image from the given base64 encoded image.
    :param encoded_image: Base64 encoded image
    :return: Image
    """
    image_data = base64.b64decode(encoded_image)
    return cv2.imdecode(np.frombuffer(image_data, np.uint8), cv2.IMREAD_COLOR)


def get_faces(image):
    """
    Get the faces from the given image and assign an ID (derived from the face encoding) to each face.
    :param image: Image to get faces from (BGR, as returned by OpenCV)
    :return: List of dictionaries containing the ID and location of each face
    """
    # Fall back to the cascade file bundled with opencv-python if no path is given.
    cascade_path = os.environ.get(
        "HAAR_CASCADE_FACE_PATH",
        cv2.data.haarcascades + "haarcascade_frontalface_default.xml",
    )
    face_cascade = cv2.CascadeClassifier(cascade_path)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    faces = face_cascade.detectMultiScale(gray, 1.3, 5)
    face_data = []
    for (x, y, w, h) in faces:
        face_image = image[y:y+h, x:x+w]
        # face_recognition expects RGB input, while OpenCV frames are BGR.
        face_encoding = face_recognition.face_encodings(cv2.cvtColor(face_image, cv2.COLOR_BGR2RGB))
        if face_encoding:
            face_id = hash(tuple(face_encoding[0]))
            face_data.append({
                "id": face_id,
                "location": (x, y, w, h)
            })
    return face_data
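

# A minimal usage sketch, not part of the original module: capture a frame,
# run get_faces() on it, and draw a rectangle around each detection. Assumes a
# local display is available for cv2.imshow(); the function name is arbitrary.
def preview_faces():
    encoded = capture_encoded_image()
    if encoded is None:
        return
    frame = get_image_from_encoded(encoded)
    for face in get_faces(frame):
        x, y, w, h = face["location"]
        cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)
    cv2.imshow("faces", frame)
    cv2.waitKey(0)
    cv2.destroyAllWindows()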


def describe(temperature=0, prompt="Briefly explain this image as if it were your eyes. Use fewer words if possible. Say what is visible and where items are located. Describe the people in the scene in some more detail. Refer to the camera as you."):
    image_b64 = capture_encoded_image()
    llava = ChatOllama(model=imagemodel, temperature=temperature, base_url=ollamaUrl)

    def prompt_func(data):
        text = data["text"]
        image = data["image"]
        image_part = {
            "type": "image_url",
            "image_url": f"data:image/jpeg;base64,{image}",
        }
        content_parts = []
        text_part = {"type": "text", "text": text}
        content_parts.append(image_part)
        content_parts.append(text_part)
        return [HumanMessage(content=content_parts)]

    chain = prompt_func | llava | StrOutputParser()
    query_chain = chain.invoke(
        {"text": prompt, "image": image_b64}
    )
    return query_chain


if __name__ == "__main__":
    # print(capture_encoded_image())
    print(describe())
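    # Further examples (argument values are illustrative, not defaults):
    # print(describe(temperature=0.7, prompt="List every object you can see."))
    # print(get_faces(get_image_from_encoded(capture_encoded_image())))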