This commit is contained in:
Adrian Gunnar Lauterer 2024-05-27 18:42:18 +02:00
commit 418d6d044d
Signed by: adriangl
GPG Key ID: D33368A59745C2F0
9 changed files with 667 additions and 0 deletions

10
.gitignore vendored Normal file
View File

@ -0,0 +1,10 @@
*/__pycache__/*
__pycache__/*
piper-models/*
piper-models
models
index
models
index
history

40
README.md Normal file
View File

@ -0,0 +1,40 @@
This is a simple chatbot project.
The aim is to recreate something similar to neurosama, running on local hardware on a minimal amount of compute.
The bot is designed to be modular, with the ability to add new modules easily.
You need to supply a MediaWiki XML backup (dump); it is used as the knowledge source for the chatbot.
A powerful computer with CUDA and a fair amount of VRAM is advised to keep response times down.
Most settings are configured through environment variables set in the flake.nix file.
## Modules
### stt
The stt module is responsible for converting speech to text.
Whisper-cpp-stream is used to stream audio through the whisper stt engine.
whisper-cpp-stream is a c++ program that reads audio from a microphone, and sends it to the whisper stt engine.
It is run through a python subprocess.
### llm
The llm module is responsible for crafting a response to the user's input. It uses a rag based on a supplied mediawiki wiki xml file, and in the future, included chat history.
langchain is the Python module that interfaces with the RAG and the LLM.
ollama is used on the backend to interface with a llama model.
Future work will include giving a structured response that includes emotions and metadata for a future image module.
### tts
piper is used as the tts engine.
It does not have proper python bindings in nixpkgs, so it is run with subprocess.
text is echoed into piper's stdin, and the output is played with aplay.
### image
The image module is responsible for processing images.
It captures the image using OpenCV, base64-encodes it and sends it to a multimodal model for descriptions.
Future work is to test using OpenCV or something similar for image tagging instead, as the multimodal model hallucinates a lot and is also far too slow.

37
assistant.py Normal file
View File

@ -0,0 +1,37 @@
import os
import llm
import image
import stt
import tts
def main():
    """Handle one voice interaction: listen, query the LLM, speak the reply.

    Polls ``stt.get_buffer()`` until it returns text that is not blank,
    passes that text and the module-level ``image_description`` to
    ``llm.chat``, prints the ``"response"`` entry of the result and reads it
    aloud with ``tts.speak``.
    """
    import time  # local import: only the polling loop needs it

    print("Waiting for STT input...")
    line = stt.get_buffer()
    while not line.strip():
        # Pause between polls so an idle assistant does not spin a CPU core
        # calling into the STT buffer as fast as it can.
        time.sleep(0.1)
        line = stt.get_buffer()
    print(f"STT buffer: {line}")

    # Get the response from the LLM chat module
    llm_output = llm.chat(question=line, image_description=image_description)
    print("LLM:", llm_output["response"])

    # Run the LLM output through the TTS (text-to-speech) module
    tts.speak(llm_output["response"].replace("*", ""))  # Remove asterisks for better TTS understanding
if __name__ == "__main__":
    # Build the retrieval index only when no "index" directory exists yet
    # (rebuilding is only needed when the indexed inputs change).
    index_present = os.path.exists("index")
    if not index_present:
        llm.init_index()

    # Set up the conversational chain used by llm.chat().
    llm.init_chat()

    # Camera description is currently disabled; main() reads this
    # module-level name, so it must stay defined here.
    image_description = ""  # image.describe()
    print("Image description:", image_description)

    # Start speech recognition, then serve interactions forever.
    stt.start()
    while True:
        main()

26
flake.lock Normal file
View File

@ -0,0 +1,26 @@
{
"nodes": {
"nixpkgs": {
"locked": {
"lastModified": 1713714899,
"narHash": "sha256-+z/XjO3QJs5rLE5UOf015gdVauVRQd2vZtsFkaXBq2Y=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "6143fc5eeb9c4f00163267708e26191d1e918932",
"type": "github"
},
"original": {
"id": "nixpkgs",
"ref": "nixos-unstable",
"type": "indirect"
}
},
"root": {
"inputs": {
"nixpkgs": "nixpkgs"
}
}
},
"root": "root",
"version": 7
}

104
flake.nix Normal file
View File

@ -0,0 +1,104 @@
# Nix flake that provides the development shell for the chatbot: command-line
# tools, Python packages, downloaded model files and the environment
# variables the Python modules read.
{
description = "A simple flake";
inputs.nixpkgs.url = "nixpkgs/nixos-unstable";
outputs = { self, nixpkgs }: {
# The shell is exposed as the default package for x86_64-linux only.
defaultPackage.x86_64-linux = let
pkgs = nixpkgs.legacyPackages.x86_64-linux;
python = pkgs.python311;
pythonPackages = python.pkgs;
# Download locations for the piper voice, whisper models and the face cascade.
piper_model_url = "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US/hfc_female/medium/en_US-hfc_female-medium.onnx?download=true";
# NOTE(review): this URL ends in "?download=true.json"; the trailing ".json" after the query looks unintended - confirm.
piper_model_json_url = "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US/hfc_female/medium/en_US-hfc_female-medium.onnx.json?download=true.json";
whisper_model_tiny_url = "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.bin?download=true";
whisper_model_base_url = "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin?download=true";
whisper_model_small_url = "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.bin?download=true";
# NOTE(review): whisper_model_small_tdrz_url is defined here but never referenced below.
whisper_model_small_tdrz_url = "https://huggingface.co/akashmjn/tinydiarize-whisper.cpp/resolve/main/ggml-small.en-tdrz.bin?download=true";
haracascade_face_url = "https://raw.githubusercontent.com/opencv/opencv/master/data/haarcascades/haarcascade_frontalface_default.xml";
# Fixed-hash downloads of the files above.
# NOTE(review): the two piper `name` values contain "/", which Nix may reject in store path names - TODO confirm.
piper_model = pkgs.fetchurl {
name = "hfc_female/medium/en_US-hfc_female-medium.onnx";
url = piper_model_url;
sha256 = "sha256-kUxHN4j8H6i2Os4c3NtEWI9K5SPTqzffFTZhaDWhQLc="; # replace with the correct sha256
};
piper_model_json = pkgs.fetchurl {
name = "hfc_female/medium/en_US-hfc_female-medium.onnx.json";
url = piper_model_json_url;
sha256 = "sha256-A/H6BiK4BGMoNZLZesqfbomuw0WlxWtyV3I+AJPFi2w="; # replace with the correct sha256
};
whisper_model_tiny = pkgs.fetchurl {
name = "ggml-tiny.bin";
url = whisper_model_tiny_url;
sha256 = "sha256-vgfgSOHlma1GNByNKhNWRQl6U4IhZ4t6zdGxkZxuGyE=";
};
whisper_model_base = pkgs.fetchurl {
name = "ggml-base.bin";
url = whisper_model_base_url;
sha256 = "sha256-YO1bw90U7qhWST0zQ0m0BXgt3K8AKNS130CINF+6Lv4=";
};
whisper_model_small = pkgs.fetchurl {
name = "ggml-small.bin";
url = whisper_model_small_url;
sha256 = "sha256-G+OpsgY4Z7k35k4ux0gzZKeZF+FX+pjF2UtcH//qmHs=";
};
# NOTE(review): this entry reuses whisper_model_small_url and the same sha256
# as whisper_model_small, so it resolves to the plain small model under the
# tdrz file name. It presumably should use whisper_model_small_tdrz_url with
# that file's own hash - confirm and update both together.
whisper_model_small_tdrz = pkgs.fetchurl {
name = "ggml-small.en-tdrz.bin";
url = whisper_model_small_url;
sha256 = "sha256-G+OpsgY4Z7k35k4ux0gzZKeZF+FX+pjF2UtcH//qmHs=";
};
haracascade_face = pkgs.fetchurl {
name = "haarcascade_frontalface_default.xml";
url = haracascade_face_url;
sha256 = "sha256-D31FJ4ROtRTUpJSOgi2pD7sWo0oLu7xq3GSYdHpar7A=";
};
in pkgs.mkShell {
# Tools and Python packages made available inside the shell.
nativeBuildInputs = with pkgs; [
python
piper-tts
alsa-utils # for aplay for piper-tts to stream to
openai-whisper-cpp # for stt
opencv
(pythonPackages.numpy)
(pythonPackages.pytorch)
(pythonPackages.langchain)
(pythonPackages.mwxml)
(pythonPackages.mwparserfromhell) #dependency langchain document fetcher
(pythonPackages.sentence-transformers) #dependency for langchain embedding
(pythonPackages.chromadb) #vector search
(pythonPackages.opencv4)
(pythonPackages.pillow)
];
# Environment variables exported into the shell; only the tiny whisper model
# is wired in, the other whisper downloads above are unused here.
WHISPER_AUDIO_DEVICE = "1";
WHISPER_MODEL_PATH = whisper_model_tiny;
PIPER_MODEL_PATH = piper_model;
PIPER_MODEL_JSON_PATH =piper_model_json;
HARA_CASCADE_FACE_PATH = haracascade_face;
OLLAMA_HOST = "http://localhost:11434";
IMAGE_DESCRIPTION_CAMERA = "0";
IMAGE_DESCRIPTION_MODEL = "llava";
# Pulls the two ollama models when the shell starts.
# NOTE(review): `ollama` is not listed in nativeBuildInputs above, so it is
# presumably expected to already be on PATH - confirm.
shellHook = ''
#need to set ollama url first, before pulling models needed for the project
ollama pull llava
ollama pull llama3
'';
};
};
}

118
image.py Normal file
View File

@ -0,0 +1,118 @@
import cv2
import os
import numpy as np
import base64
import face_recognition
from PIL import Image
from io import BytesIO
from langchain_community.chat_models import ChatOllama
from langchain_core.messages import HumanMessage
from langchain_core.output_parsers import StrOutputParser
# Module configuration, read once at import time from environment variables.
camera = int(os.environ.get("IMAGE_DESCRIPTION_CAMERA", "0")) # Define your camera here, 0 is usually the built-in webcam
# Base URL passed to ChatOllama in describe().
ollamaUrl = os.environ.get("OLLAMA_HOST", "http://localhost:11434")
# Name of the model passed to ChatOllama in describe().
imagemodel = os.environ.get("IMAGE_DESCRIPTION_MODEL", "llava")
def capture():
    """
    Grab a single frame from the configured camera.

    Opens the device selected by the module-level ``camera`` index, reads
    one frame and releases the device again.

    :return: The frame converted with ``cv2.COLOR_BGR2RGB``, or ``None``
        when the read did not succeed
    """
    video = cv2.VideoCapture(camera)
    ok, bgr_frame = video.read()
    video.release()
    if not ok:
        return None
    return cv2.cvtColor(bgr_frame, cv2.COLOR_BGR2RGB)
def encode_image(image):
    """
    Serialise an image array as JPEG and base64-encode the result.

    :param image: Array accepted by ``PIL.Image.fromarray``
    :return: Base64 encoding of the JPEG data, as ``bytes``
    """
    jpeg_buffer = BytesIO()
    Image.fromarray(image).save(jpeg_buffer, format="JPEG")
    jpeg_bytes = jpeg_buffer.getvalue()
    return base64.b64encode(jpeg_bytes)
def resize_image(encoded_image, size=(672, 672)):
    """
    Decode a base64 image, resize it and re-encode it as JPEG.

    :param encoded_image: Base64 encoded image to resize
    :param size: Target ``(width, height)`` passed to ``Image.resize``
    :return: Raw JPEG bytes of the resized image (not base64 encoded)
    """
    source = Image.open(BytesIO(base64.b64decode(encoded_image)))
    output = BytesIO()
    source.resize(size).save(output, format="JPEG")
    return output.getvalue()
def capture_encoded_image():
    """
    Capture a webcam frame and return it resized and base64 encoded.

    The frame from ``capture()`` is JPEG/base64 encoded, resized with the
    default size of ``resize_image`` and base64 encoded again as text.

    :return: Base64 encoded JPEG as ``str``, or ``None`` when no frame
        could be captured
    """
    frame = capture()
    if frame is None:
        return None
    jpeg_b64 = encode_image(frame)
    resized_jpeg = resize_image(jpeg_b64)
    return base64.b64encode(resized_jpeg).decode("utf-8")
def get_image_from_encoded(encoded_image):
    """
    Decode a base64 encoded image into an OpenCV image array.

    :param encoded_image: Base64 encoded image
    :return: Result of ``cv2.imdecode`` with ``cv2.IMREAD_COLOR``
    """
    raw_bytes = base64.b64decode(encoded_image)
    pixel_buffer = np.frombuffer(raw_bytes, np.uint8)
    return cv2.imdecode(pixel_buffer, cv2.IMREAD_COLOR)
def get_faces(image):
    """
    Detect faces in an image and derive an ID for each one.

    Faces are located with the Haar cascade file named by the
    ``HARA_CASCADE_FACE_PATH`` environment variable. Each detected region is
    passed to ``face_recognition.face_encodings``; regions that yield no
    encoding are skipped. The ID is the ``hash`` of the first encoding.

    NOTE(review): the grayscale conversion uses ``COLOR_BGR2GRAY``, so the
    input is presumably expected in BGR order - confirm against callers.
    NOTE(review): whether the hash-based ID stays the same across frames
    depends on the encoder returning identical values - confirm.

    :param image: Image to get faces from
    :return: List of dictionaries with the "id" and "location" (x, y, w, h)
        of each face
    """
    cascade_path = os.environ.get("HARA_CASCADE_FACE_PATH", "haarcascade_frontalface_default.xml")
    detector = cv2.CascadeClassifier(cascade_path)
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    faces = detector.detectMultiScale(grayscale, 1.3, 5)
    print(faces)#TODO: remove this debug print

    face_data = []
    for (x, y, w, h) in faces:
        encodings = face_recognition.face_encodings(image[y:y+h, x:x+w])
        if not encodings:
            continue
        face_data.append({
            "id": hash(tuple(encodings[0])),
            "location": (x, y, w, h)
        })
    return face_data
def describe(temperature=0, prompt="Briefely explain this image like it is your eyes. Use fewer words if possible. What is visible, and where are items located. Describe the pepole in the scene in some more detail. Refer to the camera as you."):
    """
    Capture a webcam image and ask the multimodal model to describe it.

    :param temperature: Sampling temperature passed to ``ChatOllama``
    :param prompt: Instruction text sent together with the image
    :return: The model's description as a string, or an empty string when
        no image could be captured
    """
    image_b64 = capture_encoded_image()
    if image_b64 is None:
        # Without a frame the data URL below would contain the literal text
        # "None"; skip the model call instead of describing garbage.
        return ""

    llava = ChatOllama(model=imagemodel, temperature=temperature, base_url=ollamaUrl)

    def prompt_func(data):
        """Build a single human message holding the image and the text."""
        image_part = {
            "type": "image_url",
            "image_url": f"data:image/jpeg;base64,{data['image']}",
        }
        text_part = {"type": "text", "text": data["text"]}
        # The image part is placed before the text part.
        return [HumanMessage(content=[image_part, text_part])]

    chain = prompt_func | llava | StrOutputParser()
    return chain.invoke({"text": prompt, "image": image_b64})


if __name__ == "__main__":
    # print(capture_encoded_image())
    print(describe())

228
llm.py Normal file
View File

@ -0,0 +1,228 @@
#!python3
import os
import shutil
import datetime
from langchain_community.chat_models import ChatOllama
from langchain_core.messages import HumanMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders import MWDumpLoader
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.llms import Ollama
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_transformers.embeddings_redundant_filter import EmbeddingsRedundantFilter
from langchain_community.vectorstores import Chroma
from langchain.chains import ConversationalRetrievalChain
from langchain_core.prompts import HumanMessagePromptTemplate
from langchain_core.prompts import SystemMessagePromptTemplate
from langchain.memory import ConversationBufferMemory
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.embeddings import OllamaEmbeddings
from langchain.utils.html import (PREFIXES_TO_IGNORE_REGEX,
SUFFIXES_TO_IGNORE_REGEX)
from langchain.output_parsers import ResponseSchema, StructuredOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_community.llms import Ollama
from enum import Enum
# Module configuration and shared state for the RAG chat.
text_model = "llama3" #ollama
embedding_model ="all-MiniLM-L6-v2" #hugginface
wikilocation=os.environ.get("RAG_WIKI_LOCATION", "wiki/current.xml") #mediawiki xml to index
# Directory where init_index() persists the vector store and init_chat() loads it from.
index_dir = "./index"
# Evaluated once at import time, so the value does not change while the process runs.
date = datetime.datetime.now().strftime("%Y-%m-%d")
# The `global` statement has no effect at module level; the name is module-global either way.
global conversation
# Conversational chain; stays None until init_chat() assigns it.
conversation = None
ollamaUrl = os.environ.get("OLLAMA_HOST", "http://localhost:11434")
class Emotion(str, Enum):
    """Closed set of emotion labels accepted in a structured reply.

    Members are also ``str`` instances, so they compare equal to their
    plain string value.
    """

    NEUTRAL = "neutral"  # used by chat() when a label cannot be parsed
    HAPPY = "happy"
    SAD = "sad"
    ANGRY = "angry"
    SURPRISED = "surprised"
    CONFUSED = "confused"
    EXCITED = "excited"
    CALM = "calm"
class Action(str, Enum):
    """Closed set of action labels accepted in a structured reply.

    Members are also ``str`` instances, so they compare equal to their
    plain string value.
    """

    NOTHING = "nothing"  # used by chat() when a label cannot be parsed
    STUTTER = "stutter"
    SQUEAL = "squeal"
    MEWOW = "mewow"
    SMUG = "smug"
    WAGS_TAIL = "wags_tail"
    WINK = "wink"
    NOD = "nod"
    LAUGH = "laugh"
    SIGH = "sigh"
    GLOOMY = "gloomy"
    LOOK_AWAY = "look_away"
    LOOK_TOWARDS_YOU = "look_towards_you"
def chat(question="Hello, how are you today?", image_description=""):
    """Ask the retrieval chain a question and return the parsed reply.

    Requires ``init_chat()`` to have been called so that the module-level
    ``conversation`` chain exists.

    :param question: Text from the user
    :param image_description: Description of what the camera sees, inserted
        into the system prompt
    :return: Dict parsed from the model's answer with the keys "response",
        "emotion" (an ``Emotion``) and "actions" (a list of ``Action``)
    """
    #TODO: implement chat history and memory, with storage to disk
    chat_history = []
    response = conversation({
        "question": question,
        "image_description": image_description,
        "chat_history": chat_history,
        "format_instructions": format_instructions,
        # Computed per call so a long-running process does not keep
        # reporting the date on which the module was imported.
        "date": datetime.datetime.now().strftime("%Y-%m-%d"),
    })
    # print(response)
    result = output_parser.parse(response['answer'])

    # Enforce the enum values; Enum lookup raises ValueError for unknown values.
    try:
        result["emotion"] = Emotion(result["emotion"])
    except ValueError:
        print(f"Could not parse emotion: {result['emotion']}")
        result["emotion"] = Emotion.NEUTRAL

    # The model may return a single value instead of a list; normalise it so
    # the loop below never iterates over the characters of a string.
    raw_actions = result["actions"]
    if raw_actions is None:
        raw_actions = []
    elif not isinstance(raw_actions, list):
        raw_actions = [raw_actions]

    actions = []
    for raw_action in raw_actions:
        try:
            actions.append(Action(raw_action))
        except ValueError:
            print(f"Could not parse action: {raw_action}")
            actions.append(Action.NOTHING)
    result["actions"] = actions
    # print(result)
    return result
#some toy functions to interact with the llm
# Marker that simple() strips from its accumulated output and stops on.
endstring = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>" #think the llama3 end tokens are not properly implemented, in langchain yet.
def simple_stream(prompt="A question to ask the model", temperature=0.5,):
    """Send a bare prompt to the text model and return its output stream.

    :param prompt: Prompt text passed to the model unchanged
    :param temperature: Sampling temperature for the ``Ollama`` client
    :return: Whatever ``Ollama.stream`` returns for the prompt
    """
    #stop needs to be manually given for llama3 for now.
    stop_tokens = ["<|start_header_id|>", "<|end_header_id|>", "<|eot_id|>", "<|reserved_special_token|>"]
    client = Ollama(model=text_model, temperature=temperature, base_url=ollamaUrl, stop=stop_tokens)
    return client.stream(prompt)
def simple(prompt="A question to ask the model", temperature=0.5,):
    """Send a bare prompt to the text model and return the full reply.

    Chunks from ``simple_stream`` are concatenated with trailing newlines
    removed from each chunk. As soon as the accumulated text ends with
    ``endstring``, every occurrence of it is removed and the text returned.

    :param prompt: Prompt text passed to the model unchanged
    :param temperature: Sampling temperature
    :return: The collected reply text
    """
    collected = ""
    for chunk in simple_stream(prompt, temperature):
        collected += chunk.rstrip("\n")
        if collected.endswith(endstring):
            return collected.replace(endstring, "")
    return collected
#https://scribe.rip/rahasak/build-rag-application-using-a-llm-running-on-local-computer-with-ollama-and-langchain-e6513853fda0
def init_index():
    """Rebuild the vector index from the MediaWiki dump.

    Deletes any existing ``index_dir``, loads the dump at ``wikilocation``,
    splits it into overlapping chunks, drops chunks the embeddings filter
    considers redundant, and persists the rest in the "pvv-wiki" Chroma
    collection under ``index_dir``.
    """
    # Start from a clean directory so stale entries cannot survive.
    if os.path.exists(index_dir):
        shutil.rmtree(index_dir)

    # Load data from MediaWiki dump
    raw_pages = MWDumpLoader(wikilocation).load()
    #TODO: add chat history to the documents

    # Split text
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = splitter.split_documents(raw_pages)

    # Apply the redundant filter
    embeddings = HuggingFaceEmbeddings(
        model_name=embedding_model,
        multi_process=True,
        # encode_kwargs={"normalize_embeddings": True},
    )
    deduplicated = EmbeddingsRedundantFilter(embeddings=embeddings).transform_documents(chunks)

    store = Chroma.from_documents(
        documents=deduplicated,
        embedding=embeddings,
        persist_directory=index_dir,
        collection_name="pvv-wiki"
    )
    store.persist()
# Fields the model is asked to return; chat() reads "emotion" and "actions"
# and converts them to the Emotion / Action enums.
response_schemas = [
ResponseSchema(
name="response",
description="reply to the user's question or statement.",
),
ResponseSchema(
name="emotion",
description=f"emotion expressed in the response, selected from a set of possible options {list(str(e.value) for e in Emotion)}",
type="Emotion",
),
ResponseSchema(
name="actions",
description=f"List of actions to take at random in response to the user's question or statement from the set {list(str(a.value) for a in Action)}",
type="List[Action]",
),
]
# Parser used by chat() on the model's answer, plus the matching format
# instructions that are substituted into the system prompt below.
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
format_instructions = output_parser.get_format_instructions()
# System prompt template. Placeholders: {date}, {context}, {image_description}
# and {format_instructions}. The text is sent to the model as-is, so it is
# runtime data rather than documentation.
general_system_template = r"""
Given a specific context, please give a short answer to the question, use relevant context to try and find a possible outcome. If the data does not help, be uncertain in the final answear.
The current date is {date}
----
{context}
----
You may refer to what you can see in front of you in the description below. Any reference to the camera or image should be interpreted as "you" or "your eyes" or you can se:
{image_description}
____
Do not refer to yourself as an ai.
awoid expressions in the response.
You are a cute anime carachter named pvv chan, you like programming linux, opensource and board games.
Without refering to yourself, reply to the human talking to you.
{format_instructions}
"""
# User message template; {question} is the text passed to chat().
general_user_template = "Question:```{question}```"
messages = [
SystemMessagePromptTemplate.from_template(general_system_template),
HumanMessagePromptTemplate.from_template(general_user_template)
]
# Combined prompt handed to the chain in init_chat().
qa_prompt = ChatPromptTemplate.from_messages( messages )
def init_chat():
    """Create the retrieval chat chain and store it in ``conversation``.

    Loads the persisted Chroma index from ``index_dir`` and builds a
    ``ConversationalRetrievalChain`` around the Ollama text model using
    ``qa_prompt``. Must be called before ``chat()``.
    """
    global conversation
    #load index from local directory
    embeddings = HuggingFaceEmbeddings(
        model_name=embedding_model,
        multi_process=True,
        # encode_kwargs={"normalize_embeddings": True},
    )
    vectordb = Chroma(
        persist_directory=index_dir,
        embedding_function=embeddings,
        # init_index() stores the documents in the "pvv-wiki" collection;
        # without naming it here Chroma opens its default collection and the
        # retriever finds none of the indexed wiki content.
        collection_name="pvv-wiki",
    )
    llm = Ollama(
        model=text_model,
        base_url=ollamaUrl,
        verbose=True,
    )
    # create conversation
    conversation = ConversationalRetrievalChain.from_llm(
        llm,
        retriever=vectordb.as_retriever(search_kwargs={"k": 2}),  #amount of documents to use for the response
        # retriever=vectordb.as_retriever(),
        return_source_documents=True,
        verbose=True,
        combine_docs_chain_kwargs={"prompt": qa_prompt},
    )
if __name__ == "__main__":
    #print(simple(prompt="What is the meaning of life. (answear short)"))
    # Build the index only when it is missing, as assistant.py does; the
    # message is now printed only when indexing actually happens.
    if not os.path.exists(index_dir):
        print("inittialising index")
        init_index()
    print("initialising chat")
    init_chat()
    print("chatting")
    print(chat(question="Hello, how are you today? What is our dns server named?"))

83
stt.py Normal file
View File

@ -0,0 +1,83 @@
import os
import subprocess
import multiprocessing
import atexit
# Configuration read once at import time from environment variables.
audio_device = os.getenv("WHISPER_AUDIO_DEVICE", "-1")
whisper_model = os.getenv("WHISPER_MODEL_PATH", "models/ggml-tiny.bin")
# Argument list used by start() to launch the whisper-cpp-stream program.
command = ["whisper-cpp-stream", "-kc", "-m", whisper_model, "-c", audio_device, "-t", "4"]
# Substrings that filter_buffer() removes from the transcript.
# NOTE(review): the first entry is an empty string, which str.replace leaves
# without effect; it may be a control sequence lost somewhere - confirm.
filter_strings = ["", "*", "\r", "\n","\t", "(inaudible)", "[BLANK_AUDIO]", "[Start speaking]", "(gunshot)", "(wind howling)", "[Music]", "(footsteps)"] # Example strings to filter out
class SharedString:
    """String value shared between processes through a multiprocessing manager.

    The value lives in a manager namespace; every access is guarded by one
    lock that is created together with the namespace. (Creating a new lock
    inside each method would guard nothing, because no two callers would
    ever contend for the same lock object.)
    """

    def __init__(self):
        manager = multiprocessing.Manager()
        self.namespace = manager.Namespace()
        self.namespace.value = ""
        # Single lock shared by all methods and all holders of this object.
        self._lock = manager.Lock()

    def get_value(self):
        """Return the current string."""
        with self._lock:
            return self.namespace.value

    def set_value(self, new_value):
        """Replace the current string with *new_value*."""
        with self._lock:
            self.namespace.value = new_value

    def append(self, append_value):
        """Append *append_value* to the current string.

        The read-modify-write happens under the lock, so concurrent calls
        to ``append`` and ``set_value`` cannot interleave with it.
        """
        with self._lock:
            self.namespace.value += append_value
# Transcript text collected by read_output() and drained by get_buffer().
buffer = SharedString()
# subprocess.Popen handle of the running whisper process; None when stopped.
process = None
# multiprocessing.Process running read_output() (a process despite the name); None when stopped.
process_thread = None
def read_output(proc, buffer):
    """Copy the child's stdout, line by line, into *buffer* until end-of-stream.

    ``readline()`` returns ``b""`` only once the stream has ended, so the
    loop finishes at that point instead of polling the process state in a
    tight loop while waiting for it to exit.

    :param proc: Object whose ``stdout`` is a binary stream to read from
    :param buffer: Object with an ``append(str)`` method receiving the text
    """
    for raw_line in iter(proc.stdout.readline, b""):
        # print(raw_line.decode("utf-8"))
        # Undecodable bytes become U+FFFD instead of raising and ending
        # the reader while the recogniser is still running.
        buffer.append(raw_line.decode("utf-8", errors="replace"))
def start():
    """Launch the whisper process and a reader process for its output.

    Stores both handles in the module-level ``process`` and
    ``process_thread`` and registers ``stop`` to run at interpreter exit.
    """
    global process, process_thread
    # stderr is discarded: nothing reads it, and an unread pipe can fill up
    # and block the child once its buffer is full.
    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)
    process_thread = multiprocessing.Process(target=read_output, args=(process, buffer))
    process_thread.start()
    # Register cleanup function to be called when script exits
    atexit.register(stop)
def stop():
    """Terminate the whisper process and its reader; safe to call repeatedly.

    Does nothing when no process is recorded in the module-level
    ``process`` variable.
    """
    global process, process_thread
    if process:
        process.terminate()
        # Reap the child so it does not linger as a zombie.
        process.wait()
        if process_thread is not None:
            process_thread.join()
        process = None
        process_thread = None
def filter_buffer(data):
    """Remove every entry of ``filter_strings`` from *data* and trim it.

    :param data: Raw transcript text
    :return: Text with all filter strings deleted and surrounding
        whitespace stripped
    """
    cleaned = data
    for unwanted in filter_strings:
        cleaned = cleaned.replace(unwanted, "")
    return cleaned.strip()
def get_buffer():
    """Return the filtered transcript collected so far and clear the buffer.

    The value is read first and the buffer reset afterwards, as two
    separate calls on the shared buffer.
    """
    pending = buffer.get_value()
    buffer.set_value("")
    #return pending
    return filter_buffer(pending)
def main():
    """Run the recogniser and print filtered transcripts until it exits.

    A keyboard interrupt stops the recogniser through ``stop()``.
    """
    start()
    try:
        while process.poll() is None:
            # Only print when the filtered buffer actually holds text.
            if (transcript := get_buffer()):
                print(transcript)
    except KeyboardInterrupt:
        stop()


if __name__ == "__main__":
    main()

21
tts.py Normal file
View File

@ -0,0 +1,21 @@
import subprocess
import os
# Paths handed to piper by speak(); both are None when the variable is unset.
piper_model_path = os.getenv("PIPER_MODEL_PATH")
piper_model_json_path = os.getenv("PIPER_MODEL_JSON_PATH")
def speak(text):
    """Speak *text* aloud by streaming piper's raw audio into aplay.

    The text is cleaned of characters listed in ``illegal_chars`` and of all
    non-ASCII characters, then written to piper's standard input. Both
    programs are started from argument lists without a shell, so the text
    can never be interpreted as shell syntax (for example ``$(...)``).

    :param text: Text to speak
    :raises RuntimeError: if ``PIPER_MODEL_PATH`` or
        ``PIPER_MODEL_JSON_PATH`` is not set
    :raises subprocess.CalledProcessError: if piper or aplay exits with a
        non-zero status
    """
    if not piper_model_path or not piper_model_json_path:
        raise RuntimeError("PIPER_MODEL_PATH and PIPER_MODEL_JSON_PATH must be set")

    # some text cleanup
    illegal_chars = ["\n", "\r", "\t", "", "*", "`", "[", "]", "{", "}", "\"", "\'"]
    for char in illegal_chars:
        text = text.replace(char, "")
    #remove emojis
    text = text.encode('ascii', 'ignore').decode('ascii')

    piper_command = ["piper", "-q", "-m", piper_model_path, "-c", piper_model_json_path, "--output-raw"]
    aplay_command = ["aplay", "-q", "-r", "22050", "-f", "S16_LE", "-t", "raw", "-"]
    # Same bytes the former `echo "<text>. "` produced: text, ". " and a newline.
    payload = f"{text}. \n".encode("ascii")

    # aplay is started first so piper can write straight into its stdin.
    player = subprocess.Popen(aplay_command, stdin=subprocess.PIPE)
    try:
        subprocess.run(piper_command, input=payload, stdout=player.stdin, check=True)
    finally:
        # Closing our end signals end-of-input so aplay can finish playback.
        player.stdin.close()
        player.wait()
    if player.returncode != 0:
        raise subprocess.CalledProcessError(player.returncode, aplay_command)


if __name__ == "__main__":
    speak("Hello, world. This is a tts test.")