init
commit 418d6d044d
@ -0,0 +1,10 @@
*/__pycache__/*
__pycache__/*
piper-models/*
piper-models
models
index

models
index
history
@ -0,0 +1,40 @@
This is a simple chatbot project.
The aim is to recreate something similar to Neuro-sama, running on local hardware with a minimal amount of compute.

The bot is designed to be modular, with the ability to add new modules easily.

You need to supply a MediaWiki backup XML; it is used as the knowledge source for the chatbot.

A strong computer with CUDA and a fair amount of VRAM is advised to keep response times down.

Most settings are configured through environment variables set in the flake.nix file, as in the sketch below.
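For example, the modules read their configuration straight from the process environment. A minimal sketch; the variable names are the ones exported by flake.nix, and the fallback values are only illustrative:

```python
import os

# Values are exported by the dev shell in flake.nix; the fallbacks are illustrative defaults.
whisper_model = os.environ.get("WHISPER_MODEL_PATH", "models/ggml-tiny.bin")
piper_model = os.environ.get("PIPER_MODEL_PATH")
ollama_url = os.environ.get("OLLAMA_HOST", "http://localhost:11434")

print(whisper_model, piper_model, ollama_url)
```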
## Modules

### stt

The stt module is responsible for converting speech to text.
whisper-cpp-stream is used to stream audio through the whisper STT engine.
whisper-cpp-stream is a C++ program that reads audio from a microphone and sends it to the whisper STT engine.
It is run through a Python subprocess, as in the sketch below.
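A minimal sketch of that approach (the full implementation is in stt.py; the command line mirrors the one used there):

```python
import os
import subprocess

# whisper-cpp-stream prints transcribed text to stdout; read it line by line.
model = os.environ.get("WHISPER_MODEL_PATH", "models/ggml-tiny.bin")
device = os.environ.get("WHISPER_AUDIO_DEVICE", "-1")
proc = subprocess.Popen(
    ["whisper-cpp-stream", "-kc", "-m", model, "-c", device, "-t", "4"],
    stdout=subprocess.PIPE,
)
for line in proc.stdout:
    print(line.decode("utf-8").strip())
```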
### llm

The llm module is responsible for crafting a response to the user's input. It uses a RAG index built from a supplied MediaWiki XML file and, in the future, the stored chat history.

langchain is the Python module that interfaces with the RAG index and the LLM; see the sketch below.
ollama is used on the backend to interface with a llama model.

Future work will include giving a structured response that includes emotions and metadata for a future image module.
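A trimmed-down sketch of the pipeline (the full version lives in llm.py; model names and chunk sizes are the ones used there):

```python
from langchain.chains import ConversationalRetrievalChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import MWDumpLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import Ollama
from langchain_community.vectorstores import Chroma

# Index the wiki dump once, then answer questions against the vector store.
docs = MWDumpLoader("wiki/current.xml").load()
docs = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200).split_documents(docs)
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
vectordb = Chroma.from_documents(documents=docs, embedding=embeddings, persist_directory="./index")

chain = ConversationalRetrievalChain.from_llm(
    Ollama(model="llama3"),
    retriever=vectordb.as_retriever(search_kwargs={"k": 2}),
)
print(chain({"question": "Hello!", "chat_history": []})["answer"])
```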
### tts

piper is used as the TTS engine.
It does not have proper Python bindings in nixpkgs, so it is run with subprocess.
Text is echoed into piper's stdin, and the output is played with aplay, as in the sketch below.
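A minimal sketch of that pipeline (the command matches the one in tts.py; the model paths come from the environment set up by flake.nix):

```python
import os
import subprocess

model = os.environ.get("PIPER_MODEL_PATH")
config = os.environ.get("PIPER_MODEL_JSON_PATH")
text = "Hello, world."

# piper reads text on stdin and writes raw PCM samples, which aplay plays back.
subprocess.run(
    f'echo "{text}" | piper -q -m {model} -c {config} --output-raw '
    "| aplay -q -r 22050 -f S16_LE -t raw -",
    shell=True,
    check=True,
)
```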
### image

The image module is responsible for processing images.
It captures an image with OpenCV, base64-encodes it and sends it to a multimodal model for a description, as sketched below.
Future work is to test out opencv (or something similar) for image tagging instead, as the multimodal model hallucinates a lot and is also far too slow.
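A condensed sketch of the capture-and-describe flow (image.py does the same with a little more pre-processing; llava is the default model used there):

```python
import base64
import cv2
from langchain_community.chat_models import ChatOllama
from langchain_core.messages import HumanMessage

# Grab one frame, JPEG-encode it, and ask the multimodal model what it sees.
cap = cv2.VideoCapture(0)
ok, frame = cap.read()
cap.release()
if not ok:
    raise RuntimeError("camera capture failed")

image_b64 = base64.b64encode(cv2.imencode(".jpg", frame)[1].tobytes()).decode("utf-8")

llava = ChatOllama(model="llava")
message = HumanMessage(content=[
    {"type": "image_url", "image_url": f"data:image/jpeg;base64,{image_b64}"},
    {"type": "text", "text": "Briefly describe this image."},
])
print(llava.invoke([message]).content)
```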
@ -0,0 +1,37 @@
import os
import llm
import image
import stt
import tts


def main():
    print("Waiting for STT input...")
    line = ""
    while not line.strip():
        line = stt.get_buffer()
        if not line.strip():
            continue

    print(f"STT buffer: {line}")

    # Get the response from the LLM chat module
    llm_output = llm.chat(question=line, image_description=image_description)
    print("LLM:", llm_output["response"])
    # Run the LLM output through the TTS (text-to-speech) module
    tts.speak(llm_output["response"].replace("*", ""))  # Remove asterisks for better TTS output


if __name__ == "__main__":
    # Initialize the LLM index if it doesn't exist
    if not os.path.exists("index"):
        llm.init_index()  # Initialize the LLM index module (only needed with updated inputs)

    # Initialize the LLM chat module
    llm.init_chat()
    # Describe the image and store the description
    image_description = ""  # image.describe()
    print("Image description:", image_description)
    stt.start()

    while True:
        main()
@ -0,0 +1,26 @@
{
  "nodes": {
    "nixpkgs": {
      "locked": {
        "lastModified": 1713714899,
        "narHash": "sha256-+z/XjO3QJs5rLE5UOf015gdVauVRQd2vZtsFkaXBq2Y=",
        "owner": "NixOS",
        "repo": "nixpkgs",
        "rev": "6143fc5eeb9c4f00163267708e26191d1e918932",
        "type": "github"
      },
      "original": {
        "id": "nixpkgs",
        "ref": "nixos-unstable",
        "type": "indirect"
      }
    },
    "root": {
      "inputs": {
        "nixpkgs": "nixpkgs"
      }
    }
  },
  "root": "root",
  "version": 7
}
@ -0,0 +1,104 @@
{
  description = "A simple flake";

  inputs.nixpkgs.url = "nixpkgs/nixos-unstable";

  outputs = { self, nixpkgs }: {

    defaultPackage.x86_64-linux = let
      pkgs = nixpkgs.legacyPackages.x86_64-linux;
      python = pkgs.python311;
      pythonPackages = python.pkgs;

      piper_model_url = "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US/hfc_female/medium/en_US-hfc_female-medium.onnx?download=true";
      piper_model_json_url = "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US/hfc_female/medium/en_US-hfc_female-medium.onnx.json?download=true.json";
      whisper_model_tiny_url = "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.bin?download=true";
      whisper_model_base_url = "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin?download=true";
      whisper_model_small_url = "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.bin?download=true";
      whisper_model_small_tdrz_url = "https://huggingface.co/akashmjn/tinydiarize-whisper.cpp/resolve/main/ggml-small.en-tdrz.bin?download=true";
      haracascade_face_url = "https://raw.githubusercontent.com/opencv/opencv/master/data/haarcascades/haarcascade_frontalface_default.xml";

      piper_model = pkgs.fetchurl {
        name = "hfc_female/medium/en_US-hfc_female-medium.onnx";
        url = piper_model_url;
        sha256 = "sha256-kUxHN4j8H6i2Os4c3NtEWI9K5SPTqzffFTZhaDWhQLc="; # replace with the correct sha256
      };

      piper_model_json = pkgs.fetchurl {
        name = "hfc_female/medium/en_US-hfc_female-medium.onnx.json";
        url = piper_model_json_url;
        sha256 = "sha256-A/H6BiK4BGMoNZLZesqfbomuw0WlxWtyV3I+AJPFi2w="; # replace with the correct sha256
      };

      whisper_model_tiny = pkgs.fetchurl {
        name = "ggml-tiny.bin";
        url = whisper_model_tiny_url;
        sha256 = "sha256-vgfgSOHlma1GNByNKhNWRQl6U4IhZ4t6zdGxkZxuGyE=";
      };

      whisper_model_base = pkgs.fetchurl {
        name = "ggml-base.bin";
        url = whisper_model_base_url;
        sha256 = "sha256-YO1bw90U7qhWST0zQ0m0BXgt3K8AKNS130CINF+6Lv4=";
      };

      whisper_model_small = pkgs.fetchurl {
        name = "ggml-small.bin";
        url = whisper_model_small_url;
        sha256 = "sha256-G+OpsgY4Z7k35k4ux0gzZKeZF+FX+pjF2UtcH//qmHs=";
      };

      whisper_model_small_tdrz = pkgs.fetchurl {
        name = "ggml-small.en-tdrz.bin";
        url = whisper_model_small_tdrz_url;
        sha256 = "sha256-G+OpsgY4Z7k35k4ux0gzZKeZF+FX+pjF2UtcH//qmHs="; # replace with the correct sha256 for the tdrz model
      };

      haracascade_face = pkgs.fetchurl {
        name = "haarcascade_frontalface_default.xml";
        url = haracascade_face_url;
        sha256 = "sha256-D31FJ4ROtRTUpJSOgi2pD7sWo0oLu7xq3GSYdHpar7A=";
      };

    in pkgs.mkShell {
      nativeBuildInputs = with pkgs; [
        python
        piper-tts
        alsa-utils # for aplay, which piper-tts streams to
        openai-whisper-cpp # for stt
        opencv

        (pythonPackages.numpy)
        (pythonPackages.pytorch)

        (pythonPackages.langchain)
        (pythonPackages.mwxml)
        (pythonPackages.mwparserfromhell) # dependency for the langchain document loader
        (pythonPackages.sentence-transformers) # dependency for langchain embeddings
        (pythonPackages.chromadb) # vector search
        (pythonPackages.opencv4)
        (pythonPackages.pillow)
      ];

      WHISPER_AUDIO_DEVICE = "1";
      WHISPER_MODEL_PATH = whisper_model_tiny;
      PIPER_MODEL_PATH = piper_model;
      PIPER_MODEL_JSON_PATH = piper_model_json;
      HARA_CASCADE_FACE_PATH = haracascade_face;
      OLLAMA_HOST = "http://localhost:11434";
      IMAGE_DESCRIPTION_CAMERA = "0";
      IMAGE_DESCRIPTION_MODEL = "llava";

      shellHook = ''
        # OLLAMA_HOST must be set before pulling the models needed for the project
        ollama pull llava
        ollama pull llama3
      '';
    };
  };
}
@ -0,0 +1,118 @@
import cv2
import os
import numpy as np
import base64
import face_recognition
from PIL import Image
from io import BytesIO
from langchain_community.chat_models import ChatOllama
from langchain_core.messages import HumanMessage
from langchain_core.output_parsers import StrOutputParser

camera = int(os.environ.get("IMAGE_DESCRIPTION_CAMERA", "0"))  # Define your camera here, 0 is usually the built-in webcam
ollamaUrl = os.environ.get("OLLAMA_HOST", "http://localhost:11434")
imagemodel = os.environ.get("IMAGE_DESCRIPTION_MODEL", "llava")


def capture():
    """
    Capture an image from the webcam.
    :return: Captured image
    """
    cap = cv2.VideoCapture(camera)
    ret, frame = cap.read()
    cap.release()
    return cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) if ret else None


def encode_image(image):
    """
    Encode the given image to base64.
    :param image: Image to encode
    :return: Base64 encoded image
    """
    pil_image = Image.fromarray(image)
    buffered = BytesIO()
    pil_image.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue())


def resize_image(encoded_image, size=(672, 672)):
    """
    Resize the given image to the specified size.
    :param encoded_image: Base64 encoded image to resize
    :param size: New size for the image
    :return: Resized image as raw JPEG bytes
    """
    image_data = base64.b64decode(encoded_image)
    pil_image = Image.open(BytesIO(image_data))
    resized_image = pil_image.resize(size)
    buffered = BytesIO()
    resized_image.save(buffered, format="JPEG")
    return buffered.getvalue()


def capture_encoded_image():
    """
    Capture an image from the webcam, resize it, and encode it to base64.
    :return: Base64 encoded image
    """
    image = capture()
    if image is not None:
        encoded_image = encode_image(image)
        resized_image = resize_image(encoded_image)
        return base64.b64encode(resized_image).decode("utf-8")


def get_image_from_encoded(encoded_image):
    """
    Get an image from the given base64 encoded image.
    :param encoded_image: Base64 encoded image
    :return: Image
    """
    image_data = base64.b64decode(encoded_image)
    return cv2.imdecode(np.frombuffer(image_data, np.uint8), cv2.IMREAD_COLOR)


def get_faces(image):
    """
    Get the faces from the given image and assign a persistent ID to each face.
    :param image: Image to get faces from
    :return: List of dictionaries containing the ID and location of each face
    """
    face_cascade = cv2.CascadeClassifier(os.environ.get("HARA_CASCADE_FACE_PATH", "haarcascade_frontalface_default.xml"))
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    faces = face_cascade.detectMultiScale(gray, 1.3, 5)
    print(faces)  # TODO: remove this debug print
    face_data = []
    for (x, y, w, h) in faces:
        face_image = image[y:y+h, x:x+w]
        face_encoding = face_recognition.face_encodings(face_image)
        if face_encoding:
            face_id = hash(tuple(face_encoding[0]))
            face_data.append({
                "id": face_id,
                "location": (x, y, w, h)
            })
    return face_data


def describe(temperature=0, prompt="Briefly explain this image like it is your eyes. Use fewer words if possible. What is visible, and where are items located. Describe the people in the scene in some more detail. Refer to the camera as you."):
    image_b64 = capture_encoded_image()
    llava = ChatOllama(model=imagemodel, temperature=temperature, base_url=ollamaUrl)

    def prompt_func(data):
        text = data["text"]
        image = data["image"]
        image_part = {
            "type": "image_url",
            "image_url": f"data:image/jpeg;base64,{image}",
        }
        content_parts = []
        text_part = {"type": "text", "text": text}
        content_parts.append(image_part)
        content_parts.append(text_part)
        return [HumanMessage(content=content_parts)]

    chain = prompt_func | llava | StrOutputParser()
    query_chain = chain.invoke(
        {"text": prompt, "image": image_b64}
    )
    return query_chain


if __name__ == "__main__":
    # print(capture_encoded_image())
    print(describe())
@ -0,0 +1,228 @@
#!python3
import os
import shutil
import datetime
from langchain_community.chat_models import ChatOllama
from langchain_core.messages import HumanMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders import MWDumpLoader
from langchain_core.prompts import ChatPromptTemplate

from langchain_community.llms import Ollama
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_transformers.embeddings_redundant_filter import EmbeddingsRedundantFilter
from langchain_community.vectorstores import Chroma
from langchain.chains import ConversationalRetrievalChain
from langchain_core.prompts import HumanMessagePromptTemplate
from langchain_core.prompts import SystemMessagePromptTemplate
from langchain.memory import ConversationBufferMemory
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.embeddings import OllamaEmbeddings
from langchain.utils.html import (PREFIXES_TO_IGNORE_REGEX,
                                  SUFFIXES_TO_IGNORE_REGEX)

from langchain.output_parsers import ResponseSchema, StructuredOutputParser
from langchain_core.prompts import PromptTemplate
from enum import Enum

text_model = "llama3"  # ollama
embedding_model = "all-MiniLM-L6-v2"  # huggingface
wikilocation = os.environ.get("RAG_WIKI_LOCATION", "wiki/current.xml")  # mediawiki xml to index
index_dir = "./index"
date = datetime.datetime.now().strftime("%Y-%m-%d")

conversation = None


ollamaUrl = os.environ.get("OLLAMA_HOST", "http://localhost:11434")


class Emotion(str, Enum):
    NEUTRAL = "neutral"
    HAPPY = "happy"
    SAD = "sad"
    ANGRY = "angry"
    SURPRISED = "surprised"
    CONFUSED = "confused"
    EXCITED = "excited"
    CALM = "calm"


class Action(str, Enum):
    NOTHING = "nothing"
    STUTTER = "stutter"
    SQUEAL = "squeal"
    MEWOW = "mewow"
    SMUG = "smug"
    WAGS_TAIL = "wags_tail"
    WINK = "wink"
    NOD = "nod"
    LAUGH = "laugh"
    SIGH = "sigh"
    GLOOMY = "gloomy"
    LOOK_AWAY = "look_away"
    LOOK_TOWARDS_YOU = "look_towards_you"


def chat(question="Hello, how are you today?", image_description=""):
    # example with rag
    global conversation

    # TODO: implement chat history and memory, with storage to disk
    chat_history = []

    response = conversation({"question": question, "image_description": image_description, "chat_history": chat_history, "format_instructions": format_instructions, "date": date})
    # print(response)
    answer = response['answer']

    result = output_parser.parse(answer)

    # enforce keys
    try:
        result["emotion"] = Emotion(result["emotion"])
    except (KeyError, ValueError):
        print(f"Could not parse emotion: {result.get('emotion')}")
        result["emotion"] = Emotion.NEUTRAL

    for i in range(len(result["actions"])):
        try:
            result["actions"][i] = Action(result["actions"][i])
        except ValueError:
            print(f"Could not parse action: {result['actions'][i]}")
            result["actions"][i] = Action.NOTHING

    # print(result)

    return result


# some toy functions to interact with the llm
endstring = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>"  # the llama3 end tokens do not seem to be properly implemented in langchain yet


def simple_stream(prompt="A question to ask the model", temperature=0.5):
    llm = Ollama(model=text_model, temperature=temperature, base_url=ollamaUrl, stop=["<|start_header_id|>", "<|end_header_id|>", "<|eot_id|>", "<|reserved_special_token|>"])  # stop tokens need to be given manually for llama3 for now
    return llm.stream(prompt)


def simple(prompt="A question to ask the model", temperature=0.5):
    stream = simple_stream(prompt, temperature)
    result = ""
    for line in stream:
        result += line.rstrip("\n")
        if result.endswith(endstring):
            result = result.replace(endstring, "")
            return result
    return result


# https://scribe.rip/rahasak/build-rag-application-using-a-llm-running-on-local-computer-with-ollama-and-langchain-e6513853fda0
def init_index():
    # remove the current index
    if os.path.exists(index_dir):
        shutil.rmtree(index_dir)

    # Load data from MediaWiki dump
    documents = MWDumpLoader(wikilocation).load()
    # TODO: add chat history to the documents

    # Split text
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    documents = text_splitter.split_documents(documents)

    # Apply the redundant filter
    embeddings = HuggingFaceEmbeddings(
        model_name=embedding_model,
        multi_process=True,
        # encode_kwargs={"normalize_embeddings": True},
    )
    redundant_filter = EmbeddingsRedundantFilter(embeddings=embeddings)
    documents = redundant_filter.transform_documents(documents)
    vectordb = Chroma.from_documents(
        documents=documents,
        embedding=embeddings,
        persist_directory=index_dir,
        collection_name="pvv-wiki"
    )
    vectordb.persist()


response_schemas = [
    ResponseSchema(
        name="response",
        description="reply to the user's question or statement.",
    ),
    ResponseSchema(
        name="emotion",
        description=f"emotion expressed in the response, selected from a set of possible options {list(str(e.value) for e in Emotion)}",
        type="Emotion",
    ),
    ResponseSchema(
        name="actions",
        description=f"List of actions to take at random in response to the user's question or statement from the set {list(str(a.value) for a in Action)}",
        type="List[Action]",
    ),
]

output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
format_instructions = output_parser.get_format_instructions()
general_system_template = r"""
Given a specific context, please give a short answer to the question, and use relevant context to try and find a possible outcome. If the data does not help, be uncertain in the final answer.
The current date is {date}
----
{context}
----
You may refer to what you can see in front of you in the description below. Any reference to the camera or image should be interpreted as "you", "your eyes" or what you can see:
{image_description}
____
Do not refer to yourself as an AI.
Avoid expressions in the response.

You are a cute anime character named pvv chan, you like programming, linux, opensource and board games.
Without referring to yourself, reply to the human talking to you.
{format_instructions}
"""
general_user_template = "Question:```{question}```"
messages = [
    SystemMessagePromptTemplate.from_template(general_system_template),
    HumanMessagePromptTemplate.from_template(general_user_template)
]
qa_prompt = ChatPromptTemplate.from_messages(messages)


def init_chat():
    global conversation

    # load index from local directory
    embeddings = HuggingFaceEmbeddings(
        model_name=embedding_model,
        multi_process=True,
        # encode_kwargs={"normalize_embeddings": True},
    )
    vectordb = Chroma(persist_directory=index_dir, embedding_function=embeddings)

    llm = Ollama(
        model=text_model,
        base_url=ollamaUrl,
        verbose=True,
    )

    # create conversation
    conversation = ConversationalRetrievalChain.from_llm(
        llm,
        retriever=vectordb.as_retriever(search_kwargs={"k": 2}),  # number of documents to use for the response
        # retriever=vectordb.as_retriever(),
        return_source_documents=True,
        verbose=True,
        combine_docs_chain_kwargs={"prompt": qa_prompt},
    )


if __name__ == "__main__":
    # print(simple(prompt="What is the meaning of life? (answer short)"))

    print("initialising index")
    # init_index()
    print("initialising chat")
    init_chat()

    print("chatting")
    print(chat(question="Hello, how are you today? What is our dns server named?"))
@ -0,0 +1,83 @@
import os
import subprocess
import multiprocessing
import atexit

audio_device = os.getenv("WHISPER_AUDIO_DEVICE", "-1")
whisper_model = os.getenv("WHISPER_MODEL_PATH", "models/ggml-tiny.bin")
command = ["whisper-cpp-stream", "-kc", "-m", whisper_model, "-c", audio_device, "-t", "4"]

filter_strings = ["*", "\r", "\n", "\t", "(inaudible)", "[BLANK_AUDIO]", "[Start speaking]", "(gunshot)", "(wind howling)", "[Music]", "(footsteps)"]  # Example strings to filter out


class SharedString:
    def __init__(self):
        manager = multiprocessing.Manager()
        self.namespace = manager.Namespace()
        self.namespace.value = ""
        self.lock = manager.Lock()  # shared lock so both processes see a consistent value

    def get_value(self):
        with self.lock:
            return self.namespace.value

    def set_value(self, new_value):
        with self.lock:
            self.namespace.value = new_value

    def append(self, append_value):
        with self.lock:
            self.namespace.value += append_value


buffer = SharedString()
process = None
process_thread = None


def read_output(proc, buffer):
    while True:
        output = proc.stdout.readline()
        if output == b"" and proc.poll() is not None:
            break
        if output:
            # print(output.decode("utf-8"))
            buffer.append(output.decode("utf-8"))


def start():
    global process, process_thread
    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    process_thread = multiprocessing.Process(target=read_output, args=(process, buffer))
    process_thread.start()

    # Register cleanup function to be called when script exits
    atexit.register(stop)


def stop():
    global process, process_thread
    if process:
        process.terminate()
        process_thread.join()
        process = None
        process_thread = None


def filter_buffer(data):
    for f_str in filter_strings:
        data = data.replace(f_str, "")
    return data.strip()


def get_buffer():
    data = buffer.get_value()
    buffer.set_value("")
    return filter_buffer(data)
    # return data


def main():
    start()

    try:
        while process.poll() is None:
            data = get_buffer()
            if data:
                print(data)
    except KeyboardInterrupt:
        stop()


if __name__ == "__main__":
    main()
@ -0,0 +1,21 @@
import subprocess
import os

piper_model_path = os.getenv("PIPER_MODEL_PATH")
piper_model_json_path = os.getenv("PIPER_MODEL_JSON_PATH")


def speak(text):
    # some text cleanup: strip characters that confuse piper or the shell pipeline below
    illegal_chars = ["\n", "\r", "\t", "*", "`", "[", "]", "{", "}", "\"", "\'"]
    for char in illegal_chars:
        text = text.replace(char, "")

    # remove emojis and other non-ascii characters
    text = text.encode('ascii', 'ignore').decode('ascii')

    command = f"echo \"{text}. \" | piper -q -m {piper_model_path} -c {piper_model_json_path} --output-raw | aplay -q -r 22050 -f S16_LE -t raw -"
    subprocess.run(command, shell=True, check=True)


if __name__ == "__main__":
    speak("Hello, world. This is a tts test.")