Source code for darca_embeddings.embedding

import os
from abc import ABC, abstractmethod

import openai
from darca_exception.exception import DarcaException
from darca_log_facility.logger import DarcaLogger
from openai import OpenAIError

# === Custom Exceptions ===


class EmbeddingException(DarcaException):
    """
    Base class for all darca-embeddings exceptions.

    This exception can be extended or raised for any errors that occur
    within the darca-embeddings package.
    """


class EmbeddingAPIKeyMissing(EmbeddingException):
    """
    Raised when the API key is missing for the selected embeddings provider.

    Typically occurs when the environment variable (e.g. ``OPENAI_API_KEY``)
    is not set or is invalid.
    """


class EmbeddingResponseError(EmbeddingException):
    """
    Raised when an embeddings API request fails or returns malformed data.

    For example, it may be raised if the network request times out, the API
    returns a non-200 status code, or the response is not in the expected
    format.
    """

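
# Illustrative usage (a sketch, not part of this module): because both
# EmbeddingAPIKeyMissing and EmbeddingResponseError derive from
# EmbeddingException, callers can catch the base class to handle either case:
#
#     try:
#         vector = EmbeddingClient().get_embedding("hello world")
#     except EmbeddingException as exc:
#         print(f"Embedding failed: {exc}")
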
# === Abstract Base Client ===
class BaseEmbeddingClient(ABC):
    """
    Abstract base class for embedding providers.

    Implementations of this class must provide methods to generate
    embeddings from text input, typically by interacting with an external
    model or service.
    """

    @abstractmethod
    def get_embedding(self, text: str) -> list[float]:
        """
        Generate a vector embedding for a single piece of text.

        :param text: The input string to be embedded.
        :type text: str
        :returns: A floating-point vector (list) representation of the
            input text.
        :rtype: list[float]
        :raises EmbeddingException: If something goes wrong with the
            embedding request.
        """
        raise NotImplementedError("Must be overridden in subclass.")

    def get_embeddings(self, texts: list[str]) -> list[list[float]]:
        """
        Generate vector embeddings for a list of input texts.

        By default, this method calls :meth:`get_embedding` for each string
        in ``texts``. Subclasses may override this method to provide more
        efficient batch processing.

        :param texts: A list of input strings to embed.
        :type texts: list[str]
        :returns: A list of embedding vectors, where each vector is a list
            of floats.
        :rtype: list[list[float]]
        :raises EmbeddingException: If something goes wrong while generating
            embeddings.
        """
        return [self.get_embedding(t) for t in texts]

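
# Sketch of a custom provider (hypothetical, not shipped with this package):
# a subclass only needs to implement get_embedding(); the inherited
# get_embeddings() calls it once per input text unless overridden:
#
#     class WordCountEmbeddingClient(BaseEmbeddingClient):
#         def get_embedding(self, text: str) -> list[float]:
#             # Toy two-dimensional "embedding", for illustration only.
#             return [float(len(text)), float(len(text.split()))]
#
#     client = WordCountEmbeddingClient()
#     client.get_embeddings(["hello", "hello world"])  # [[5.0, 1.0], [11.0, 2.0]]
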
# === OpenAI Implementation ===
class OpenAIEmbeddingClient(BaseEmbeddingClient):
    """
    Embedding provider that uses OpenAI's Embedding API.

    This client relies on the ``OPENAI_API_KEY`` environment variable for
    authentication. It uses the default model ``text-embedding-ada-002``
    unless otherwise specified.

    :param model: The OpenAI Embedding model to use.
    :type model: str
    :raises EmbeddingAPIKeyMissing: If the required API key environment
        variable is not found.
    """

    def __init__(self, model="text-embedding-ada-002"):
        self.logger = DarcaLogger("darca-embeddings").get_logger()
        api_key = os.getenv("OPENAI_API_KEY")
        if not api_key:
            raise EmbeddingAPIKeyMissing(
                message="OPENAI_API_KEY environment variable is not set.",
                error_code="EMBEDDING_API_KEY_MISSING",
                metadata={"provider": "openai"},
            )
        openai.api_key = api_key
        self.model = model

    def get_embedding(self, text: str) -> list[float]:
        """
        Generate an embedding for a single piece of text using OpenAI.

        :param text: The input text to be embedded.
        :type text: str
        :returns: A list of floats representing the embedding of the text.
        :rtype: list[float]
        :raises EmbeddingResponseError:
            - If the OpenAI API call fails or returns an unexpected format.
            - If an unknown error occurs during processing.
        """
        try:
            self.logger.debug(
                "Requesting embedding from OpenAI",
                extra={"model": self.model},
            )
            response = openai.embeddings.create(input=text, model=self.model)

            # Each 'data' item has an 'embedding' list
            embedding = response.data[0].embedding

            self.logger.debug("Received embedding from OpenAI")
            return embedding
        except OpenAIError as oe:
            raise EmbeddingResponseError(
                message="OpenAI embedding API returned an error.",
                error_code="EMBEDDING_API_REQUEST_FAILED",
                metadata={"model": self.model, "text_preview": text[:100]},
                cause=oe,
            )
        except Exception as e:
            raise EmbeddingResponseError(
                message=(
                    "Unexpected failure during OpenAI "
                    "embedding response parsing."
                ),
                error_code="EMBEDDING_RESPONSE_PARSE_ERROR",
                metadata={"model": self.model},
                cause=e,
            )

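
# Usage sketch (assumes OPENAI_API_KEY is set in the environment; otherwise
# the constructor raises EmbeddingAPIKeyMissing):
#
#     client = OpenAIEmbeddingClient(model="text-embedding-ada-002")
#     vector = client.get_embedding("The quick brown fox")
#     print(len(vector))  # 1536 dimensions for text-embedding-ada-002
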
# === EmbeddingClient Wrapper ===
class EmbeddingClient:
    """
    A unified client for generating embeddings using the darca pluggable
    backend system. Defaults to OpenAI.

    This class wraps the functionality of various embedding providers
    behind a single interface. By specifying the ``backend`` argument, you
    can switch between multiple embeddings backends (e.g., OpenAI,
    HuggingFace, etc.).

    :param backend: The backend provider to use, e.g. ``openai`` or
        ``huggingface``.
    :type backend: str
    :param kwargs: Additional keyword arguments passed to the backend
        client's constructor.
    :raises EmbeddingException: If the chosen backend is not implemented or
        unsupported.
    """

    def __init__(self, backend: str = "openai", **kwargs):
        """
        Initialize the EmbeddingClient with the given backend and model
        settings.

        :param backend: The embedding backend to use (default: ``openai``).
        :type backend: str
        :param kwargs: Additional parameters for the backend's constructor.
        """
        if backend == "openai":
            self._client = OpenAIEmbeddingClient(**kwargs)
        elif backend == "huggingface":
            # Placeholder for a future Hugging Face implementation
            raise EmbeddingException(
                message=(
                    f"Embedding backend '{backend}' is"
                    f" not implemented yet."
                ),
                error_code="EMBEDDING_UNIMPLEMENTED_BACKEND",
                metadata={"requested_backend": backend},
            )
        else:
            raise EmbeddingException(
                message=f"Embedding backend '{backend}' is not supported.",
                error_code="EMBEDDING_UNSUPPORTED_BACKEND",
                metadata={"requested_backend": backend},
            )

    def __getattr__(self, name):
        """
        Delegate calls to the selected backend client.

        For instance, if you call :meth:`get_embedding` on an
        :class:`EmbeddingClient` instance, it will route to the current
        backend's implementation (e.g. :class:`OpenAIEmbeddingClient`).

        :param name: Name of the method or attribute to be retrieved.
        :type name: str
        :returns: The underlying backend client's method or attribute.
        """
        return getattr(self._client, name)
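
# Usage sketch for the unified wrapper (defaults to the OpenAI backend);
# attribute access such as get_embedding()/get_embeddings() is delegated to
# the selected backend via __getattr__:
#
#     client = EmbeddingClient(backend="openai")
#     vectors = client.get_embeddings(["first document", "second document"])
#
#     # Requesting an unavailable backend raises EmbeddingException:
#     # EmbeddingClient(backend="huggingface")  # EMBEDDING_UNIMPLEMENTED_BACKEND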