diff --git a/python/zvec/extension/__init__.py b/python/zvec/extension/__init__.py index 597f91be..cc9401f8 100644 --- a/python/zvec/extension/__init__.py +++ b/python/zvec/extension/__init__.py @@ -15,6 +15,8 @@ from .bm25_embedding_function import BM25EmbeddingFunction from .embedding_function import DenseEmbeddingFunction, SparseEmbeddingFunction +from .jina_embedding_function import JinaDenseEmbedding +from .jina_function import JinaFunctionBase from .multi_vector_reranker import RrfReRanker, WeightedReRanker from .openai_embedding_function import OpenAIDenseEmbedding from .openai_function import OpenAIFunctionBase @@ -35,6 +37,8 @@ "DefaultLocalReRanker", "DefaultLocalSparseEmbedding", "DenseEmbeddingFunction", + "JinaDenseEmbedding", + "JinaFunctionBase", "OpenAIDenseEmbedding", "OpenAIFunctionBase", "QwenDenseEmbedding", diff --git a/python/zvec/extension/jina_embedding_function.py b/python/zvec/extension/jina_embedding_function.py new file mode 100644 index 00000000..2f8b02aa --- /dev/null +++ b/python/zvec/extension/jina_embedding_function.py @@ -0,0 +1,240 @@ +# Copyright 2025-present the zvec project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class JinaDenseEmbedding(JinaFunctionBase, DenseEmbeddingFunction[TEXT]):
    """Dense text embedding function backed by the Jina AI Embeddings API.

    Text is converted into a dense vector by calling the Jina Embeddings API
    through the OpenAI-compatible client provided by ``JinaFunctionBase``.

    Jina Embeddings v5 models accept a ``task`` parameter that optimizes the
    embedding for a use case (retrieval, text matching, classification, ...)
    and support Matryoshka Representation Learning, i.e. a reduced output
    dimension can be requested.

    Args:
        model (str, optional): Jina embedding model identifier.
            Defaults to ``"jina-embeddings-v5-text-nano"``. Known models:
            - ``"jina-embeddings-v5-text-nano"``: 768 dims, 8K context
            - ``"jina-embeddings-v5-text-small"``: 1024 dims, 32K context
        dimension (Optional[int], optional): Desired output dimension.
            If ``None``, the model's default dimension is used (768 is
            assumed for models missing from ``_MODEL_DIMENSIONS``).
            Matryoshka dimensions: 32, 64, 128, 256, 512, 768 (nano) /
            1024 (small). Must be positive when given.
        api_key (Optional[str], optional): Jina API key. If ``None``, the
            ``JINA_API_KEY`` environment variable is used.
            Obtain a key from: https://jina.ai/api-dashboard
        task (Optional[str], optional): Task type to optimize embeddings for.
            Defaults to ``None``. Valid values:
            - ``"retrieval.query"``: For search queries
            - ``"retrieval.passage"``: For documents/passages to be searched
            - ``"text-matching"``: For symmetric text similarity
            - ``"classification"``: For text classification
            - ``"separation"``: For clustering/separation tasks
        **kwargs: Extra parameters. They are stored and exposed through
            ``extra_params``; this class does not forward them to the
            embedding request.

    Attributes:
        dimension (int): The embedding vector dimension.
        model (str): The Jina model name being used.
        task (Optional[str]): The task type for embedding optimization.
        extra_params (dict): The ``**kwargs`` given at construction time.

    Raises:
        ValueError: If no API key is available, if ``task`` is not a valid
            task type, or if ``dimension`` is not positive.

    Note:
        - Requires Python 3.10, 3.11, or 3.12
        - Requires the ``openai`` package: ``pip install openai``
        - Results of ``embed`` are memoized **per instance** (LRU,
          ``maxsize=10``). ``instance.embed.cache_clear()`` and
          ``instance.embed.cache_info()`` are available.
        - The cached list object itself is returned on a cache hit; copy it
          before mutating.
        - For retrieval, use ``"retrieval.query"`` for queries and
          ``"retrieval.passage"`` for documents.

    Examples:
        >>> from zvec.extension import JinaDenseEmbedding
        >>> import os
        >>> os.environ["JINA_API_KEY"] = "jina_..."
        >>>
        >>> emb_func = JinaDenseEmbedding()
        >>> vector = emb_func.embed("Hello, world!")
        >>> len(vector)
        768

        >>> # Retrieval use case: embed queries and documents differently
        >>> query_emb = JinaDenseEmbedding(task="retrieval.query")
        >>> doc_emb = JinaDenseEmbedding(task="retrieval.passage")

        >>> # Larger model with a custom (Matryoshka) dimension
        >>> emb_func = JinaDenseEmbedding(
        ...     model="jina-embeddings-v5-text-small",
        ...     dimension=256,
        ...     api_key="jina_...",
        ...     task="text-matching",
        ... )
        >>> len(emb_func.embed("Semantic similarity comparison"))
        256

    See Also:
        - ``DenseEmbeddingFunction``: Base class for dense embeddings
        - ``OpenAIDenseEmbedding``: Alternative using OpenAI API
        - ``QwenDenseEmbedding``: Alternative using Qwen/DashScope API
        - ``DefaultLocalDenseEmbedding``: Local model without API calls
    """

    # Number of distinct inputs memoized by each instance's ``embed``.
    _EMBED_CACHE_SIZE = 10

    def __init__(
        self,
        model: str = "jina-embeddings-v5-text-nano",
        dimension: Optional[int] = None,
        api_key: Optional[str] = None,
        task: Optional[str] = None,
        **kwargs,
    ):
        """Initialize the Jina dense embedding function.

        Args:
            model (str): Jina model name. Defaults to "jina-embeddings-v5-text-nano".
            dimension (Optional[int]): Target embedding dimension or None for default.
            api_key (Optional[str]): API key or None to use environment variable.
            task (Optional[str]): Task type for embedding optimization or None.
            **kwargs: Extra parameters, stored and exposed via ``extra_params``.

        Raises:
            ValueError: If API key is not provided and not in environment,
                if task is not a valid task type, or if ``dimension`` is
                given but not positive.
        """
        # Function-scope import: keeps the per-instance cache setup
        # self-contained regardless of the module's import block.
        from functools import lru_cache

        # Base class validates the API key and task and stores the model.
        JinaFunctionBase.__init__(self, model=model, api_key=api_key, task=task)

        # Fail fast instead of sending an impossible dimension to the API.
        if dimension is not None and dimension <= 0:
            raise ValueError(
                f"'dimension' must be a positive integer, got {dimension!r}"
            )

        # ``_custom_dimension`` is what gets sent to the API (None = omit);
        # ``_dimension`` is what callers see and what responses are checked against.
        self._custom_dimension = dimension
        if dimension is None:
            self._dimension = self._MODEL_DIMENSIONS.get(model, 768)
        else:
            self._dimension = dimension

        self._extra_params = kwargs

        # Per-instance memoization. Decorating the method with ``lru_cache``
        # at class level would key the cache on ``self``, keep every instance
        # alive for the life of the class, and make all instances compete for
        # the same slots. Wrapping the bound method here ties the cache's
        # lifetime and capacity to this instance while keeping the familiar
        # ``embed.cache_clear()`` / ``embed.cache_info()`` helpers.
        self.embed = lru_cache(maxsize=self._EMBED_CACHE_SIZE)(  # type: ignore[method-assign]
            self.embed
        )

    @property
    def dimension(self) -> int:
        """int: The expected dimensionality of the embedding vector."""
        return self._dimension

    @property
    def extra_params(self) -> dict:
        """dict: Extra keyword arguments supplied at construction time."""
        return self._extra_params

    def __call__(self, input: TEXT) -> DenseVectorType:
        """Make the embedding function callable; equivalent to ``embed(input)``."""
        return self.embed(input)

    def embed(self, input: TEXT) -> DenseVectorType:
        """Generate a dense embedding vector for the input text.

        On instances built through ``__init__`` this method is wrapped in a
        per-instance LRU cache, so repeated identical inputs do not trigger
        another API call.

        Args:
            input (TEXT): Input text string to embed. Must be non-empty after
                stripping whitespace. Maximum length depends on model:
                8192 tokens for v5-nano, 32768 tokens for v5-small.

        Returns:
            DenseVectorType: A list of floats representing the embedding
            vector. Length equals ``self.dimension``.

        Raises:
            TypeError: If ``input`` is not a string.
            ValueError: If input is empty/whitespace-only, if the API returns
                a malformed response, or if the returned vector length does
                not match ``self.dimension``.
            RuntimeError: If the API call itself fails.

        Note:
            - The cache key is the exact argument as passed (before
              stripping), so ``"a"`` and ``" a"`` are cached separately.
            - Calls that raise are not cached.
        """
        if not isinstance(input, TEXT):
            raise TypeError(f"Expected 'input' to be str, got {type(input).__name__}")

        text = input.strip()
        if not text:
            raise ValueError("Input text cannot be empty or whitespace only")

        embedding_vector = self._call_text_embedding_api(
            input=text,
            dimension=self._custom_dimension,
        )

        # Guard against a model/dimension combination whose real output size
        # differs from what this instance advertises via ``dimension``.
        if len(embedding_vector) != self.dimension:
            raise ValueError(
                f"Dimension mismatch: expected {self.dimension}, "
                f"got {len(embedding_vector)}"
            )

        return embedding_vector
class JinaFunctionBase:
    """Base class for Jina AI functions.

    Provides the shared plumbing for calling the Jina Embeddings API through
    the OpenAI-compatible ``openai`` Python client: credential and task
    validation, client construction, request assembly and response parsing.

    This class is not meant to be used directly. Use concrete implementations:
    - ``JinaDenseEmbedding`` for dense embeddings

    Args:
        model (str): Jina embedding model identifier.
        api_key (Optional[str]): Jina API authentication key.
        task (Optional[str]): Task type for the embedding model.

    Note:
        - This is an internal base class for code reuse across Jina features
        - Subclasses should inherit from appropriate Protocol
        - Jina API is OpenAI-compatible, using the ``openai`` Python client
    """

    _BASE_URL: ClassVar[str] = "https://api.jina.ai/v1"

    # Default output dimension per known model.
    _MODEL_DIMENSIONS: ClassVar[dict[str, int]] = {
        "jina-embeddings-v5-text-nano": 768,
        "jina-embeddings-v5-text-small": 1024,
    }

    # Maximum input tokens per known model (reference data; not read by the
    # methods of this class).
    _MODEL_MAX_TOKENS: ClassVar[dict[str, int]] = {
        "jina-embeddings-v5-text-nano": 8192,
        "jina-embeddings-v5-text-small": 32768,
    }

    # Task identifiers accepted by ``__init__``.
    _VALID_TASKS: ClassVar[tuple[str, ...]] = (
        "retrieval.query",
        "retrieval.passage",
        "text-matching",
        "classification",
        "separation",
    )

    def __init__(
        self,
        model: str,
        api_key: Optional[str] = None,
        task: Optional[str] = None,
    ):
        """Initialize the base Jina functionality.

        Args:
            model (str): Jina model name.
            api_key (Optional[str]): API key or None to use the
                ``JINA_API_KEY`` environment variable.
            task (Optional[str]): Task type for the embedding model.
                Valid values: "retrieval.query", "retrieval.passage",
                "text-matching", "classification", "separation".

        Raises:
            ValueError: If API key is not provided and not in environment,
                or if task is not a valid task type.
        """
        self._model = model
        self._api_key = api_key or os.environ.get("JINA_API_KEY")
        self._task = task

        if not self._api_key:
            raise ValueError(
                "Jina API key is required. Please provide 'api_key' parameter "
                "or set the 'JINA_API_KEY' environment variable. "
                "Get your key from: https://jina.ai/api-dashboard"
            )

        if task is not None and task not in self._VALID_TASKS:
            raise ValueError(
                f"Invalid task '{task}'. Valid tasks: {', '.join(self._VALID_TASKS)}"
            )

    @property
    def model(self) -> str:
        """str: The Jina model name currently in use."""
        return self._model

    @property
    def task(self) -> Optional[str]:
        """Optional[str]: The task type for the embedding model."""
        return self._task

    def _get_client(self):
        """Get OpenAI-compatible client instance configured for Jina API.

        Returns:
            OpenAI: Configured OpenAI client pointing to Jina API.

        Raises:
            ImportError: If openai package is not installed.
        """
        openai = require_module("openai")
        return openai.OpenAI(api_key=self._api_key, base_url=self._BASE_URL)

    def _call_text_embedding_api(
        self,
        input: TEXT,
        dimension: Optional[int] = None,
    ) -> list:
        """Call the Jina Embeddings API and return the embedding vector.

        Args:
            input (TEXT): Input text to embed.
            dimension (Optional[int]): Target dimension for Matryoshka
                embeddings. Omitted from the request when ``None``.

        Returns:
            list: Embedding vector as list of floats.

        Raises:
            ImportError: If the ``openai`` package is not installed.
            RuntimeError: If the API call fails.
            ValueError: If the API returns an empty or malformed response.
        """
        # Assemble the request up front so the ``try`` only covers the calls
        # that can actually fail.
        params = {"model": self.model, "input": input}
        if dimension is not None:
            # OpenAI-style name of the Matryoshka output-size parameter.
            params["dimensions"] = dimension
        if self._task is not None:
            # ``task`` is Jina-specific, so it travels in ``extra_body``.
            params["extra_body"] = {"task": self._task}

        try:
            client = self._get_client()
            response = client.embeddings.create(**params)
        except ImportError:
            # A missing ``openai`` package is a setup problem, not an API
            # failure: surface it unchanged instead of relying on a second
            # failing import inside the generic handler below.
            raise
        except Exception as e:
            openai = require_module("openai")
            # ``APIConnectionError`` derives from ``APIError``, so one check
            # covers both.
            if isinstance(e, openai.APIError):
                raise RuntimeError(f"Failed to call Jina API: {e!s}") from e
            raise RuntimeError(f"Unexpected error during API call: {e!s}") from e

        return self._extract_embedding(response)

    @staticmethod
    def _extract_embedding(response) -> list:
        """Pull the first embedding vector out of an embeddings response.

        Raises:
            ValueError: If the response has no data, the embedding is not a
                list, or the response does not have the expected shape.
        """
        try:
            if not response.data:
                raise ValueError("Invalid API response: no embedding data returned")

            embedding_vector = response.data[0].embedding

            if not isinstance(embedding_vector, list):
                raise ValueError(
                    "Invalid API response: embedding is not a list of numbers"
                )

            return embedding_vector

        except (AttributeError, IndexError, TypeError) as e:
            raise ValueError(f"Failed to parse API response: {e!s}") from e