# Reconstructed from the collapsed patch against
# bigframes/bigquery/_operations/ai.py. The patch also widens the module's
# typing import so that ``Optional`` is available:
from typing import Any, Iterable, List, Literal, Mapping, Optional, Tuple, Union


def _numeric_sql_literal(name: str, value: object, *, integral: bool = False) -> str:
    """Render ``value`` as a SQL numeric literal for the option ``name``.

    Args:
        name: Option name, used only in error messages.
        value: The user-supplied option value.
        integral: When True, only integer values are accepted.

    Returns:
        ``str(value)``, which is safe to splice into SQL because the value
        has been verified to be a finite number.

    Raises:
        ValueError: If ``value`` is a bool, is not numeric, or is NaN/inf.
    """
    import math
    import numbers

    expected = numbers.Integral if integral else numbers.Real
    # bool is an int subclass but would render as ``True``/``False`` in SQL.
    if isinstance(value, bool) or not isinstance(value, expected):
        kind = "an integer" if integral else "a number"
        raise ValueError(f"{name} must be {kind}, got {type(value)}")
    # ``nan``/``inf`` would be emitted as bare identifiers, not literals.
    if not math.isfinite(value):
        raise ValueError(f"{name} must be finite, got {value!r}")
    return f"{value}"


@log_adapter.method_logger(custom_base_name="bigquery_ai")
def generate_embedding(
    model_name: str,
    data: Union[dataframe.DataFrame, series.Series, pd.DataFrame, pd.Series],
    *,
    output_dimensionality: Optional[int] = None,
    start_second: Optional[float] = None,
    end_second: Optional[float] = None,
    interval_seconds: Optional[float] = None,
) -> dataframe.DataFrame:
    """
    Creates embeddings that describe an entity—for example, a piece of text or an image.

    **Examples:**

        >>> import bigframes.pandas as bpd
        >>> import bigframes.bigquery as bbq
        >>> df = bpd.DataFrame({"content": ["apple", "bear", "pear"]})
        >>> bbq.ai.generate_embedding(
        ...     "project.dataset.model_name",
        ...     df
        ... )  # doctest: +SKIP

    Args:
        model_name (str):
            The name of a remote model over a Vertex AI multimodalembedding@001 model.
            It is placed between backticks in the generated SQL, so it must not
            itself contain a backtick.
        data (bigframes or pandas DataFrame or Series):
            The data to generate embeddings for. pandas objects are first loaded
            with ``read_pandas``. If a Series is provided, it is renamed to
            'content' and converted to a single-column DataFrame. If a DataFrame
            is provided, it is passed through unchanged; it is expected to contain
            a 'content' column (rename the column you wish to embed to 'content').
            This function does not check for that column itself.
        output_dimensionality (int, optional):
            The number of dimensions to use when generating embeddings. Valid values are 128, 256, 512, and 1408. The default value is 1408.
        start_second (float, optional):
            The second in the video at which to start the embedding. The default value is 0.
        end_second (float, optional):
            The second in the video at which to end the embedding. The default value is 120.
        interval_seconds (float, optional):
            The interval to use when creating embeddings. The default value is 16.

    Returns:
        bigframes.dataframe.DataFrame:
            A new DataFrame with the generated embeddings. It contains the input table columns and the following columns:
            * "embedding": an ARRAY value that contains the generated embedding vector.
            * "status": a STRING value that contains the API response status for the corresponding row.
            * "video_start_sec": for video content, an INT64 value that contains the starting second.
            * "video_end_sec": for video content, an INT64 value that contains the ending second.

    Raises:
        ValueError:
            If ``model_name`` is empty, is not a string, or contains a backtick;
            if any option is not a finite number (``output_dimensionality`` must
            be an integer); or if ``data`` is not a supported type.
    """
    # Validate everything that gets spliced into SQL before doing any work, so
    # bad arguments fail fast and cannot alter the shape of the query.
    if not isinstance(model_name, str) or not model_name or "`" in model_name:
        raise ValueError(
            "model_name must be a non-empty string without backticks, "
            f"got {model_name!r}"
        )

    option_specs = (
        ("output_dimensionality", output_dimensionality, True),
        ("start_second", start_second, False),
        ("end_second", end_second, False),
        ("interval_seconds", interval_seconds, False),
    )
    # Only explicitly supplied options are sent; with none supplied this is an
    # empty string and the query carries an empty ``STRUCT()``.
    struct_args = ", ".join(
        f"{_numeric_sql_literal(name, value, integral=integral)} AS {name}"
        for name, value, integral in option_specs
        if value is not None
    )

    if isinstance(data, (pd.DataFrame, pd.Series)):
        data = bpd.read_pandas(data)

    if isinstance(data, series.Series):
        # Rename series to 'content' and convert to DataFrame
        data_df = data.rename("content").to_frame()
    elif isinstance(data, dataframe.DataFrame):
        data_df = data
    else:
        raise ValueError(f"Unsupported data type: {type(data)}")

    # We need to get the SQL for the input data to pass as a subquery to the TVF
    source_sql = data_df.sql

    # Construct the TVF query
    query = f"""
    SELECT *
    FROM AI.GENERATE_EMBEDDING(
        MODEL `{model_name}`,
        ({source_sql}),
        STRUCT({struct_args})
    )
    """

    return data_df._session.read_gbq(query)
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Unit tests for ``generate_embedding`` in tests/unit/bigquery/test_ai.py.

All BigFrames objects are autospecced mocks, so these tests only check the
SQL text handed to ``Session.read_gbq``; nothing is sent to BigQuery.
"""

from unittest import mock

import pandas as pd
import pytest

import bigframes.bigquery._operations.ai as ai_ops
import bigframes.dataframe
import bigframes.series
import bigframes.session

MODEL_NAME = "project.dataset.model"


def _submitted_query(session) -> str:
    """Return the single query passed to ``read_gbq``, whitespace-normalized."""
    session.read_gbq.assert_called_once()
    query = session.read_gbq.call_args[0][0]
    # Collapse newlines/indentation so assertions don't depend on layout.
    return " ".join(query.split())


@pytest.fixture
def mock_session():
    # instance=True: the code under test receives a session object, not the class.
    return mock.create_autospec(spec=bigframes.session.Session, instance=True)


@pytest.fixture
def mock_dataframe(mock_session):
    df = mock.create_autospec(spec=bigframes.dataframe.DataFrame, instance=True)
    df._session = mock_session
    df.sql = "SELECT * FROM my_table"
    return df


@pytest.fixture
def mock_series(mock_session):
    s = mock.create_autospec(spec=bigframes.series.Series, instance=True)
    s._session = mock_session
    # Mock to_frame to return a mock dataframe
    df = mock.create_autospec(spec=bigframes.dataframe.DataFrame, instance=True)
    df._session = mock_session
    df.sql = "SELECT my_col AS content FROM my_table"
    s.rename.return_value.to_frame.return_value = df
    return s


def test_generate_embedding_with_dataframe(mock_dataframe, mock_session):
    ai_ops.generate_embedding(
        MODEL_NAME,
        mock_dataframe,
        output_dimensionality=256,
    )

    query = _submitted_query(mock_session)

    assert "SELECT * FROM AI.GENERATE_EMBEDDING(" in query
    assert f"MODEL `{MODEL_NAME}`," in query
    assert "(SELECT * FROM my_table)," in query
    assert "STRUCT(256 AS output_dimensionality)" in query


def test_generate_embedding_with_series(mock_series, mock_session):
    ai_ops.generate_embedding(
        MODEL_NAME,
        mock_series,
        start_second=0.0,
        end_second=10.0,
        interval_seconds=5.0,
    )

    # The series must be renamed exactly once, to the column name the TVF expects.
    mock_series.rename.assert_called_once_with("content")
    mock_series.rename.return_value.to_frame.assert_called_once()

    query = _submitted_query(mock_session)

    assert f"MODEL `{MODEL_NAME}`" in query
    assert "(SELECT my_col AS content FROM my_table)" in query
    assert (
        "STRUCT(0.0 AS start_second, 10.0 AS end_second, 5.0 AS interval_seconds)"
        in query
    )


def test_generate_embedding_defaults(mock_dataframe, mock_session):
    ai_ops.generate_embedding(
        MODEL_NAME,
        mock_dataframe,
    )

    query = _submitted_query(mock_session)

    assert f"MODEL `{MODEL_NAME}`" in query
    assert "STRUCT()" in query


@mock.patch("bigframes.pandas.read_pandas")
def test_generate_embedding_with_pandas_dataframe(
    read_pandas_mock, mock_dataframe, mock_session
):
    # This tests that pandas input path works and calls read_pandas
    # Mock return value of read_pandas to be a BigFrames DataFrame
    read_pandas_mock.return_value = mock_dataframe

    pandas_df = pd.DataFrame({"content": ["test"]})

    ai_ops.generate_embedding(
        MODEL_NAME,
        pandas_df,
    )

    read_pandas_mock.assert_called_once()
    # Check that read_pandas was called with something (the pandas df)
    assert read_pandas_mock.call_args[0][0] is pandas_df

    mock_session.read_gbq.assert_called_once()


def test_generate_embedding_rejects_unsupported_input(mock_session):
    # Anything that is neither a pandas nor a BigFrames frame/series is refused
    # before any query is built.
    with pytest.raises(ValueError, match="Unsupported data type"):
        ai_ops.generate_embedding(MODEL_NAME, ["apple", "pear"])

    mock_session.read_gbq.assert_not_called()