diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index f835285a21..9b73ebd5c0 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -57,7 +57,11 @@ to_json, to_json_string, ) -from bigframes.bigquery._operations.search import create_vector_index, vector_search +from bigframes.bigquery._operations.search import ( + create_vector_index, + search, + vector_search, +) from bigframes.bigquery._operations.sql import sql_scalar from bigframes.bigquery._operations.struct import struct from bigframes.core import log_adapter @@ -99,6 +103,7 @@ to_json_string, # search ops create_vector_index, + search, vector_search, # sql ops sql_scalar, @@ -150,6 +155,7 @@ "to_json_string", # search ops "create_vector_index", + "search", "vector_search", # sql ops "sql_scalar", diff --git a/bigframes/bigquery/_operations/search.py b/bigframes/bigquery/_operations/search.py index b65eed2475..10af14b8a0 100644 --- a/bigframes/bigquery/_operations/search.py +++ b/bigframes/bigquery/_operations/search.py @@ -20,10 +20,11 @@ import google.cloud.bigquery as bigquery +import bigframes.core.sql +import bigframes.dataframe import bigframes.ml.utils as utils if typing.TYPE_CHECKING: - import bigframes.dataframe as dataframe import bigframes.series as series import bigframes.session @@ -91,7 +92,7 @@ def create_vector_index( def vector_search( base_table: str, column_to_search: str, - query: Union[dataframe.DataFrame, series.Series], + query: Union[bigframes.dataframe.DataFrame, series.Series], *, query_column_to_search: Optional[str] = None, top_k: Optional[int] = None, @@ -99,7 +100,7 @@ def vector_search( fraction_lists_to_search: Optional[float] = None, use_brute_force: Optional[bool] = None, allow_large_results: Optional[bool] = None, -) -> dataframe.DataFrame: +) -> bigframes.dataframe.DataFrame: """ Conduct vector search which searches embeddings to find semantically similar entities. 
@@ -108,7 +109,6 @@ def vector_search( **Examples:** - >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq
@@ -247,3 +247,76 @@ def vector_search( df = query._session.read_gbq_query(sql, allow_large_results=allow_large_results) return df
+
+
+def search(
+    data_to_search: Union[bigframes.dataframe.DataFrame, series.Series],
+    search_query: str,
+) -> series.Series:
+    """
+    Check whether the data contains all the terms of a search query.
+
+    This wraps the BigQuery SEARCH function, which checks to see whether a
+    BigQuery table or other search data contains a set of search terms
+    (tokens). It returns TRUE if all search terms appear in the data, based on
+    the rules for search_query and text analysis described in the text
+    analyzer. Otherwise, this function returns FALSE.
+
+    **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> import bigframes.bigquery as bbq
+
+        >>> data = bpd.read_gbq("SELECT 'Please use foobar@example.com as your email.' AS email")
+        >>> bbq.search(data['email'], 'exam')
+        0    False
+        Name: email, dtype: boolean
+
+        >>> bbq.search(data['email'], 'foobar')
+        0    True
+        Name: email, dtype: boolean
+
+        >>> bbq.search(data['email'], 'example.com')
+        0    True
+        Name: email, dtype: boolean
+
+    Args:
+        data_to_search (bigframes.dataframe.DataFrame | bigframes.series.Series):
+            The data to search over. A DataFrame is searched as a single
+            STRUCT made of all of its columns.
+        search_query (str):
+            A STRING literal, or a STRING constant expression that represents
+            the terms of the search query.
+
+    Returns:
+        bigframes.series.Series: A new Series with the boolean result.
+
+    Raises:
+        ValueError: If ``data_to_search`` is not a Series or DataFrame, or if
+            ``search_query`` is not a string.
+    """
+    import bigframes.operations.search_ops as search_ops
+    import bigframes.series
+
+    if not isinstance(
+        data_to_search, (bigframes.series.Series, bigframes.dataframe.DataFrame)
+    ):
+        raise ValueError("data_to_search must be a Series or DataFrame")
+
+    # The query is embedded in the generated SQL as a literal, so reject
+    # anything that is not a plain string up front instead of failing later
+    # while compiling or running the query.
+    if not isinstance(search_query, str):
+        raise ValueError("search_query must be a string")
+
+    if isinstance(data_to_search, bigframes.dataframe.DataFrame):
+        # SEARCH on a table (or dataframe) treats it as a STRUCT. For easier
+        # application of a scalar unary op, we convert to a struct proactively
+        # in the expression.
+        import bigframes.bigquery._operations.struct as struct_ops
+
+        data_to_search = struct_ops.struct(data_to_search)
+
+    return data_to_search._apply_unary_op(
+        search_ops.SearchOp(search_query=search_query)
+    )
diff --git a/bigframes/core/compile/ibis_compiler/__init__.py b/bigframes/core/compile/ibis_compiler/__init__.py index 6b9d284c53..2af8061bd7 100644 --- a/bigframes/core/compile/ibis_compiler/__init__.py +++ b/bigframes/core/compile/ibis_compiler/__init__.py @@ -22,4 +22,5 @@ import bigframes.core.compile.ibis_compiler.operations.generic_ops # noqa: F401 import bigframes.core.compile.ibis_compiler.operations.geo_ops # noqa: F401 +import bigframes.core.compile.ibis_compiler.operations.search_ops # noqa: F401 import bigframes.core.compile.ibis_compiler.scalar_op_registry # noqa: F401 diff --git a/bigframes/core/compile/ibis_compiler/operations/search_ops.py b/bigframes/core/compile/ibis_compiler/operations/search_ops.py new file mode 100644 index 0000000000..516b49036d --- /dev/null +++ b/bigframes/core/compile/ibis_compiler/operations/search_ops.py @@ -0,0 +1,40 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +BigFrames -> Ibis compilation for the operations in bigframes.operations.search_ops. + +Please keep implementations in sequential order by op name. 
+"""
+
+from __future__ import annotations
+
+from bigframes_vendored.ibis.expr import types as ibis_types
+import bigframes_vendored.ibis.expr.operations.udf as ibis_udf
+
+from bigframes.core.compile.ibis_compiler import scalar_op_compiler
+from bigframes.operations import search_ops
+
+register_unary_op = scalar_op_compiler.scalar_op_compiler.register_unary_op
+
+
+@register_unary_op(search_ops.SearchOp, pass_op=True)  # pass_op: the impl also receives the op instance
+def search_op_impl(x: ibis_types.Value, op: search_ops.SearchOp):
+    return search(x, op.search_query)  # forwards the operand and the op's query string to the builtin below
+
+
+@ibis_udf.scalar.builtin(name="search")  # NOTE(review): presumably resolved by name to BigQuery's SEARCH - confirm
+def search(data_to_search, search_query) -> bool:
+    """Checks to see whether a table or other search data contains a set of search terms."""
+    return False  # pragma: NO COVER
diff --git a/bigframes/core/compile/sqlglot/__init__.py b/bigframes/core/compile/sqlglot/__init__.py index 9e3f123807..61ba4398c6 100644 --- a/bigframes/core/compile/sqlglot/__init__.py +++ b/bigframes/core/compile/sqlglot/__init__.py @@ -25,6 +25,7 @@ import bigframes.core.compile.sqlglot.expressions.geo_ops # noqa: F401 import bigframes.core.compile.sqlglot.expressions.json_ops # noqa: F401 import bigframes.core.compile.sqlglot.expressions.numeric_ops # noqa: F401 +import bigframes.core.compile.sqlglot.expressions.search_ops # noqa: F401 import bigframes.core.compile.sqlglot.expressions.string_ops # noqa: F401 import bigframes.core.compile.sqlglot.expressions.struct_ops # noqa: F401 import bigframes.core.compile.sqlglot.expressions.timedelta_ops # noqa: F401 diff --git a/bigframes/core/compile/sqlglot/expressions/search_ops.py b/bigframes/core/compile/sqlglot/expressions/search_ops.py new file mode 100644 index 0000000000..1fff4282e6 --- /dev/null +++ b/bigframes/core/compile/sqlglot/expressions/search_ops.py @@ -0,0 +1,29 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import sqlglot.expressions as sge
+
+from bigframes import operations as ops
+from bigframes.core.compile.sqlglot.expressions.typed_expr import TypedExpr
+import bigframes.core.compile.sqlglot.scalar_compiler as scalar_compiler
+
+register_nary_op = scalar_compiler.scalar_op_compiler.register_nary_op  # NOTE(review): not used in this module
+register_unary_op = scalar_compiler.scalar_op_compiler.register_unary_op
+
+
+@register_unary_op(ops.SearchOp, pass_op=True)  # pass_op: the rule also receives the op instance
+def _(expr: TypedExpr, op: ops.SearchOp) -> sge.Expression:
+    return sge.func("SEARCH", expr.expr, sge.convert(op.search_query))  # SEARCH(<operand>, <query as a SQL literal>)
diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index 5da8efaa3b..2c15c24106 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -184,6 +184,7 @@ NaryRemoteFunctionOp, RemoteFunctionOp, ) +from bigframes.operations.search_ops import SearchOp from bigframes.operations.string_ops import ( capitalize_op, EndsWithOp, @@ -374,6 +375,8 @@ "BinaryRemoteFunctionOp", "NaryRemoteFunctionOp", "RemoteFunctionOp", + # Search ops + "SearchOp", # Frequency ops "DatetimeToIntegerLabelOp", "FloorDtOp", diff --git a/bigframes/operations/search_ops.py b/bigframes/operations/search_ops.py new file mode 100644 index 0000000000..f63696f15f --- /dev/null +++ b/bigframes/operations/search_ops.py @@ -0,0 +1,28 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import dataclasses
+import typing
+
+from bigframes import dtypes
+from bigframes.operations import base_ops
+
+
+@dataclasses.dataclass(frozen=True)
+class SearchOp(base_ops.UnaryOp):
+    name: typing.ClassVar[str] = "search"
+    search_query: str  # the op's only field; the SQL compilers pass it as SEARCH's second argument
+
+    def output_type(self, *input_types):
+        return dtypes.BOOL_DTYPE  # boolean result regardless of the input types
diff --git a/tests/unit/bigquery/test_search.py b/tests/unit/bigquery/test_search.py
new file mode 100644
index 0000000000..1b1524aa62
--- /dev/null
+++ b/tests/unit/bigquery/test_search.py
@@ -0,0 +1,142 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pandas as pd
+import pytest
+
+import bigframes.bigquery as bbq
+import bigframes.core.expression as ex
+import bigframes.core.nodes as nodes
+import bigframes.dataframe
+from bigframes.operations import struct_ops
+import bigframes.operations.search_ops as search_ops
+import bigframes.series
+import bigframes.session
+import bigframes.testing.mocks
+
+
+@pytest.fixture
+def mock_session():
+    return bigframes.testing.mocks.create_bigquery_session()
+
+
+def _skip_selections(node):
+    """Return the first node at or below ``node`` that is not a SelectionNode."""
+    while isinstance(node, nodes.SelectionNode):
+        node = node.child
+    return node
+
+
+def _search_expressions(projection):
+    """Return the SearchOp expressions assigned by a ProjectionNode."""
+    return [
+        expr
+        for expr, _ in projection.assignments
+        if isinstance(expr, ex.OpExpression)
+        and isinstance(expr.op, search_ops.SearchOp)
+    ]
+
+
+def test_search_series(mock_session):
+    # Use real Series backed by mock session (via read_pandas/ReadLocalNode)
+    s = bigframes.series.Series(["foo bar", "baz"], session=mock_session)
+    search_query = "foo"
+    result = bbq.search(s, search_query)
+
+    # search() is a unary op applied to existing data, so once the
+    # SelectionNode wrappers are skipped a ProjectionNode must be left.
+    node = _skip_selections(result._block.expr.node)
+    assert isinstance(node, nodes.ProjectionNode)
+
+    # Exactly one assignment of that projection carries the SearchOp.
+    assignments = _search_expressions(node)
+    assert len(assignments) == 1
+    assignment = assignments[0]
+
+    assert assignment.op.search_query == search_query
+
+
+# bbq.search() and SearchOp only accept ``search_query``; the optional SEARCH
+# arguments exercised below are not implemented, so the call raises TypeError.
+# strict=True makes this test fail loudly once the options are supported.
+@pytest.mark.xfail(
+    raises=TypeError,
+    strict=True,
+    reason="json_scope/analyzer/analyzer_options are not supported by bbq.search",
+)
+def test_search_series_with_options(mock_session):
+    s = bigframes.series.Series(["foo bar", "baz"], session=mock_session)
+    search_query = "foo"
+    result = bbq.search(
+        s,
+        search_query,
+        json_scope="JSON_VALUES",
+        analyzer="LOG_ANALYZER",
+        analyzer_options='{"delimiters": [" "]}',
+    )
+
+    node = _skip_selections(result._block.expr.node)
+    assert isinstance(node, nodes.ProjectionNode)
+
+    assignments = _search_expressions(node)
+    assert len(assignments) == 1
+    assignment = assignments[0]
+
+    assert assignment.op.search_query == search_query
+    assert assignment.op.json_scope == "JSON_VALUES"
+    assert assignment.op.analyzer == "LOG_ANALYZER"
+    assert assignment.op.analyzer_options == '{"delimiters": [" "]}'
+
+
+def test_search_dataframe(mock_session):
+    # Mock dataframe with 2 columns
+    df = pd.DataFrame({"col1": ["foo", "bar"], "col2": ["baz", "qux"]})
+    bf = bigframes.dataframe.DataFrame(df, session=mock_session)
+
+    search_query = "foo"
+    result = bbq.search(bf, search_query)
+
+    node = _skip_selections(result._block.expr.node)
+    assert isinstance(node, nodes.ProjectionNode)
+
+    assignments = _search_expressions(node)
+    assert len(assignments) == 1
+    assignment = assignments[0]
+    assert assignment.op.search_query == search_query
+
+    # The DataFrame is converted to a STRUCT before SEARCH is applied. The
+    # assignments of one ProjectionNode are parallel, so the SearchOp can only
+    # reference the struct column through a DerefOp and the StructOp itself
+    # has to live in a ProjectionNode further down.
+    search_input = assignment.inputs[0]
+    assert isinstance(search_input, ex.DerefOp)
+
+    child_node = _skip_selections(node.child)
+    assert isinstance(child_node, nodes.ProjectionNode)
+
+    # Find the struct assignment feeding the SearchOp.
+    struct_assignment = next(
+        expr for expr, col_id in child_node.assignments if col_id == search_input.id
+    )
+
+    assert isinstance(struct_assignment, ex.OpExpression)
+    assert isinstance(struct_assignment.op, struct_ops.StructOp)
+    assert struct_assignment.op.column_names == ("col1", "col2")
+
+
+def test_search_invalid_input(mock_session):
+    with pytest.raises(
+        ValueError, match="data_to_search must be a Series or DataFrame"
+    ):
+        bbq.search("invalid", "foo")