From 4bf54dfa1b8aa89f143e498c248ef229b1446381 Mon Sep 17 00:00:00 2001 From: Juro Oravec Date: Sun, 16 Nov 2025 21:50:53 +0100 Subject: [PATCH 1/3] refactor: allow multiline and comments in python expressions --- crates/djc-safe-eval/Cargo.toml | 1 + crates/djc-safe-eval/README.md | 34 ++ crates/djc-safe-eval/src/comments.rs | 433 ++++++++++++++++++++++ crates/djc-safe-eval/src/lib.rs | 3 +- crates/djc-safe-eval/src/transformer.rs | 301 +++++++++++---- crates/djc-safe-eval/tests/transformer.rs | 228 +++++++++++- djc_core/djc_safe_eval/eval.py | 21 +- tests/test_safe_eval.py | 18 +- 8 files changed, 952 insertions(+), 87 deletions(-) create mode 100644 crates/djc-safe-eval/src/comments.rs diff --git a/crates/djc-safe-eval/Cargo.toml b/crates/djc-safe-eval/Cargo.toml index 689bbd3..552adb8 100644 --- a/crates/djc-safe-eval/Cargo.toml +++ b/crates/djc-safe-eval/Cargo.toml @@ -12,3 +12,4 @@ ruff_python_ast = { workspace = true } ruff_python_codegen = { workspace = true } ruff_source_file = { workspace = true } ruff_text_size = { workspace = true } +regex = { workspace = true } diff --git a/crates/djc-safe-eval/README.md b/crates/djc-safe-eval/README.md index 52a3d62..7ba3410 100644 --- a/crates/djc-safe-eval/README.md +++ b/crates/djc-safe-eval/README.md @@ -33,6 +33,7 @@ expression, but also collects metadata for the linter about: 1. What variables were used in the expression 2. What variables were introduced in the expression using walrus operator `x := 1` +3. What comments were found in the expression (including their positions and text) ## Usage @@ -249,6 +250,39 @@ The entire python code must be a SINGLE [expression](https://docs.python.org/3/l For simplicity we don't allow async features like async comprehensions. +### Comments + +Python comments (`# ...`) are supported and are captured during parsing. Comments are preserved with their positions and text content, allowing linters and other tools to analyze them. 
+
+```python
+compiled = safe_eval("x + 1 # Add one to x")
+```
+
+### Multiline expressions
+
+Multiline expressions are supported. When an expression spans multiple lines, it is automatically wrapped in parentheses with newlines: `(\n{}\n)`. This wrapping serves two purposes:
+
+1. **Enables multiline syntax**: In Python, when you're inside parentheses `(...)`, square brackets `[...]`, or curly braces `{...}`, Python ignores indentation and allows expressions to span multiple lines. This means you can write:
+
+   ```python
+   [
+       1,
+       2,
+       3,
+   ]
+   ```
+
+   Without the wrapping, Python would require proper indentation.
+
+2. **Allows trailing comments**: We wrap the original expression in `(...)`. If the user decided to add a comment after the expression, the comment would consume the closing `)`. Hence, we also add newlines so that `(` and `)` are on separate lines:
+
+   ```
+   (2 + 2 # comment)       ❌ Comment consumes the ')'
+   (\n2 + 2 # comment\n)   ✅ Comment is on separate line
+   ```
+
+The wrapping is transparent to users - all token positions (variables, comments, etc.) are automatically adjusted to reference the original unwrapped source positions.
+
 ### Supported syntax
 
 Almost anything that is a valid Python expression is allowed:
diff --git a/crates/djc-safe-eval/src/comments.rs b/crates/djc-safe-eval/src/comments.rs
new file mode 100644
index 0000000..ebbd706
--- /dev/null
+++ b/crates/djc-safe-eval/src/comments.rs
@@ -0,0 +1,433 @@
+use ruff_source_file::LineIndex;
+
+use crate::transformer::{Comment, Token};
+
+/// Preprocess the source string to extract comments. 
+/// +/// Since we use `exec()` instead of `eval()`, Python naturally handles: +/// - Comments (they're ignored by the parser) +/// - Newlines (they're part of normal Python syntax) +pub fn extract_comments(source: &str) -> Result, String> { + let mut comments = Vec::new(); + let line_index = LineIndex::from_source_text(source); + + // Convert to bytes for easier indexing + let bytes = source.as_bytes(); + let mut i = 0; + let mut in_string = false; + let mut string_quote: Option = None; + let mut string_delimiter_count = 0; + + let check_for_string_start = |ch: char, i: usize| -> (bool, Option, i32, usize) { + if !(ch == '"' || ch == '\'') { + // Curr char NOT a quote + return (false, None, 0, i); + } + let quote_byte = ch as u8; + + // is_raw_string is already set if we saw a prefix + let in_string = true; + let string_quote = Some(quote_byte); + let string_delimiter_count; + let new_i; + + // Triple quote string + if i + 2 < bytes.len() && bytes[i + 1] == quote_byte && bytes[i + 2] == quote_byte { + string_delimiter_count = 3; + new_i = i + 3; + } else { + // Single quote string + string_delimiter_count = 1; + new_i = i + 1; + } + (in_string, string_quote, string_delimiter_count, new_i) + }; + + let check_for_comment = |ch: char, i: usize, comments: &mut Vec| -> (bool, usize) { + if ch != '#' { + return (false, i); + } + + // Found a comment - extract it + let comment_start = i; + let mut comment_end = i + 1; + + // Collect comment text until newline or end of input + while comment_end < bytes.len() { + let next_byte = bytes[comment_end]; + if next_byte == b'\n' || next_byte == b'\r' { + break; + } + comment_end += 1; + } + + let comment_text = + String::from_utf8_lossy(&bytes[comment_start + 1..comment_end]).to_string(); + + // Create tokens for the comment + let start_pos = + line_index.line_column(ruff_text_size::TextSize::from(comment_start as u32), source); + + let comment_token = Token { + content: format!("#{}", comment_text), + start_index: 
comment_start, + end_index: comment_end, + line_col: ( + start_pos.line.to_zero_indexed() + 1, + start_pos.column.to_zero_indexed() + 1, + ), + }; + + // Calculate position for value token (starts one character after #) + let value_start_pos = line_index.line_column( + ruff_text_size::TextSize::from((comment_start + 1) as u32), + source, + ); + + let value_token = Token { + content: comment_text.clone(), + start_index: comment_start + 1, + end_index: comment_end, + line_col: ( + value_start_pos.line.to_zero_indexed() + 1, + value_start_pos.column.to_zero_indexed() + 1, + ), + }; + + comments.push(Comment { + token: comment_token, + value: value_token, + }); + + (true, comment_end) + }; + + while i < bytes.len() { + let ch = bytes[i] as char; + + if !in_string { + // Check for string start quote(s) + // If so, advance past the string start quotes + let (new_in_string, new_string_quote, new_string_delimiter_count, new_i) = + check_for_string_start(ch, i); + if new_in_string { + in_string = new_in_string; + string_quote = new_string_quote; + string_delimiter_count = new_string_delimiter_count; + i = new_i; + continue; + } + + // Check for comment + // If found, advance past the comment + let (found_comment, new_i) = check_for_comment(ch, i, &mut comments); + if found_comment { + i = new_i; + continue; + } + + // Regular character outside string + i += 1; + } else { + // Inside a string + let quote_byte = string_quote.unwrap(); + + // Before checking for closing quotes or anything else, + // first handle escape sequences like `\"` or `\\`. + // + // Whether it's a raw string or not, Python allows nested quotes if they follow + // a backslash. The difference is that in raw string, the backslash is included + // in the final string too. While in regular string the backslash and character + // after it create a new single character, e.g. `\n` for newline. 
+ // Compare `r"abc \" def"` vs `"abc \" def"` + // + // So check if we're escaping the next char, and if so push and skip both characters. + if bytes[i] == b'\\' && i + 1 < bytes.len() { + i += 2; + continue; + } + + // Check for closing quote(s) + // We already handled escape sequences above, so any quote here is a closing quote + if bytes[i] == quote_byte { + let mut quote_count = 1; + let mut j = i + 1; + while j < bytes.len() + && quote_count < string_delimiter_count + && bytes[j] == quote_byte + { + quote_count += 1; + j += 1; + } + + if quote_count == string_delimiter_count { + // Closing quote(s) + i = j; + in_string = false; + string_quote = None; + string_delimiter_count = 0; + continue; + } + } + + // Regular character inside string + i += 1; + } + } + + Ok(comments) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn _token(content: &str, start_index: usize, line: usize, col: usize) -> Token { + Token { + content: content.to_string(), + start_index, + end_index: start_index + content.len(), + line_col: (line, col), + } + } + + #[test] + fn test_simple_comment_extraction() { + let result = extract_comments("1 # comment").unwrap(); + assert_eq!( + result, + vec![Comment { + token: _token("# comment", 2, 1, 3), + value: _token(" comment", 3, 1, 4), + }], + ); + } + + #[test] + fn test_comment_at_end_of_input() { + let result = extract_comments("42#end").unwrap(); + assert_eq!( + result, + vec![Comment { + token: _token("#end", 2, 1, 3), + value: _token("end", 3, 1, 4), + }], + ); + } + + #[test] + fn test_multiple_comments() { + let result = extract_comments("x # first\n y # second").unwrap(); + assert_eq!( + result, + vec![ + Comment { + token: _token("# first", 2, 1, 3), + value: _token(" first", 3, 1, 4), + }, + Comment { + token: _token("# second", 13, 2, 4), + value: _token(" second", 14, 2, 5), + }, + ], + ); + } + + #[test] + fn test_comment_inside_string_not_extracted() { + let result = extract_comments(r#""text # not a comment""#).unwrap(); + 
assert_eq!(result, vec![],); + } + + #[test] + fn test_comment_after_string_extracted() { + let result = extract_comments(r#""t#xt" # this is a comment"#).unwrap(); + assert_eq!( + result, + vec![Comment { + token: _token("# this is a comment", 7, 1, 8), + value: _token(" this is a comment", 8, 1, 9), + }], + ); + } + + #[test] + fn test_multiline_string() { + // Newlines in triple-quoted strings are preserved + let result = extract_comments("1 \n 2 \"\"\"my\n#tring\"\"\" 3 \n 4").unwrap(); + assert_eq!(result, vec![]); + + let result = extract_comments("1 \n 2 '''my\n#tring''' 3 \n 4").unwrap(); + assert_eq!(result, vec![]); + } + + #[test] + fn test_raw_string_preserves_newlines() { + // In this case `\n` in string is a newline + let result = extract_comments("1 \n 2 r\"\"\"my\n#tring\"\"\" 3 \n 4").unwrap(); + assert_eq!(result, vec![],); + + // In this case `\#` in raw string is two characters `\` and `#` (not an escape) + let result = extract_comments(r#"1 \n 2 r"""my\#string""" 3 \n 4"#).unwrap(); + assert_eq!(result, vec![],); + } + + #[test] + fn test_fstring_newline_replacement() { + // Newlines in triple-quoted strings are preserved + let result = extract_comments("f\"\"\"my#\n{#name}\"\"\"").unwrap(); + assert_eq!(result, vec![],); + } + + #[test] + fn test_regular_string_no_newline() { + let result = extract_comments("\"he#lo\"").unwrap(); + assert_eq!(result, vec![]); + } + + #[test] + fn test_single_quoted_string_with_literal_newline() { + // Single-quoted strings cannot contain literal newlines in Python + // The preprocessing preserves the newline, but parsing will catch this as a syntax error. 
+ let result = extract_comments("'ab#\n'").unwrap(); + assert_eq!(result, vec![]); + + let result = extract_comments("\"ab#\n\"").unwrap(); + assert_eq!(result, vec![]); + } + + #[test] + fn test_multiline_string_with_comment() { + // Newlines in triple-quoted strings are preserved (will work with exec()) + let result = extract_comments("\"\"\"my\n#tring\"\"\" # comment").unwrap(); + assert_eq!( + result, + vec![Comment { + token: _token("# comment", 16, 2, 11), + value: _token(" comment", 17, 2, 12), + }], + ); + } + + #[test] + fn test_string_with_escape_sequence() { + let result = extract_comments(r##""t#xt \"qu#te\"#" # after"##).unwrap(); + assert_eq!( + result, + vec![Comment { + token: _token("# after", 18, 1, 19), + value: _token(" after", 19, 1, 20), + }], + ); + } + + #[test] + fn test_raw_string_with_escape_sequence() { + let result = extract_comments(r##"r"t#xt \"qu#te\"#" # after"##).unwrap(); + assert_eq!( + result, + vec![Comment { + token: _token("# after", 19, 1, 20), + value: _token(" after", 20, 1, 21), + }], + ); + } + + #[test] + fn test_string_prefixes() { + // Test various string prefixes + assert_eq!(extract_comments(r#"r"raw""#).unwrap(), vec![],); + assert_eq!(extract_comments(r#"f"formatted""#).unwrap(), vec![],); + assert_eq!(extract_comments(r#"b"bytes""#).unwrap(), vec![],); + assert_eq!(extract_comments(r#"rf"raw formatted""#).unwrap(), vec![],); + } + + #[test] + fn test_comment_before_string() { + let source = r#"# comment before"text""#; + let result = extract_comments(source).unwrap(); + // When there's no newline, the comment extends to the end of input + // So the comment includes " comment before"text"" + assert_eq!( + result, + vec![Comment { + token: _token(source, 0, 1, 1), + value: _token(&source[1..], 1, 1, 2), + }], + ); + } + + #[test] + fn test_nested_strings() { + // String containing quote characters + let result = extract_comments(r#""ou#ter 'in#ner' ou#ter""#).unwrap(); + assert_eq!(result, vec![],); + } + + #[test] 
+ fn test_triple_quotes_in_single_quote_string() { + // Triple quotes inside a single-quote string should not close it + let result = extract_comments(r#"'''te#xt \"\"\" ins#ide\"\"\te#xt2'''"#).unwrap(); + assert_eq!(result, vec![],); + } + + #[test] + fn test_empty_comment() { + let result = extract_comments("x #").unwrap(); + assert_eq!( + result, + vec![Comment { + token: _token("#", 2, 1, 3), + value: _token("", 3, 1, 4), + }], + ); + } + + #[test] + fn test_comment_with_carriage_return() { + let result = extract_comments("x # comment\r\nnext").unwrap(); + assert_eq!( + result, + vec![Comment { + token: _token("# comment", 2, 1, 3), + value: _token(" comment", 3, 1, 4), + }], + ); + } + + #[test] + fn test_multiline_string_with_escaped_newline() { + // Escaped newline in non-raw string should not be replaced + let result = extract_comments(r#""line1\\nline2""#).unwrap(); + assert_eq!(result, vec![]); + } + + #[test] + fn test_complex_expression_with_comments() { + let result = extract_comments( + "[ # comment 1\n 1, # comment 2\n 2, # comment 3\n] # comment 4", + ) + .unwrap(); + assert_eq!( + result, + vec![ + Comment { + token: _token("# comment 1", 7, 1, 8), + value: _token(" comment 1", 8, 1, 9), + }, + Comment { + token: _token("# comment 2", 26, 2, 8), + value: _token(" comment 2", 27, 2, 9), + }, + Comment { + token: _token("# comment 3", 45, 3, 8), + value: _token(" comment 3", 46, 3, 9), + }, + Comment { + token: _token("# comment 4", 65, 4, 9), + value: _token(" comment 4", 66, 4, 10), + }, + ], + ); + } +} diff --git a/crates/djc-safe-eval/src/lib.rs b/crates/djc-safe-eval/src/lib.rs index 5877680..ca29ec9 100644 --- a/crates/djc-safe-eval/src/lib.rs +++ b/crates/djc-safe-eval/src/lib.rs @@ -1,4 +1,5 @@ pub mod codegen; +pub mod comments; pub mod transformer; mod utils { pub mod python_ast; @@ -6,7 +7,7 @@ mod utils { // Re-export public API pub use codegen::generate_python_code; -pub use transformer::{Token, TransformResult, 
transform_expression_string}; +pub use transformer::{Comment, Token, TransformResult, transform_expression_string}; pub fn safe_eval(source: &str) -> Result { let result = transform_expression_string(source)?; diff --git a/crates/djc-safe-eval/src/transformer.rs b/crates/djc-safe-eval/src/transformer.rs index 497113a..13b862b 100644 --- a/crates/djc-safe-eval/src/transformer.rs +++ b/crates/djc-safe-eval/src/transformer.rs @@ -250,9 +250,11 @@ // - ❌ - StmtIpyEscapeCommand // - ❌ - ExprIpyEscapeCommand +use crate::comments::extract_comments; use crate::utils::python_ast::{ attribute, call, get_expr_range, interceptor_call, none_literal, string_literal, }; +use regex::Regex; use ruff_python_ast::visitor::transformer::Transformer; use ruff_python_ast::{self as ast, Expr, Stmt}; use ruff_python_parser::parse_expression; @@ -273,6 +275,15 @@ pub struct Token { pub line_col: (usize, usize), } +/// Represents a Python comment `# ...` +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct Comment { + /// Token containing the entire comment span including `#` and the comment text + pub token: Token, + /// Token for the comment text (without the `#` prefix) + pub value: Token, +} + /// Result of transforming an expression string #[derive(Debug, Clone)] pub struct TransformResult { @@ -282,16 +293,99 @@ pub struct TransformResult { pub used_vars: Vec, /// Tokens for variables that are assigned via walrus operator (:=) pub assigned_vars: Vec, + /// Comments found in the original source + pub comments: Vec, +} + +/// Adjust byte ranges in error messages to account for source wrapping +/// and append the actual text at that range. 
+/// +/// So if we get error +/// +/// `Parse error: Expected an expression or a ']' at byte range 9..10` +/// +/// Then we adjust the range at the end to `byte range 7..8` and add the text at that range: +/// +/// `Parse error: Expected an expression or a ']' at byte range 7..8: ')'` +fn adjust_error_ranges(error_msg: &str, wrap_prefix_len: usize, source: &str) -> String { + // Pattern to match "byte range X..Y" where X and Y are numbers + let re = Regex::new(r"byte range (\d+)\.\.(\d+)").unwrap(); + + re.replace_all(error_msg, |caps: ®ex::Captures| { + let start: usize = caps[1].parse().unwrap_or(0); + let end: usize = caps[2].parse().unwrap_or(0); + let adjusted_start = start.saturating_sub(wrap_prefix_len); + let adjusted_end = end.saturating_sub(wrap_prefix_len); + + // Extract the text at the adjusted range from the original source + let range_text = if adjusted_start < adjusted_end && adjusted_end <= source.len() { + &source[adjusted_start..adjusted_end] + } else { + "" + }; + + // Format with the range and the text (if available) + if !range_text.is_empty() { + format!( + "byte range {}..{}: '{}'", + adjusted_start, adjusted_end, range_text + ) + } else { + format!("byte range {}..{}", adjusted_start, adjusted_end) + } + }) + .to_string() +} + +/// Adjust token positions from wrapped source coordinates to original source coordinates +fn adjust_token_position( + start: usize, + end: usize, + line_index: &LineIndex, + wrapped_source: &str, + wrap_prefix_len: usize, +) -> (usize, usize, usize, usize) { + // Convert byte offset to line/column (only for start position) + let start_pos = + line_index.line_column(ruff_text_size::TextSize::from(start as u32), wrapped_source); + + // Adjust positions: line minus one, col remains same, index minus two + // start_pos.line is 0-indexed in wrapped source, convert to 1-indexed then subtract 1 + let wrapped_line_1_indexed = start_pos.line.to_zero_indexed() + 1; + let adjusted_line = 
wrapped_line_1_indexed.saturating_sub(1);
+    let adjusted_col = start_pos.column.to_zero_indexed() + 1;
+    let adjusted_start = start.saturating_sub(wrap_prefix_len);
+    let adjusted_end = end.saturating_sub(wrap_prefix_len);
+
+    (adjusted_start, adjusted_end, adjusted_line, adjusted_col)
+}
 
 /// The main entry point for transforming an expression string.
 /// Returns the transformed expression along with tokens for variables used and assigned.
 pub fn transform_expression_string(source: &str) -> Result<TransformResult, String> {
-    let transformer = SandboxTransformer::new();
-    let ast = parse_expression(source).map_err(|e| format!("Parse error: {}", e))?;
+    // Wrap the expression in (\n{}\n) to allow multi-line expressions and trailing comments.
+    // This is because in Python when we're inside an expression like inside `(...)`, `[...]`, or `{...}`,
+    // then Python does not care about the newlines and whitespace indentation.
+    //
+    // So even something like this is valid:
+    // ```py
+    // [
+    //     1,
+    //     2, 3,
+    //     4,
+    // ]
+    // ```
+    let wrapped_source = format!("(\n{}\n)", source);
+    let wrap_prefix_len = 2; // "(\n" = 2 characters
+
+    let transformer = SandboxTransformer::new(wrap_prefix_len);
+    let ast = parse_expression(&wrapped_source).map_err(|e| {
+        let error_msg = format!("Parse error: {}", e);
+        adjust_error_ranges(&error_msg, wrap_prefix_len, source)
+    })?;
 
     // Create a LineIndex to convert byte offsets to line/column positions
-    let line_index = LineIndex::from_source_text(source);
+    let line_index = LineIndex::from_source_text(&wrapped_source);
 
     // The top-level AST for an expression is an `ast::Mod::Expression`.
     // We want to transform the single expression inside it. 
@@ -312,25 +406,22 @@ pub fn transform_expression_string(source: &str) -> Result Result>, + // Offset to adjust ranges from wrapped source back to original source + wrap_prefix_len: usize, } impl SandboxTransformer { - pub fn new() -> Self { + pub fn new(wrap_prefix_len: usize) -> Self { Self { comprehension_variables: RefCell::new(HashSet::new()), lambda_variables: RefCell::new(HashSet::new()), validation_error: RefCell::new(None), assignments: RefCell::new(Vec::new()), used_variables: RefCell::new(Vec::new()), + wrap_prefix_len, } } + /// Adjust a TextRange from wrapped source coordinates to original source coordinates + fn adjust_range(&self, range: ruff_text_size::TextRange) -> ruff_text_size::TextRange { + let start = range + .start() + .to_usize() + .saturating_sub(self.wrap_prefix_len); + let end = range.end().to_usize().saturating_sub(self.wrap_prefix_len); + ruff_text_size::TextRange::new( + ruff_text_size::TextSize::from(start as u32), + ruff_text_size::TextSize::from(end as u32), + ) + } + /// Create a new SandboxTransformer with additional local variables /// We have to create a new copy because inside visit_xxx methods we have only read access. 
/// @@ -442,6 +576,7 @@ impl SandboxTransformer { lambda_variables: RefCell::new(new_lambda_vars), validation_error: RefCell::new(self.validation_error.borrow().clone()), assignments: RefCell::new(Vec::new()), + wrap_prefix_len: self.wrap_prefix_len, used_variables: RefCell::new(Vec::new()), } } @@ -766,8 +901,13 @@ impl Transformer for SandboxTransformer { let var_name_literal = string_literal(&var_name, range); // `variable(context, source, (start_index, end_index), "var_name")` - let call_expr = - interceptor_call("variable", vec![var_name_literal], vec![], range); + let adjusted_range = self.adjust_range(range); + let call_expr = interceptor_call( + "variable", + vec![var_name_literal], + vec![], + adjusted_range, + ); // Replace the current expression with the call *expr = call_expr; @@ -789,11 +929,12 @@ impl Transformer for SandboxTransformer { args.extend(call_expr.arguments.args.to_vec()); // `call(context, source, (start_index, end_index), fn, *args, **kwargs)` + let adjusted_range = self.adjust_range(call_expr.range); let wrapper_call = interceptor_call( "call", args, call_expr.arguments.keywords.to_vec(), - call_expr.range, + adjusted_range, ); // Replace the current expression with the wrapper call @@ -819,11 +960,12 @@ impl Transformer for SandboxTransformer { let attr_name_literal = string_literal(&attr_name, range); // `attribute(context, source, (start_index, end_index), obj, "attr_name")` + let adjusted_range = self.adjust_range(range); let wrapper_call = interceptor_call( "attribute", vec![*attr.value.clone(), attr_name_literal], vec![], - range, + adjusted_range, ); // Replace the current expression with the wrapper call @@ -841,11 +983,12 @@ impl Transformer for SandboxTransformer { let range = subscript.range; // `subscript(context, source, (start_index, end_index), obj, key)` + let adjusted_range = self.adjust_range(range); let wrapper_call = interceptor_call( "subscript", vec![*subscript.value.clone(), *subscript.slice.clone()], vec![], - 
range, + adjusted_range, ); // Replace the current expression with the wrapper call @@ -893,7 +1036,8 @@ impl Transformer for SandboxTransformer { ]; // `slice(context, source, (start_index, end_index), lower, upper, step)` - let slice_call = interceptor_call("slice", slice_args, vec![], range); + let adjusted_range = self.adjust_range(range); + let slice_call = interceptor_call("slice", slice_args, vec![], adjusted_range); // Replace the current expression with the slice() call *expr = slice_call; @@ -953,11 +1097,12 @@ impl Transformer for SandboxTransformer { let var_name_literal = string_literal(&var_name, range); // `assign(context, source, (start_index, end_index), "var_name", value)` + let adjusted_range = self.adjust_range(range); let wrapper_call = interceptor_call( "assign", vec![var_name_literal, *named.value.clone()], vec![], - range, + adjusted_range, ); // Replace the current expression with the wrapper call @@ -1009,15 +1154,14 @@ impl Transformer for SandboxTransformer { // - dynamic expression (`{width}.{precision}f`) // We don't call built-in `format()` here; we pass the format spec as metadata // and let our `format()` interceptor handle it. 
- let format_spec_expr = if let Some(format_spec) = - &mut interpolation.format_spec - { - // Build the format spec - can be static or dynamic - let mut spec_template_parts = Vec::new(); - let mut spec_format_args = Vec::new(); - - for spec_element in format_spec.elements.iter_mut() { - match spec_element { + let format_spec_expr = + if let Some(format_spec) = &mut interpolation.format_spec { + // Build the format spec - can be static or dynamic + let mut spec_template_parts = Vec::new(); + let mut spec_format_args = Vec::new(); + + for spec_element in format_spec.elements.iter_mut() { + match spec_element { ast::InterpolatedStringElement::Literal(lit) => { spec_template_parts.push(lit.value.to_string()); } @@ -1037,37 +1181,37 @@ impl Transformer for SandboxTransformer { .push(*spec_interp.expression.clone()); } } - } - - let spec_template_str = spec_template_parts.join(""); + } - // Build the format spec expression - if spec_format_args.is_empty() { - // Static format spec - just use a string literal - string_literal(&spec_template_str, expr_range) + let spec_template_str = spec_template_parts.join(""); + + // Build the format spec expression + if spec_format_args.is_empty() { + // Static format spec - just use a string literal + string_literal(&spec_template_str, expr_range) + } else { + // Dynamic format spec - we need to pass both template and args + // We'll pass this as a tuple: (template, *args) + let spec_template_literal = + string_literal(&spec_template_str, expr_range); + // Create a tuple: (template, arg1, arg2, ...) 
+ Expr::Tuple(ast::ExprTuple { + node_index: Default::default(), + range: expr_range, + ctx: ast::ExprContext::Load, + parenthesized: false, + elts: { + let mut tuple_elts = + vec![spec_template_literal]; + tuple_elts.extend(spec_format_args); + tuple_elts.into() + }, + }) + } } else { - // Dynamic format spec - we need to pass both template and args - // We'll pass this as a tuple: (template, *args) - let spec_template_literal = - string_literal(&spec_template_str, expr_range); - // Create a tuple: (template, arg1, arg2, ...) - Expr::Tuple(ast::ExprTuple { - node_index: Default::default(), - range: expr_range, - ctx: ast::ExprContext::Load, - parenthesized: false, - elts: { - let mut tuple_elts = - vec![spec_template_literal]; - tuple_elts.extend(spec_format_args); - tuple_elts.into() - }, - }) - } - } else { - // No format spec - use empty string - string_literal("", expr_range) - }; + // No format spec - use empty string + string_literal("", expr_range) + }; // Add simple placeholder to template template_parts.push("{}".to_string()); @@ -1104,8 +1248,9 @@ impl Transformer for SandboxTransformer { let mut format_args_with_template = vec![template_literal]; format_args_with_template.extend(format_args); + let adjusted_range = self.adjust_range(range); let format_call = - interceptor_call("format", format_args_with_template, vec![], range); + interceptor_call("format", format_args_with_template, vec![], adjusted_range); // Replace the f-string with the intercepted format() call *expr = format_call; @@ -1212,6 +1357,8 @@ impl Transformer for SandboxTransformer { // `interpolation(context, source, token, value, expression, conversion, format_spec)` // Use interpolation.range for error reporting to point to the exact {expression!r:format} location + let adjusted_interpolation_range = + self.adjust_range(interpolation.range); let interpolation_call = interceptor_call( "interpolation", vec![ @@ -1221,7 +1368,7 @@ impl Transformer for SandboxTransformer { 
format_spec_expr, ], vec![], - interpolation.range, + adjusted_interpolation_range, ); template_args.push(interpolation_call); @@ -1231,7 +1378,9 @@ impl Transformer for SandboxTransformer { } // `template(context, source, token, ...)` - let template_call = interceptor_call("template", template_args, vec![], range); + let adjusted_range = self.adjust_range(range); + let template_call = + interceptor_call("template", template_args, vec![], adjusted_range); // Replace the t-string with the template() call *expr = template_call; diff --git a/crates/djc-safe-eval/tests/transformer.rs b/crates/djc-safe-eval/tests/transformer.rs index 51102f0..fc2a944 100644 --- a/crates/djc-safe-eval/tests/transformer.rs +++ b/crates/djc-safe-eval/tests/transformer.rs @@ -1,7 +1,7 @@ #[cfg(test)] mod tests { use djc_safe_eval::codegen::generate_python_code; - use djc_safe_eval::transformer::transform_expression_string; + use djc_safe_eval::transformer::{Comment, Token, transform_expression_string}; fn _test_transformation( input: &str, @@ -77,6 +77,31 @@ mod tests { } } + #[test] + fn test_multiple_lines() { + // NOTE: Altho we could technically handle this, we allow only a single expression, + // and this contains 2 statements/expressions, which raises an error. + // In this case the actual error message doesn't matter that much, as long as we raise here. + let result = transform_expression_string("x\n y").unwrap_err(); + assert_eq!( + result, + "Parse error: Expected ')', found name at byte range 3..4: 'y'" + ); + } + + #[test] + fn test_multiple_lines_escape() { + // NOTE: We wrap the entire Python expression in `(\n{}\n)`, so that inside the expression + // we can use newlines. Here we check that it's not possible to escape this simply by closing + // the first parenthesis. 
+ let result = transform_expression_string("x)\n\nimport os; os.path.join('a', 'b')\n\n(") + .unwrap_err(); + assert_eq!( + result, + "Parse error: Unexpected token at the end of an expression at byte range 4..10: 'import'" + ); + } + #[test] fn test_allow_comments() { _test_transformation("1 # comment", "1", vec![], vec![]); @@ -151,6 +176,27 @@ mod tests { _test_transformation("[1, 2, 3]", "[1, 2, 3]", vec![], vec![]); } + #[test] + fn test_allow_multiline_expr() { + // Note: The code generator doesn't preserve formatting, so output is on one line + _test_transformation("[\n 1,\n 2,\n 3\n]", "[1, 2, 3]", vec![], vec![]); + } + + #[test] + fn test_allow_multiline_with_variables() { + // Note: The code generator doesn't preserve formatting, so output is on one line + _test_transformation( + "[\n x,\n y,\n z\n]", + "[variable(context, source, (4, 5), 'x'), variable(context, source, (9, 10), 'y'), variable(context, source, (14, 15), 'z')]", + vec![ + ("x", 4, 5, (2, 3)), + ("y", 9, 10, (3, 3)), + ("z", 14, 15, (4, 3)), + ], + vec![], + ); + } + #[test] fn test_allow_tuple_empty() { _test_transformation("()", "()", vec![], vec![]); @@ -2416,4 +2462,184 @@ mod tests { vec![], ); } + + // === COMMENT EXTRACTION TESTS === + + fn _token(content: &str, start_index: usize, line: usize, col: usize) -> Token { + Token { + content: content.to_string(), + start_index, + end_index: start_index + content.len(), + line_col: (line, col), + } + } + + #[test] + fn test_simple_comment_extraction() { + let result = transform_expression_string("1 # comment").unwrap(); + assert_eq!( + result.comments, + vec![Comment { + token: _token("# comment", 2, 1, 3), + value: _token(" comment", 3, 1, 4), + }], + ); + } + + #[test] + fn test_comment_at_end_of_input() { + let result = transform_expression_string("42#end").unwrap(); + assert_eq!( + result.comments, + vec![Comment { + token: _token("#end", 2, 1, 3), + value: _token("end", 3, 1, 4), + }], + ); + } + + #[test] + fn 
test_comment_inside_string_not_extracted() { + let result = transform_expression_string(r#""text # not a comment""#).unwrap(); + assert_eq!(result.comments, vec![]); + } + + #[test] + fn test_comment_after_string_extracted() { + let result = transform_expression_string(r#""t#xt" # this is a comment"#).unwrap(); + assert_eq!( + result.comments, + vec![Comment { + token: _token("# this is a comment", 7, 1, 8), + value: _token(" this is a comment", 8, 1, 9), + }], + ); + } + + #[test] + fn test_multiline_string_with_comment() { + // Newlines in triple-quoted strings are preserved (will work with exec()) + let result = transform_expression_string("\"\"\"my\n#tring\"\"\" # comment").unwrap(); + assert_eq!( + result.comments, + vec![Comment { + token: _token("# comment", 16, 2, 11), + value: _token(" comment", 17, 2, 12), + }], + ); + } + + #[test] + fn test_string_with_escape_sequence() { + let result = transform_expression_string(r##""t#xt \"qu#te\"#" # after"##).unwrap(); + assert_eq!( + result.comments, + vec![Comment { + token: _token("# after", 18, 1, 19), + value: _token(" after", 19, 1, 20), + }], + ); + } + + #[test] + fn test_raw_string_with_escape_sequence() { + let result = transform_expression_string(r##"r"t#xt \"qu#te\"#" # after"##).unwrap(); + assert_eq!( + result.comments, + vec![Comment { + token: _token("# after", 19, 1, 20), + value: _token(" after", 20, 1, 21), + }], + ); + } + + #[test] + fn test_comment_before_string() { + let source = r#"1# comment before"text""#; + let result = transform_expression_string(source).unwrap(); + // When there's no newline, the comment extends to the end of input + // So the comment includes " comment before"text"" + assert_eq!( + result.comments, + vec![Comment { + token: _token(&source[1..], 1, 1, 2), + value: _token(&source[2..], 2, 1, 3), + }], + ); + } + + #[test] + fn test_nested_strings() { + // String containing quote characters + let result = transform_expression_string(r#""ou#ter 'in#ner' 
ou#ter""#).unwrap(); + assert_eq!(result.comments, vec![]); + } + + #[test] + fn test_triple_quotes_in_single_quote_string() { + // Triple quotes inside a single-quote string should not close it + let result = + transform_expression_string(r#"'''te#xt \"\"\" ins#ide\"\"\te#xt2'''"#).unwrap(); + assert_eq!(result.comments, vec![]); + } + + #[test] + fn test_empty_comment() { + let result = transform_expression_string("x #").unwrap(); + assert_eq!( + result.comments, + vec![Comment { + token: _token("#", 2, 1, 3), + value: _token("", 3, 1, 4), + }], + ); + } + + #[test] + fn test_comment_with_carriage_return() { + let result = transform_expression_string("[x, # comment\r\nnext]").unwrap(); + assert_eq!( + result.comments, + vec![Comment { + token: _token("# comment", 4, 1, 5), + value: _token(" comment", 5, 1, 6), + }], + ); + } + + #[test] + fn test_multiline_string_with_escaped_newline() { + // Escaped newline in non-raw string should not be replaced + let result = transform_expression_string(r#""line1\\nline2""#).unwrap(); + assert_eq!(result.comments, vec![]); + } + + #[test] + fn test_complex_expression_with_comments() { + let result = transform_expression_string( + "[ # comment 1\n 1, # comment 2\n 2, # comment 3\n] # comment 4", + ) + .unwrap(); + assert_eq!( + result.comments, + vec![ + Comment { + token: _token("# comment 1", 7, 1, 8), + value: _token(" comment 1", 8, 1, 9), + }, + Comment { + token: _token("# comment 2", 26, 2, 8), + value: _token(" comment 2", 27, 2, 9), + }, + Comment { + token: _token("# comment 3", 45, 3, 8), + value: _token(" comment 3", 46, 3, 9), + }, + Comment { + token: _token("# comment 4", 65, 4, 9), + value: _token(" comment 4", 66, 4, 10), + }, + ], + ); + } } diff --git a/djc_core/djc_safe_eval/eval.py b/djc_core/djc_safe_eval/eval.py index f59ac86..531a22f 100644 --- a/djc_core/djc_safe_eval/eval.py +++ b/djc_core/djc_safe_eval/eval.py @@ -171,13 +171,18 @@ def assign_fn( # Wrap the transformed code in a lambda that captures 
the helper functions
     # This avoids the overhead of calling eval() and creating a dict on each evaluation
-    lambda_code = f"lambda context: ({transformed_code})"
-
+    # NOTE: The lambda is assigned to a variable because we use `exec()` instead of `eval()`
+    # to support triple-quoted multi-line strings (which `eval()` doesn't handle).
+    # And while `eval()` returns its result directly, with `exec()` we need to assign it to a variable.
+    # We wrap with parentheses and newlines to allow multi-line expressions and trailing comments:
+    # the newlines ensure that trailing comments don't swallow the closing parenthesis.
+    lambda_code = f"_eval_expr = lambda context: (\n{transformed_code}\n)"
+    eval_locals = {}
     try:
-        # Compile the code but don't execute it
-        compiled_code = compile(lambda_code, f"Expression <{source}>", "eval")
-        # Actually execute the code
-        eval_func = eval(compiled_code, eval_namespace, {})
+        # Execute the function definition
+        exec(lambda_code, eval_namespace, eval_locals)
+        # Get the function from the locals dict (exec() stores the top-level assignment there, not in eval_namespace)
+        eval_func = eval_locals["_eval_expr"]
     except Exception as e:
         # If the error hasn't been processed by error_context decorator,
         # include the whole expression in the error message (without the "Error in..." prefix)
@@ -189,8 +194,8 @@ def assign_fn(
             e._safe_eval_error_processed = True  # type: ignore[attr-defined]
             raise
 
-    # Return a function that calls the compiled lambda directly
-    # This is much faster than calling eval() on each evaluation
+    # Return a function that calls the compiled function directly
+    # This is much faster than calling exec() on each evaluation
     def evaluate(context: Dict[str, Any]) -> Any:
         """
        Evaluate the compiled expression with the given context.
diff --git a/tests/test_safe_eval.py b/tests/test_safe_eval.py index 366061f..d9cde40 100644 --- a/tests/test_safe_eval.py +++ b/tests/test_safe_eval.py @@ -42,6 +42,20 @@ def method(self, a: int, b: int) -> int: class TestSyntax: + def test_allow_multiline(self): + compiled = safe_eval("[\n 1,\n 2,\n 3\n]") + context = {} + result = compiled(context) + assert result == [1, 2, 3] + assert context == {} + + def test_allow_multiline_with_variables(self): + compiled = safe_eval("[\n x,\n y,\n z\n]") + context = {"x": 10, "y": 20, "z": 30} + result = compiled(context) + assert result == [10, 20, 30] + assert context == {"x": 10, "y": 20, "z": 30} + # === LITERALS === def test_allow_literal_string(self): @@ -1781,7 +1795,9 @@ def test_variable_none(self): def test_syntax_error(self): with pytest.raises( SyntaxError, - match="Unexpected token at the end of an expression at byte range 2..4", + match=re.escape( + "Parse error: Expected an expression or a ']' at byte range 7..8: ')'" + ), ): safe_eval("x := [1)") From bbe4c68dd5f8b59eb934d063b5dcf64bb4de655a Mon Sep 17 00:00:00 2001 From: Juro Oravec Date: Sun, 16 Nov 2025 22:56:25 +0100 Subject: [PATCH 2/3] refactor: add tests to check against parser exploits --- crates/djc-safe-eval/tests/transformer.rs | 32 ++++++++++++++++++++--- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/crates/djc-safe-eval/tests/transformer.rs b/crates/djc-safe-eval/tests/transformer.rs index fc2a944..46bfbde 100644 --- a/crates/djc-safe-eval/tests/transformer.rs +++ b/crates/djc-safe-eval/tests/transformer.rs @@ -89,11 +89,10 @@ mod tests { ); } + // Check if it's possible to escape the multiline restriction by closing the first parenthesis. + // This tries to exploit the fact that we wrap the entire expression in extra parentheses. #[test] - fn test_multiple_lines_escape() { - // NOTE: We wrap the entire Python expression in `(\n{}\n)`, so that inside the expression - // we can use newlines. 
Here we check that it's not possible to escape this simply by closing - // the first parenthesis. + fn test_multiple_lines_escape_1() { let result = transform_expression_string("x)\n\nimport os; os.path.join('a', 'b')\n\n(") .unwrap_err(); assert_eq!( @@ -102,6 +101,31 @@ mod tests { ); } + // This is the most that the logic can be "exploited" - constructing an expression + // that makes use of the hidden parentheses. + // So while `lambda x: x + 2)(5` would raise a syntax error in Python, since we wrap it + // in parentheses, we actually end up with `(lambda x: x + 2)(5)`, which is valid. + #[test] + fn test_multiple_lines_escape_2() { + _test_transformation( + "lambda x: x + 2)(5", + "call(context, source, (0, 20), lambda x: x + 2, 5)", + vec![], + vec![], + ); + } + + // However, it's still not possible to construct an expression that contains a statement, + // as this test shows. + #[test] + fn test_multiple_lines_escape_3() { + let result = transform_expression_string("def fn(x): x + 2)(5").unwrap_err(); + assert_eq!( + result, + "Parse error: Expected an identifier, but found a keyword 'def' that cannot be used here at byte range 0..3: 'def'" + ); + } + #[test] fn test_allow_comments() { _test_transformation("1 # comment", "1", vec![], vec![]); From 13b9106494edca75ca6ac98374f173f63010f897 Mon Sep 17 00:00:00 2001 From: Juro Oravec Date: Sun, 16 Nov 2025 23:09:54 +0100 Subject: [PATCH 3/3] refactor: update cargo.lock --- Cargo.lock | 1 + 1 file changed, 1 insertion(+) diff --git a/Cargo.lock b/Cargo.lock index bc564e4..27f3b83 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -314,6 +314,7 @@ dependencies = [ name = "djc-safe-eval" version = "1.0.0" dependencies = [ + "regex", "ruff_python_ast", "ruff_python_codegen", "ruff_python_parser",