From 7aada4ab0d15d43195aff6bd4a135e1c1c82415a Mon Sep 17 00:00:00 2001 From: David Seddon Date: Fri, 5 Sep 2025 16:53:19 +0100 Subject: [PATCH 1/4] Implement write method for BasicFileSystem --- rust/src/filesystem.rs | 24 +++++++++++++++++++++++ src/grimp/application/ports/filesystem.py | 5 ++++- tests/unit/adaptors/test_filesystem.py | 8 ++++++++ 3 files changed, 36 insertions(+), 1 deletion(-) diff --git a/rust/src/filesystem.rs b/rust/src/filesystem.rs index 4b54781c..fd880f93 100644 --- a/rust/src/filesystem.rs +++ b/rust/src/filesystem.rs @@ -1,4 +1,6 @@ +use std::io::prelude::*; use itertools::Itertools; +use std::fs::File; use pyo3::exceptions::{PyFileNotFoundError, PyTypeError, PyUnicodeDecodeError}; use pyo3::prelude::*; use regex::Regex; @@ -22,6 +24,8 @@ pub trait FileSystem: Send + Sync { fn exists(&self, file_name: &str) -> bool; fn read(&self, file_name: &str) -> PyResult; + + fn write(&mut self, file_name: &str, contents: &str) -> PyResult<()>; } #[derive(Clone)] @@ -129,6 +133,12 @@ impl FileSystem for RealBasicFileSystem { }) } } + + fn write(&mut self, file_name: &str, contents: &str) -> PyResult<()> { + let mut file = File::create(file_name)?; + file.write_all(contents.as_bytes())?; + Ok(()) + } } #[pymethods] @@ -161,6 +171,10 @@ impl PyRealBasicFileSystem { fn read(&self, file_name: &str) -> PyResult { self.inner.read(file_name) } + + fn write(&mut self, file_name: &str, contents: &str) -> PyResult<()> { + self.inner.write(file_name, contents) + } } type FileSystemContents = HashMap; @@ -243,6 +257,12 @@ impl FileSystem for FakeBasicFileSystem { ))), } } + + #[allow(unused_variables)] + fn write(&mut self, file_name: &str, contents: &str) -> PyResult<()> { + self.contents.insert(file_name.to_string(), contents.to_string()); + Ok(()) + } } #[pymethods] @@ -278,6 +298,10 @@ impl PyFakeBasicFileSystem { self.inner.read(file_name) } + fn write(&mut self, file_name: &str, contents: &str) -> PyResult<()> { + self.inner.write(file_name, contents) + } + // Temporary workaround method for Python tests. fn convert_to_basic(&self) -> PyResult { Ok(PyFakeBasicFileSystem { diff --git a/src/grimp/application/ports/filesystem.py b/src/grimp/application/ports/filesystem.py index 6c1bfd9f..60a4416e 100644 --- a/src/grimp/application/ports/filesystem.py +++ b/src/grimp/application/ports/filesystem.py @@ -93,7 +93,7 @@ def convert_to_basic(self) -> BasicFileSystem: class BasicFileSystem(Protocol): """ - A more limited file system, used by the Rust-based scan_for_imports function. + A more limited file system. Having two different file system APIs is an interim approach, allowing us to implement BasicFileSystem in Rust without needing to implement the full range @@ -112,5 +112,8 @@ def split(self, file_name: str) -> Tuple[str, str]: def read(self, file_name: str) -> str: ... + def write(self, file_name: str, contents: str) -> None: + ... + def exists(self, file_name: str) -> bool: ... diff --git a/tests/unit/adaptors/test_filesystem.py b/tests/unit/adaptors/test_filesystem.py index 68d86f45..117e63fd 100644 --- a/tests/unit/adaptors/test_filesystem.py +++ b/tests/unit/adaptors/test_filesystem.py @@ -148,6 +148,14 @@ def test_read(self, file_name, expected_contents): else: assert file_system.read(file_name) == expected_contents + def test_write(self): + some_filename, some_contents = "path/to/some-file.txt", "Some contents." + file_system = self.file_system_cls() + + file_system.write(some_filename, some_contents) + + assert file_system.read(some_filename) == some_contents + class TestFakeFileSystem(_Base): file_system_cls = FakeFileSystem From 33642772192af2fac50e8d18db950a6680944d2a Mon Sep 17 00:00:00 2001 From: David Seddon Date: Fri, 5 Sep 2025 17:04:10 +0100 Subject: [PATCH 2/4] Use BasicFileSystem in the cache Because this is defined in rust, it allows us to push down more logic into Rust. --- src/grimp/adaptors/caching.py | 6 +++--- src/grimp/application/ports/caching.py | 6 +++--- src/grimp/application/usecases.py | 2 +- tests/unit/adaptors/test_caching.py | 18 +++++++++--------- 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/grimp/adaptors/caching.py b/src/grimp/adaptors/caching.py index 13fe9167..328b514a 100644 --- a/src/grimp/adaptors/caching.py +++ b/src/grimp/adaptors/caching.py @@ -4,7 +4,7 @@ import logging from typing import Dict, List, Optional, Set, Tuple, Type -from grimp.application.ports.filesystem import AbstractFileSystem +from grimp.application.ports.filesystem import BasicFileSystem from grimp.application.ports.modulefinder import FoundPackage, ModuleFile from grimp.domain.valueobjects import DirectImport, Module @@ -77,7 +77,7 @@ def __init__(self, *args, namer: Type[CacheFileNamer], **kwargs) -> None: @classmethod def setup( cls, - file_system: AbstractFileSystem, + file_system: BasicFileSystem, found_packages: Set[FoundPackage], include_external_packages: bool, exclude_type_checking_imports: bool = False, @@ -202,7 +202,7 @@ def _read_data_map_file(self) -> Dict[Module, Set[DirectImport]]: ) try: imports_by_module = rust.read_cache_data_map_file( - data_cache_filename, self.file_system.convert_to_basic() + data_cache_filename, self.file_system ) except FileNotFoundError: logger.info(f"No cache file: {data_cache_filename}.") diff --git a/src/grimp/application/ports/caching.py b/src/grimp/application/ports/caching.py index d3f58f9b..6d3aa2bd 100644 --- a/src/grimp/application/ports/caching.py +++ b/src/grimp/application/ports/caching.py @@ -3,7 +3,7 @@ from grimp.application.ports.modulefinder import FoundPackage, ModuleFile from grimp.domain.valueobjects import DirectImport, Module -from .filesystem import AbstractFileSystem +from .filesystem import BasicFileSystem class CacheMiss(Exception): @@ -13,7 +13,7 @@ class CacheMiss(Exception): class Cache: def __init__( self, - file_system: AbstractFileSystem, + file_system: BasicFileSystem, include_external_packages: bool, exclude_type_checking_imports: bool, found_packages: Set[FoundPackage], @@ -31,7 +31,7 @@ def __init__( @classmethod def setup( cls, - file_system: AbstractFileSystem, + file_system: BasicFileSystem, found_packages: Set[FoundPackage], *, include_external_packages: bool, diff --git a/src/grimp/application/usecases.py b/src/grimp/application/usecases.py index b8841f6e..f398f73f 100644 --- a/src/grimp/application/usecases.py +++ b/src/grimp/application/usecases.py @@ -109,7 +109,7 @@ def _scan_packages( if cache_dir is not None: cache_dir_if_supplied = cache_dir if cache_dir != NotSupplied else None cache: caching.Cache = settings.CACHE_CLASS.setup( - file_system=file_system, + file_system=file_system.convert_to_basic(), found_packages=found_packages, include_external_packages=include_external_packages, exclude_type_checking_imports=exclude_type_checking_imports, diff --git a/tests/unit/adaptors/test_caching.py b/tests/unit/adaptors/test_caching.py index 8b733850..f96a3fc4 100644 --- a/tests/unit/adaptors/test_caching.py +++ b/tests/unit/adaptors/test_caching.py @@ -203,7 +203,7 @@ class TestCache: ] }""", }, - ) + ).convert_to_basic() MODULE_FILE_UNMODIFIED = ModuleFile( module=Module("mypackage.foo.unmodified"), mtime=SOME_MTIME ) @@ -250,7 +250,7 @@ def test_logs_missing_cache_files(self, caplog): caplog.set_level(logging.INFO, logger=Cache.__module__) Cache.setup( - file_system=FakeFileSystem(), # No cache files. + file_system=FakeFileSystem().convert_to_basic(), # No cache files. found_packages=self.FOUND_PACKAGES, namer=SimplisticFileNamer, include_external_packages=False, @@ -278,7 +278,7 @@ def test_logs_corrupt_cache_meta_file_reading(self, serialized_mtime: str, caplo }}""", ".grimp_cache/mypackage.data.json": "{}", }, - ) + ).convert_to_basic() Cache.setup( file_system=file_system, found_packages=self.FOUND_PACKAGES, @@ -306,7 +306,7 @@ def test_logs_corrupt_cache_data_file_reading(self, caplog): }}""", ".grimp_cache/mypackage.data.json": "INVALID JSON", }, - ) + ).convert_to_basic() Cache.setup( file_system=file_system, @@ -408,7 +408,7 @@ def test_raises_cache_miss_for_missing_module_from_data(self): }}""", ".grimp_cache/mypackage.data.json": """{}""", }, - ) + ).convert_to_basic() module_file = ModuleFile(module=Module("mypackage.somemodule"), mtime=self.SOME_MTIME) cache = Cache.setup( file_system=file_system, @@ -451,7 +451,7 @@ def test_raises_cache_miss_for_corrupt_meta_file(self, serialized_mtime): ] }""", }, - ) + ).convert_to_basic() cache = Cache.setup( file_system=file_system, found_packages=self.FOUND_PACKAGES, @@ -487,7 +487,7 @@ def test_raises_cache_miss_for_corrupt_data_file(self, serialized_import): ] }}""", }, - ) + ).convert_to_basic() cache = Cache.setup( file_system=file_system, found_packages=self.FOUND_PACKAGES, @@ -566,7 +566,7 @@ def test_write_to_cache( self, include_external_packages, expected_data_file_name, cache_dir, caplog ): caplog.set_level(logging.INFO, logger=Cache.__module__) - file_system = FakeFileSystem() + file_system = FakeFileSystem().convert_to_basic() blue_one = Module(name="blue.one") blue_two = Module(name="blue.two") green_one = Module(name="green.one") @@ -673,7 +673,7 @@ def test_write_to_cache( def test_write_to_cache_adds_marker_files(self): some_cache_dir = "/tmp/some-cache-dir" - file_system = FakeFileSystem() + file_system = FakeFileSystem().convert_to_basic() cache = Cache.setup( file_system=file_system, cache_dir=some_cache_dir, From f8725948f09c94d26e18331a3e25be2ee4e62858 Mon Sep 17 00:00:00 2001 From: David Seddon Date: Fri, 5 Sep 2025 17:59:19 +0100 Subject: [PATCH 3/4] WIP Trying to get writing to work Tests are failing because the file system isn't stateful between calls across the boundary (I think because we clone the inner file system when passing it in). Is this inevitable? --- rust/src/caching.rs | 39 +++++++++++++++++++++++++++++ rust/src/filesystem.rs | 7 +++++- rust/src/lib.rs | 3 ++- src/grimp/adaptors/caching.py | 29 +++++++++++++-------- tests/unit/adaptors/test_caching.py | 8 ++++-- 5 files changed, 72 insertions(+), 14 deletions(-) diff --git a/rust/src/caching.rs b/rust/src/caching.rs index cb00b417..8d97a276 100644 --- a/rust/src/caching.rs +++ b/rust/src/caching.rs @@ -6,6 +6,30 @@ use pyo3::types::PyDict; use pyo3::{Bound, PyAny, PyResult, Python, pyfunction}; use std::collections::{HashMap, HashSet}; +/// Writes the cache file containing all the imports for a given package. +/// Args: +/// - filename: str +/// - imports_by_module: dict[Module, Set[DirectImport]] +/// - file_system: The file system interface to use. (A BasicFileSystem.) +#[pyfunction] +pub fn write_cache_data_map_file<'py>( + py: Python<'py>, + filename: &str, + imports_by_module: Bound<'py, PyDict>, + file_system: Bound<'py, PyAny>, +) -> PyResult<()> { + eprintln!("About to clone for write."); + let mut file_system_boxed = get_file_system_boxed(&file_system)?; + + let imports_by_module_rust = imports_by_module_to_rust(imports_by_module); + + let file_contents = serialize_imports_by_module(&imports_by_module_rust); + + file_system_boxed.write(filename, &format!("{} HELLO", &file_contents))?; + + Ok(()) +} + /// Reads the cache file containing all the imports for a given package. /// Args: /// - filename: str @@ -17,6 +41,7 @@ pub fn read_cache_data_map_file<'py>( filename: &str, file_system: Bound<'py, PyAny>, ) -> PyResult> { + eprintln!("About to clone for read."); let file_system_boxed = get_file_system_boxed(&file_system)?; let file_contents = file_system_boxed.read(filename)?; @@ -26,6 +51,20 @@ pub fn read_cache_data_map_file<'py>( Ok(imports_by_module_to_py(py, imports_by_module)) } +#[allow(unused_variables)] +fn imports_by_module_to_rust( + imports_by_module: Bound, +) -> HashMap> { + HashMap::new() +} + +#[allow(unused_variables)] +fn serialize_imports_by_module( + imports_by_module: &HashMap>, +) -> String { + "".to_string() +} + pub fn parse_json_to_map( json_str: &str, filename: &str, diff --git a/rust/src/filesystem.rs b/rust/src/filesystem.rs index fd880f93..84d7254c 100644 --- a/rust/src/filesystem.rs +++ b/rust/src/filesystem.rs @@ -250,6 +250,7 @@ impl FileSystem for FakeBasicFileSystem { } fn read(&self, file_name: &str) -> PyResult { + eprintln!("{:?}", &self.contents.keys()); match self.contents.get(file_name) { Some(file_name) => Ok(file_name.clone()), None => Err(PyFileNotFoundError::new_err(format!( @@ -260,7 +261,10 @@ impl FileSystem for FakeBasicFileSystem { #[allow(unused_variables)] fn write(&mut self, file_name: &str, contents: &str) -> PyResult<()> { + eprintln!("Writing into fake {}, {}", file_name, contents); + eprintln!("Contents currently {:?}", self.contents); self.contents.insert(file_name.to_string(), contents.to_string()); + eprintln!("Contents now {:?}", self.contents); Ok(()) } } @@ -405,7 +409,7 @@ pub fn get_file_system_boxed<'py>( file_system: &Bound<'py, PyAny>, ) -> PyResult> { let file_system_boxed: Box; - + eprintln!("Cloning file system."); if let Ok(py_real) = file_system.extract::>() { file_system_boxed = Box::new(py_real.inner.clone()); } else if let Ok(py_fake) = file_system.extract::>() { @@ -415,5 +419,6 @@ pub fn get_file_system_boxed<'py>( "file_system must be an instance of RealBasicFileSystem or FakeBasicFileSystem", )); } + Ok(file_system_boxed) } diff --git a/rust/src/lib.rs b/rust/src/lib.rs index 5a42f9cb..32b53342 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -8,7 +8,7 @@ mod import_scanning; pub mod module_expressions; mod module_finding; -use crate::caching::read_cache_data_map_file; +use crate::caching::{read_cache_data_map_file,write_cache_data_map_file}; use crate::errors::{GrimpError, GrimpResult}; use crate::exceptions::{ CorruptCache, InvalidModuleExpression, ModuleNotPresent, NoSuchContainer, ParseError, @@ -32,6 +32,7 @@ use std::collections::HashSet; #[pymodule] fn _rustgrimp(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(scan_for_imports))?; + m.add_wrapped(wrap_pyfunction!(write_cache_data_map_file))?; m.add_wrapped(wrap_pyfunction!(read_cache_data_map_file))?; m.add_class::()?; m.add_class::()?; diff --git a/src/grimp/adaptors/caching.py b/src/grimp/adaptors/caching.py index 328b514a..956b6adb 100644 --- a/src/grimp/adaptors/caching.py +++ b/src/grimp/adaptors/caching.py @@ -120,8 +120,24 @@ def write( self, imports_by_module: Dict[Module, Set[DirectImport]], ) -> None: - self._write_marker_files_if_not_already_there() + # Write data file. + data_cache_filename = self.file_system.join( + self.cache_dir, + self._namer.make_data_file_name( + found_packages=self.found_packages, + include_external_packages=self.include_external_packages, + exclude_type_checking_imports=self.exclude_type_checking_imports, + ), + ) + # Rust version + rust.write_cache_data_map_file( + filename=data_cache_filename, + imports_by_module=imports_by_module, + file_system=self.file_system, + ) + self._write_marker_files_if_not_already_there() + # Python version primitives_map: PrimitiveFormat = {} for found_package in self.found_packages: primitives_map_for_found_package: PrimitiveFormat = { @@ -138,15 +154,8 @@ def write( primitives_map.update(primitives_map_for_found_package) serialized = json.dumps(primitives_map) - data_cache_filename = self.file_system.join( - self.cache_dir, - self._namer.make_data_file_name( - found_packages=self.found_packages, - include_external_packages=self.include_external_packages, - exclude_type_checking_imports=self.exclude_type_checking_imports, - ), - ) - self.file_system.write(data_cache_filename, serialized) + # self.file_system.write(data_cache_filename, serialized) + logger.info(f"Wrote data cache file {data_cache_filename}.") # Write meta files. diff --git a/tests/unit/adaptors/test_caching.py b/tests/unit/adaptors/test_caching.py index f96a3fc4..baa771ae 100644 --- a/tests/unit/adaptors/test_caching.py +++ b/tests/unit/adaptors/test_caching.py @@ -554,12 +554,16 @@ def test_uses_cache_multiple_packages( | expected_additional_imports ) - @pytest.mark.parametrize("cache_dir", ("/tmp/some-cache-dir", "/tmp/some-cache-dir/", None)) + @pytest.mark.parametrize("cache_dir", ( + "/tmp/some-cache-dir", + # "/tmp/some-cache-dir/", + # None, + )) @pytest.mark.parametrize( "include_external_packages, expected_data_file_name", ( (False, "blue,green.data.json"), - (True, "blue,green:external.data.json"), + #(True, "blue,green:external.data.json"), ), ) def test_write_to_cache( From 1386977393183ee7d0eade5cd28056a2cec7fd1a Mon Sep 17 00:00:00 2001 From: David Seddon Date: Fri, 5 Sep 2025 18:39:43 +0100 Subject: [PATCH 4/4] WIP Write json with rust There are a couple of problems with this: - It's incompatible with the old json format for some reason. - The unit tests don't work because the file system that's passed in doesn't maintain its state between the two read / write calls. --- rust/src/caching.rs | 51 ++++++++++++++++++++++++++++++++----- rust/src/filesystem.rs | 2 +- rust/src/import_scanning.rs | 23 ++++++++++++++--- rust/src/module_finding.rs | 1 + 4 files changed, 66 insertions(+), 11 deletions(-) diff --git a/rust/src/caching.rs b/rust/src/caching.rs index 8d97a276..e2abf021 100644 --- a/rust/src/caching.rs +++ b/rust/src/caching.rs @@ -2,9 +2,12 @@ use crate::errors::{GrimpError, GrimpResult}; use crate::filesystem::get_file_system_boxed; use crate::import_scanning::{DirectImport, imports_by_module_to_py}; use crate::module_finding::Module; -use pyo3::types::PyDict; +use pyo3::types::PyAnyMethods; +use pyo3::types::{PyDict, PySet}; +use pyo3::types::{PyDictMethods, PySetMethods}; use pyo3::{Bound, PyAny, PyResult, Python, pyfunction}; use std::collections::{HashMap, HashSet}; +use serde_json; /// Writes the cache file containing all the imports for a given package. /// Args: @@ -20,12 +23,12 @@ pub fn write_cache_data_map_file<'py>( ) -> PyResult<()> { eprintln!("About to clone for write."); let mut file_system_boxed = get_file_system_boxed(&file_system)?; - + let imports_by_module_rust = imports_by_module_to_rust(imports_by_module); - + let file_contents = serialize_imports_by_module(&imports_by_module_rust); - file_system_boxed.write(filename, &format!("{} HELLO", &file_contents))?; + file_system_boxed.write(filename, &file_contents)?; Ok(()) } @@ -53,16 +56,50 @@ pub fn read_cache_data_map_file<'py>( #[allow(unused_variables)] fn imports_by_module_to_rust( - imports_by_module: Bound, + imports_by_module_py: Bound, ) -> HashMap> { - HashMap::new() + let mut imports_by_module_rust = HashMap::new(); + + for (py_key, py_value) in imports_by_module_py.iter() { + let module: Module = py_key.extract().unwrap(); + let py_set = py_value + .downcast::() + .expect("Expected value to be a Python set."); + let mut hashset: HashSet = HashSet::new(); + for element in py_set.iter() { + let direct_import: DirectImport = element + .extract() + .expect("Expected value to be DirectImport."); + hashset.insert(direct_import); + } + imports_by_module_rust.insert(module, hashset); + } + + imports_by_module_rust } #[allow(unused_variables)] fn serialize_imports_by_module( imports_by_module: &HashMap>, ) -> String { - "".to_string() + let raw_map: HashMap<&str, Vec<(&str, usize, &str)>> = imports_by_module + .iter() + .map(|(module, imports)| { + let imports_vec: Vec<(&str, usize, &str)> = imports + .iter() + .map(|import| { + ( + import.imported.as_str(), + import.line_number, + import.line_contents.as_str(), + ) + }) + .collect(); + (module.name.as_str(), imports_vec) + }) + .collect(); + + serde_json::to_string(&raw_map).expect("Failed to serialize to JSON") } pub fn parse_json_to_map( diff --git a/rust/src/filesystem.rs b/rust/src/filesystem.rs index 84d7254c..ec1bd3f5 100644 --- a/rust/src/filesystem.rs +++ b/rust/src/filesystem.rs @@ -419,6 +419,6 @@ pub fn get_file_system_boxed<'py>( "file_system must be an instance of RealBasicFileSystem or FakeBasicFileSystem", )); } - + Ok(file_system_boxed) } diff --git a/rust/src/import_scanning.rs b/rust/src/import_scanning.rs index 1dec3f45..c5f5dd3a 100644 --- a/rust/src/import_scanning.rs +++ b/rust/src/import_scanning.rs @@ -1,13 +1,13 @@ use crate::errors::GrimpResult; use crate::filesystem::FileSystem; -use crate::module_finding::{FoundPackage, Module}; +use crate::module_finding::{FoundPackage, Module, ModuleFile}; use crate::{import_parsing, module_finding}; use itertools::Itertools; use pyo3::prelude::*; -use pyo3::types::{PyDict, PySet}; +use pyo3::types::{PyDict, PyFrozenSet, PySet}; /// Statically analyses some Python modules for import statements within their shared package. use rayon::prelude::*; -use std::collections::{HashMap, HashSet}; +use std::collections::{BTreeSet, HashMap, HashSet}; use std::io::{self, ErrorKind}; #[derive(Debug, Hash, Eq, PartialEq)] @@ -18,6 +18,23 @@ pub struct DirectImport { pub line_contents: String, } +impl<'py> FromPyObject<'py> for DirectImport { + fn extract_bound(ob: &Bound<'py, PyAny>) -> PyResult { + let importer: String = ob.getattr("importer")?.getattr("name")?.extract()?; + let imported: String = ob.getattr("imported")?.getattr("name")?.extract()?; + let line_number: usize = ob.getattr("line_number")?.extract()?; + let line_contents: String = ob.getattr("line_contents")?.extract()?; + + Ok(DirectImport { + importer, + imported, + line_number, + line_contents, + }) + } +} + + pub fn py_found_packages_to_rust(py_found_packages: &Bound<'_, PyAny>) -> HashSet { let py_set = py_found_packages .downcast::() diff --git a/rust/src/module_finding.rs b/rust/src/module_finding.rs index 4034b265..b1262697 100644 --- a/rust/src/module_finding.rs +++ b/rust/src/module_finding.rs @@ -20,6 +20,7 @@ impl fmt::Display for Module { } } + #[derive(Debug, Clone, PartialEq, Eq, Hash)] /// Set of modules found under a single package, together with metadata. pub struct FoundPackage {