From 04c2733d5714b1a71394df9760725f56e062dc46 Mon Sep 17 00:00:00 2001 From: Alan Singfield Date: Sat, 8 Feb 2020 11:14:42 +0000 Subject: [PATCH 01/26] WIP - matching files for rename --- .../FileSystemState.cs | 88 +++++++++++++++++-- 1 file changed, 82 insertions(+), 6 deletions(-) diff --git a/YellowCounter.FileSystemState/FileSystemState.cs b/YellowCounter.FileSystemState/FileSystemState.cs index 7191809..5cf55af 100644 --- a/YellowCounter.FileSystemState/FileSystemState.cs +++ b/YellowCounter.FileSystemState/FileSystemState.cs @@ -4,6 +4,7 @@ using System.IO.Enumeration; using System.Runtime.InteropServices; using System.Runtime.Serialization.Formatters.Binary; +using System.Linq; namespace YellowCounter.FileSystemState { @@ -49,21 +50,96 @@ public FileChangeList GetChanges() { _version++; + FileChangeList rawChanges = GetCreatesAndChanges(); + + var removals = GetRemovalsX(); //.ToLookup(x => (x.LastWriteTimeUtc, x.Length)); + + // Look at all created files. + // Can we find removed files which match on lastwrite / length? + // These are probably renames. + // TODO - same directory different name OR same name different directory + var renames = rawChanges + .Where(x => x.ChangeType == WatcherChangeTypes.Created) + .Select(x => new { + FileChange = x, + State = _state.Get(x.Directory, x.Name) + }) + .GroupJoin(removals, + x => new { x.State.LastWriteTimeUtc, x.State.Length }, + x => new { x.LastWriteTimeUtc, x.Length }, + (x, y) => new { NewFile = x, OldFile = y.First() }) + .ToList(); + + var adds = rawChanges + .Where(x => x.ChangeType == WatcherChangeTypes.Created) + .Except(renames.Select(x => x.NewFile.FileChange)); + + var removes = removals.Except(renames.Select(x => x.OldFile)) + .Select(x => new FileChange(x.Directory, x.Path, WatcherChangeTypes.Deleted)); + + + + GetRenames(rawChanges); + + //List<(string directory, string path)> removals = GetRemovals(); + //foreach(var (directory, path) in removals) + //{ + // rawChanges.AddRemoved(directory, path); + // _state.Remove(directory, path); + //} + + // Clear out the files that have been removed or renamed from our state. + foreach(var r in removals) + { + _state.Remove(r.Directory, r.Path); + } + + return rawChanges; + } + + private FileChangeList GetCreatesAndChanges() + { var enumerator = new FileSystemChangeEnumerator(this); - while (enumerator.MoveNext()) + while(enumerator.MoveNext()) { // Ignore `.Current` } var changes = enumerator.Changes; + return changes; + } - List<(string directory, string path)> removals = GetRemovals(); - foreach (var (directory, path) in removals) + private void GetRenames(FileChangeList changes) + { + + foreach(var value in _state.Values) { - changes.AddRemoved(directory, path); - _state.Remove(directory, path); + // Find files in our state that have not been marked (have gone missing) + if(value.Version != _version) + { + // Is there another file in there with the same lastwrite and length? + // That's what we've renamed it to. + var renamedTo = _state.Keys + //.Where(x => x.directory == value.Directory) + .Select(x => _state[x]) + .Where(x => x.LastWriteTimeUtc == value.LastWriteTimeUtc && x.Length == value.Length + && (x.Path != value.Path || x.Directory != value.Directory)) + .FirstOrDefault(); + + // changes.Remove( + _state.Remove(value.Directory, value.Path); + } } + } - return changes; + private IEnumerable GetRemovalsX() + { + foreach(var value in _state.Values) + { + if(value.Version != _version) + { + yield return value; + } + } } private List<(string directory, string path)> GetRemovals() From 8b0bc64296f0ec4d2517b32ac0081b876b5fbe5f Mon Sep 17 00:00:00 2001 From: Alan Singfield Date: Sat, 8 Feb 2020 12:34:09 +0000 Subject: [PATCH 02/26] Doesn't regress, but does it work? --- YellowCounter.FileSystemState/FileChange.cs | 10 +++ .../FileChangeList.cs | 2 + .../FileSystemState.cs | 78 +++++++++++++------ 3 files changed, 67 insertions(+), 23 deletions(-) diff --git a/YellowCounter.FileSystemState/FileChange.cs b/YellowCounter.FileSystemState/FileChange.cs index 5baa7e5..a87bceb 100644 --- a/YellowCounter.FileSystemState/FileChange.cs +++ b/YellowCounter.FileSystemState/FileChange.cs @@ -14,9 +14,19 @@ internal FileChange(string directory, string path, WatcherChangeTypes type) Name = path; ChangeType = type; } + internal FileChange(string directory, string path, WatcherChangeTypes type, string oldDirectory, string oldName) + { + Directory = directory; + Name = path; + ChangeType = type; + OldDirectory = oldDirectory; + OldName = oldName; + } public string Directory { get; } public string Name { get; } + public string OldDirectory { get; } + public string OldName { get; } public WatcherChangeTypes ChangeType { get; } } } diff --git a/YellowCounter.FileSystemState/FileChangeList.cs b/YellowCounter.FileSystemState/FileChangeList.cs index a911185..5a8d64f 100644 --- a/YellowCounter.FileSystemState/FileChangeList.cs +++ b/YellowCounter.FileSystemState/FileChangeList.cs @@ -10,5 +10,7 @@ public class FileChangeList : List internal void AddChanged(string directory, string path) => Add(new FileChange(directory, path, WatcherChangeTypes.Changed)); internal void AddRemoved(string directory, string path) => Add(new FileChange(directory, path, WatcherChangeTypes.Deleted)); + internal void AddRenamed(string directory, string path, string oldDirectory, string oldPath) => + Add(new FileChange(directory, path, WatcherChangeTypes.Renamed, oldDirectory, oldPath)); } } diff --git a/YellowCounter.FileSystemState/FileSystemState.cs b/YellowCounter.FileSystemState/FileSystemState.cs index 5cf55af..129a0a3 100644 --- a/YellowCounter.FileSystemState/FileSystemState.cs +++ b/YellowCounter.FileSystemState/FileSystemState.cs @@ -50,43 +50,68 @@ public FileChangeList GetChanges() { _version++; - FileChangeList rawChanges = GetCreatesAndChanges(); + var rawChanges = GetCreatesAndChanges(); - var removals = GetRemovalsX(); //.ToLookup(x => (x.LastWriteTimeUtc, x.Length)); + var removals = GetRemovalsX().ToList(); //.ToLookup(x => (x.LastWriteTimeUtc, x.Length)); - // Look at all created files. - // Can we find removed files which match on lastwrite / length? - // These are probably renames. - // TODO - same directory different name OR same name different directory - var renames = rawChanges + var createsByTime = rawChanges .Where(x => x.ChangeType == WatcherChangeTypes.Created) .Select(x => new { FileChange = x, State = _state.Get(x.Directory, x.Name) }) - .GroupJoin(removals, - x => new { x.State.LastWriteTimeUtc, x.State.Length }, - x => new { x.LastWriteTimeUtc, x.Length }, - (x, y) => new { NewFile = x, OldFile = y.First() }) + .GroupBy(x => new + { + // Group by last write time, length and directory + x.State.LastWriteTimeUtc, + x.State.Length, + x.State.Path + }, + (x, y) => new + { + // Return key fields, and list of all created files for the + // given (time, length, path) key + x.LastWriteTimeUtc, + x.Length, + x.Path, + Creates = y.ToList() + }); + + var removesByTime = removals + .GroupBy(x => new { x.LastWriteTimeUtc, x.Length, x.Path }, + (x, y) => new { x.LastWriteTimeUtc, x.Length, x.Path, Removes = y.ToList() }); + + // Join creates and removes by (time, length, directory), then filter to + // only those matches which are unambiguous. + var renames = createsByTime.Join(removesByTime, + x => new { x.LastWriteTimeUtc, x.Length, x.Path }, + x => new { x.LastWriteTimeUtc, x.Length, x.Path }, + (x, y) => new { x.Creates, y.Removes } + ) + .Where(x => x.Creates.Count == 1 && x.Removes.Count == 1) + .Select(x => new + { + NewFile = x.Creates[0], + OldFile = x.Removes[0] + }) .ToList(); var adds = rawChanges .Where(x => x.ChangeType == WatcherChangeTypes.Created) .Except(renames.Select(x => x.NewFile.FileChange)); - var removes = removals.Except(renames.Select(x => x.OldFile)) - .Select(x => new FileChange(x.Directory, x.Path, WatcherChangeTypes.Deleted)); - + var changes = rawChanges + .Where(x => x.ChangeType == WatcherChangeTypes.Changed); + var removes = removals + .Except(renames.Select(x => x.OldFile)) + .Select(x => new FileChange(x.Directory, x.Path, WatcherChangeTypes.Deleted)); - GetRenames(rawChanges); - - //List<(string directory, string path)> removals = GetRemovals(); - //foreach(var (directory, path) in removals) - //{ - // rawChanges.AddRemoved(directory, path); - // _state.Remove(directory, path); - //} + var renames2 = renames.Select(x => new FileChange(x.NewFile.FileChange.Directory, + x.NewFile.FileChange.Name, + WatcherChangeTypes.Renamed, + x.OldFile.Directory, + x.OldFile.Path)); // Clear out the files that have been removed or renamed from our state. foreach(var r in removals) @@ -94,7 +119,14 @@ public FileChangeList GetChanges() _state.Remove(r.Directory, r.Path); } - return rawChanges; + var result = new FileChangeList(); + + result.AddRange(adds); + result.AddRange(changes); + result.AddRange(removes); + result.AddRange(renames2); + + return result; } private FileChangeList GetCreatesAndChanges() From ea9110381f53959b92162e7689cd20de34dfba32 Mon Sep 17 00:00:00 2001 From: Alan Singfield Date: Sat, 8 Feb 2020 12:51:35 +0000 Subject: [PATCH 03/26] Renaming test working --- .../UnitTests.cs | 36 ++++++++++++++ .../FileSystemState.cs | 47 +++++++++++-------- 2 files changed, 63 insertions(+), 20 deletions(-) diff --git a/YellowCounter.FileSystemState.Tests/UnitTests.cs b/YellowCounter.FileSystemState.Tests/UnitTests.cs index bffd142..37b4010 100644 --- a/YellowCounter.FileSystemState.Tests/UnitTests.cs +++ b/YellowCounter.FileSystemState.Tests/UnitTests.cs @@ -126,6 +126,42 @@ public static void FileSystemWatcher_Changed_File() } } + + + [Fact] + public static void FileSystemWatcher_Renamed_File() + { + string currentDir = Utility.GetRandomDirectory(); + string fileName = Path.GetRandomFileName(); + string newName = Path.GetRandomFileName(); + string fullName = Path.Combine(currentDir, fileName); + + + FileSystemState watcher = new FileSystemState(currentDir); + + using(FileStream file = File.Create(fullName)) { } + watcher.LoadState(); + + File.Move(fullName, Path.Combine(currentDir, newName)); + + var changes = watcher.GetChanges(); + + try + { + Assert.Single(changes); + FileChange change = changes[0]; + Assert.Equal(WatcherChangeTypes.Renamed, change.ChangeType); + Assert.Equal(fileName, change.OldName); + Assert.Equal(currentDir, change.OldDirectory); + Assert.Equal(newName, change.Name); + Assert.Equal(currentDir, change.Directory); + } + finally + { + Directory.Delete(currentDir, true); + } + } + [Fact] public static void FileSystemWatcher_Filter() { diff --git a/YellowCounter.FileSystemState/FileSystemState.cs b/YellowCounter.FileSystemState/FileSystemState.cs index 129a0a3..7e18809 100644 --- a/YellowCounter.FileSystemState/FileSystemState.cs +++ b/YellowCounter.FileSystemState/FileSystemState.cs @@ -56,36 +56,39 @@ public FileChangeList GetChanges() var createsByTime = rawChanges .Where(x => x.ChangeType == WatcherChangeTypes.Created) - .Select(x => new { + .Select(x => new + { FileChange = x, State = _state.Get(x.Directory, x.Name) }) - .GroupBy(x => new + .GroupBy(x => new + { + // Group by last write time, length and directory + x.State.LastWriteTimeUtc, + x.State.Length, + x.State.Directory + }, + (x, y) => new { - // Group by last write time, length and directory - x.State.LastWriteTimeUtc, - x.State.Length, - x.State.Path - }, - (x, y) => new - { // Return key fields, and list of all created files for the // given (time, length, path) key x.LastWriteTimeUtc, x.Length, - x.Path, - Creates = y.ToList() - }); + x.Directory, + Creates = y.ToList() + }) + .ToList(); var removesByTime = removals - .GroupBy(x => new { x.LastWriteTimeUtc, x.Length, x.Path }, - (x, y) => new { x.LastWriteTimeUtc, x.Length, x.Path, Removes = y.ToList() }); + .GroupBy(x => new { x.LastWriteTimeUtc, x.Length, x.Directory }, + (x, y) => new { x.LastWriteTimeUtc, x.Length, x.Directory, Removes = y.ToList() }) + .ToList(); // Join creates and removes by (time, length, directory), then filter to // only those matches which are unambiguous. var renames = createsByTime.Join(removesByTime, - x => new { x.LastWriteTimeUtc, x.Length, x.Path }, - x => new { x.LastWriteTimeUtc, x.Length, x.Path }, + x => new { x.LastWriteTimeUtc, x.Length, x.Directory }, + x => new { x.LastWriteTimeUtc, x.Length, x.Directory }, (x, y) => new { x.Creates, y.Removes } ) .Where(x => x.Creates.Count == 1 && x.Removes.Count == 1) @@ -98,20 +101,24 @@ public FileChangeList GetChanges() var adds = rawChanges .Where(x => x.ChangeType == WatcherChangeTypes.Created) - .Except(renames.Select(x => x.NewFile.FileChange)); + .Except(renames.Select(x => x.NewFile.FileChange)) + .ToList(); var changes = rawChanges - .Where(x => x.ChangeType == WatcherChangeTypes.Changed); + .Where(x => x.ChangeType == WatcherChangeTypes.Changed) + .ToList(); var removes = removals .Except(renames.Select(x => x.OldFile)) - .Select(x => new FileChange(x.Directory, x.Path, WatcherChangeTypes.Deleted)); + .Select(x => new FileChange(x.Directory, x.Path, WatcherChangeTypes.Deleted)) + .ToList(); var renames2 = renames.Select(x => new FileChange(x.NewFile.FileChange.Directory, x.NewFile.FileChange.Name, WatcherChangeTypes.Renamed, x.OldFile.Directory, - x.OldFile.Path)); + x.OldFile.Path)) + .ToList(); // Clear out the files that have been removed or renamed from our state. foreach(var r in removals) From 78c9777edd6c670e56b2aba101a9fa33bea8af99 Mon Sep 17 00:00:00 2001 From: Alan Singfield Date: Sat, 8 Feb 2020 15:04:55 +0000 Subject: [PATCH 04/26] Removed dead code --- .../FileSystemState.cs | 42 ++----------------- 1 file changed, 4 insertions(+), 38 deletions(-) diff --git a/YellowCounter.FileSystemState/FileSystemState.cs b/YellowCounter.FileSystemState/FileSystemState.cs index 7e18809..2bba1a9 100644 --- a/YellowCounter.FileSystemState/FileSystemState.cs +++ b/YellowCounter.FileSystemState/FileSystemState.cs @@ -52,7 +52,7 @@ public FileChangeList GetChanges() var rawChanges = GetCreatesAndChanges(); - var removals = GetRemovalsX().ToList(); //.ToLookup(x => (x.LastWriteTimeUtc, x.Length)); + var removals = GetRemovals().ToList(); var createsByTime = rawChanges .Where(x => x.ChangeType == WatcherChangeTypes.Created) @@ -120,6 +120,7 @@ public FileChangeList GetChanges() x.OldFile.Path)) .ToList(); + // Clear out the files that have been removed or renamed from our state. foreach(var r in removals) { @@ -147,30 +148,7 @@ private FileChangeList GetCreatesAndChanges() return changes; } - private void GetRenames(FileChangeList changes) - { - - foreach(var value in _state.Values) - { - // Find files in our state that have not been marked (have gone missing) - if(value.Version != _version) - { - // Is there another file in there with the same lastwrite and length? - // That's what we've renamed it to. - var renamedTo = _state.Keys - //.Where(x => x.directory == value.Directory) - .Select(x => _state[x]) - .Where(x => x.LastWriteTimeUtc == value.LastWriteTimeUtc && x.Length == value.Length - && (x.Path != value.Path || x.Directory != value.Directory)) - .FirstOrDefault(); - - // changes.Remove( - _state.Remove(value.Directory, value.Path); - } - } - } - - private IEnumerable GetRemovalsX() + private IEnumerable GetRemovals() { foreach(var value in _state.Values) { @@ -179,22 +157,10 @@ private IEnumerable GetRemovalsX() yield return value; } } - } - private List<(string directory, string path)> GetRemovals() - { - List<(string, string)> removals = new List<(string, string)>(); - foreach (var value in _state.Values) - { - if (value.Version != _version) - { - removals.Add((value.Directory, value.Path)); - } - } - - return removals; } + protected internal virtual void DetermineChange(string directory, ref FileChangeList changes, ref FileSystemEntry file) { string path = file.FileName.ToString(); From 960a52b85a98d3ed3ac59dd918b74c31680782f1 Mon Sep 17 00:00:00 2001 From: Alan Singfield Date: Sat, 8 Feb 2020 16:25:32 +0000 Subject: [PATCH 05/26] Store state in HashSet not dictionary. Removed serialization for now - think deserves a better format. --- .../SerializableTests.cs | 130 +++++++++--------- YellowCounter.FileSystemState/FileState.cs | 9 +- .../FileSystemState.cs | 97 ++++--------- .../PathToFileStateHashtable.cs | 68 ++++++++- 4 files changed, 163 insertions(+), 141 deletions(-) diff --git a/YellowCounter.FileSystemState.Tests/SerializableTests.cs b/YellowCounter.FileSystemState.Tests/SerializableTests.cs index 49f8dbe..cc4be45 100644 --- a/YellowCounter.FileSystemState.Tests/SerializableTests.cs +++ b/YellowCounter.FileSystemState.Tests/SerializableTests.cs @@ -4,83 +4,83 @@ public class FileSystemStateSerializableTests { - [Fact] - public void RoundTripDoesNotAffectOriginalTest() - { - string currentDir = Utility.GetRandomDirectory(); - string fileName = Path.GetRandomFileName() + ".txt"; - string fullName = Path.Combine(currentDir, fileName); + //[Fact] + //public void RoundTripDoesNotAffectOriginalTest() + //{ + // string currentDir = Utility.GetRandomDirectory(); + // string fileName = Path.GetRandomFileName() + ".txt"; + // string fullName = Path.Combine(currentDir, fileName); - FileSystemState state = new FileSystemState(currentDir, "*.csv"); - FileSystemState state2 = new FileSystemState(currentDir, "*.txt"); + // FileSystemState state = new FileSystemState(currentDir, "*.csv"); + // FileSystemState state2 = new FileSystemState(currentDir, "*.txt"); - state.LoadState(); - RoundTrip(state, state2); + // state.LoadState(); + // RoundTrip(state, state2); - using (var file = File.Create(fullName)) { } + // using (var file = File.Create(fullName)) { } - try - { - Assert.Empty(state.GetChanges()); - Assert.Single(state2.GetChanges()); - } - finally - { - Directory.Delete(currentDir, true); - } - } + // try + // { + // Assert.Empty(state.GetChanges()); + // Assert.Single(state2.GetChanges()); + // } + // finally + // { + // Directory.Delete(currentDir, true); + // } + //} - [Fact] - public void RoundTripVersionReset_NoChanges_Test() - { - string currentDir = Utility.GetRandomDirectory(); - string fileName = Path.GetRandomFileName(); - string fullName = Path.Combine(currentDir, fileName); - using (var file = File.Create(fullName)) { } + //[Fact] + //public void RoundTripVersionReset_NoChanges_Test() + //{ + // string currentDir = Utility.GetRandomDirectory(); + // string fileName = Path.GetRandomFileName(); + // string fullName = Path.Combine(currentDir, fileName); + // using (var file = File.Create(fullName)) { } - FileSystemState state = new FileSystemState(currentDir); - state.LoadState(); - state.GetChanges(); + // FileSystemState state = new FileSystemState(currentDir); + // state.LoadState(); + // state.GetChanges(); - FileSystemState state2 = new FileSystemState(currentDir); - RoundTrip(state, state2); + // FileSystemState state2 = new FileSystemState(currentDir); + // RoundTrip(state, state2); - try - { - Assert.Empty(state.GetChanges()); - Assert.Empty(state2.GetChanges()); - } - finally - { - Directory.Delete(currentDir, true); - } - } + // try + // { + // Assert.Empty(state.GetChanges()); + // Assert.Empty(state2.GetChanges()); + // } + // finally + // { + // Directory.Delete(currentDir, true); + // } + //} - [Fact] - public void RoundTripVersionReset_Deletion_Test() - { - string currentDir = Utility.GetRandomDirectory(); - string fileName = Path.GetRandomFileName(); - string fullName = Path.Combine(currentDir, fileName); - using (var file = File.Create(fullName)) { } + //[Fact] + //public void RoundTripVersionReset_Deletion_Test() + //{ + // string currentDir = Utility.GetRandomDirectory(); + // string fileName = Path.GetRandomFileName(); + // string fullName = Path.Combine(currentDir, fileName); + // using (var file = File.Create(fullName)) { } - FileSystemState state = new FileSystemState(currentDir); - state.LoadState(); + // FileSystemState state = new FileSystemState(currentDir); + // state.LoadState(); - FileSystemState state2 = new FileSystemState(currentDir); - RoundTrip(state, state2); - File.Delete(fullName); + // FileSystemState state2 = new FileSystemState(currentDir); + // RoundTrip(state, state2); + // File.Delete(fullName); - try - { - Assert.Single(state.GetChanges()); - Assert.Single(state2.GetChanges()); - } - finally - { - Directory.Delete(currentDir, true); - } - } + // try + // { + // Assert.Single(state.GetChanges()); + // Assert.Single(state2.GetChanges()); + // } + // finally + // { + // Directory.Delete(currentDir, true); + // } + //} private static void RoundTrip(FileSystemState source, FileSystemState destination) { diff --git a/YellowCounter.FileSystemState/FileState.cs b/YellowCounter.FileSystemState/FileState.cs index ddd27a0..79ed4c5 100644 --- a/YellowCounter.FileSystemState/FileState.cs +++ b/YellowCounter.FileSystemState/FileState.cs @@ -9,11 +9,18 @@ namespace YellowCounter.FileSystemState internal class FileState { [NonSerialized] - public long Version; // removal notification are implemented something similar to "mark and sweep". This value is incremented in the mark phase + public long LastSeenVersion; // removal notification are implemented something similar to "mark and sweep". This value is incremented in the mark phase + + [NonSerialized] + public long CreateVersion; + [NonSerialized] + public long ChangeVersion; public string Directory; public string Path; public DateTimeOffset LastWriteTimeUtc; public long Length; + + internal FileState Clone() => (FileState)this.MemberwiseClone(); } } diff --git a/YellowCounter.FileSystemState/FileSystemState.cs b/YellowCounter.FileSystemState/FileSystemState.cs index 2bba1a9..606f6fd 100644 --- a/YellowCounter.FileSystemState/FileSystemState.cs +++ b/YellowCounter.FileSystemState/FileSystemState.cs @@ -48,25 +48,22 @@ public void SaveState(Stream stream) // This function walks all watched files, collects changes, and updates state public FileChangeList GetChanges() { - _version++; + var enumerator = new FileSystemChangeEnumerator(this); + while(enumerator.MoveNext()) { } + + var rawChanges = _state.Read().Where(x => x.LastSeenVersion == _version).ToList(); - var rawChanges = GetCreatesAndChanges(); + var removals = _state.Read().Where(x => x.LastSeenVersion != _version).ToList(); - var removals = GetRemovals().ToList(); var createsByTime = rawChanges - .Where(x => x.ChangeType == WatcherChangeTypes.Created) - .Select(x => new - { - FileChange = x, - State = _state.Get(x.Directory, x.Name) - }) + .Where(x => x.CreateVersion == _version) .GroupBy(x => new { // Group by last write time, length and directory - x.State.LastWriteTimeUtc, - x.State.Length, - x.State.Directory + x.LastWriteTimeUtc, + x.Length, + x.Directory }, (x, y) => new { @@ -100,12 +97,14 @@ public FileChangeList GetChanges() .ToList(); var adds = rawChanges - .Where(x => x.ChangeType == WatcherChangeTypes.Created) - .Except(renames.Select(x => x.NewFile.FileChange)) + .Where(x => x.CreateVersion == _version) + .Except(renames.Select(x => x.NewFile)) + .Select(x => new FileChange(x.Directory, x.Path, WatcherChangeTypes.Created)) .ToList(); var changes = rawChanges - .Where(x => x.ChangeType == WatcherChangeTypes.Changed) + .Where(x => x.ChangeVersion == _version && x.CreateVersion != _version) + .Select(x => new FileChange(x.Directory, x.Path, WatcherChangeTypes.Changed)) .ToList(); var removes = removals @@ -113,19 +112,17 @@ public FileChangeList GetChanges() .Select(x => new FileChange(x.Directory, x.Path, WatcherChangeTypes.Deleted)) .ToList(); - var renames2 = renames.Select(x => new FileChange(x.NewFile.FileChange.Directory, - x.NewFile.FileChange.Name, + var renames2 = renames.Select(x => new FileChange( + x.NewFile.Directory, + x.NewFile.Path, WatcherChangeTypes.Renamed, x.OldFile.Directory, x.OldFile.Path)) .ToList(); - - + // Clear out the files that have been removed or renamed from our state. - foreach(var r in removals) - { - _state.Remove(r.Directory, r.Path); - } + _state.Sweep(_version); + _version++; var result = new FileChangeList(); @@ -137,57 +134,19 @@ public FileChangeList GetChanges() return result; } - private FileChangeList GetCreatesAndChanges() - { - var enumerator = new FileSystemChangeEnumerator(this); - while(enumerator.MoveNext()) - { - // Ignore `.Current` - } - var changes = enumerator.Changes; - return changes; - } - - private IEnumerable GetRemovals() - { - foreach(var value in _state.Values) - { - if(value.Version != _version) - { - yield return value; - } - } - - } - protected internal virtual void DetermineChange(string directory, ref FileChangeList changes, ref FileSystemEntry file) { string path = file.FileName.ToString(); - FileState fileState = _state.Get(directory, path); - if (fileState == null) // file added - { - fileState = new FileState(); - fileState.Directory = directory; - fileState.Path = path; - fileState.LastWriteTimeUtc = file.LastWriteTimeUtc; - fileState.Length = file.Length; - fileState.Version = _version; - _state.Add(directory, path, fileState); - changes.AddAdded(directory, path); - return; - } - - fileState.Version = _version; - - var previousState = fileState; - if (file.LastWriteTimeUtc != fileState.LastWriteTimeUtc || file.Length != fileState.Length) - { - changes.AddChanged(directory, fileState.Path); - fileState.LastWriteTimeUtc = file.LastWriteTimeUtc; - fileState.Length = file.Length; - } + FileState fs = new FileState(); + fs.Directory = directory; + fs.Path = path; + fs.LastWriteTimeUtc = file.LastWriteTimeUtc; + fs.Length = file.Length; + + _state.Mark(fs, _version); + } protected internal virtual bool ShouldIncludeEntry(ref FileSystemEntry entry) diff --git a/YellowCounter.FileSystemState/PathToFileStateHashtable.cs b/YellowCounter.FileSystemState/PathToFileStateHashtable.cs index c544c2a..fbafe80 100644 --- a/YellowCounter.FileSystemState/PathToFileStateHashtable.cs +++ b/YellowCounter.FileSystemState/PathToFileStateHashtable.cs @@ -1,20 +1,76 @@ using System; using System.Collections.Generic; using System.Runtime.Serialization; +using System.Linq; namespace YellowCounter.FileSystemState { [Serializable] - internal class PathToFileStateHashtable : Dictionary<(string directory, string file), FileState>, ISerializable + internal class PathToFileStateHashtable { - public PathToFileStateHashtable() { } + HashSet hash; + public PathToFileStateHashtable() + { + hash = new HashSet(100, new FileStateComparer()); + } - public void Add(string directory, string file, FileState value) => Add((directory, file), value); + public void Mark(FileState input, long version) + { + // Is the file already known to us? + if(hash.TryGetValue(input, out FileState fs)) + { + // Mark that we've seen the file. + fs.LastSeenVersion = version; - public void Remove(string directory, string file) => Remove((directory, file)); + // Has it changed since we last saw it? + if(fs.LastWriteTimeUtc != input.LastWriteTimeUtc + || fs.Length != input.Length) + { + // Mark that this version was a change + fs.ChangeVersion = version; - public FileState Get(string directory, string file) => this.GetValueOrDefault((directory, file)); + // Update the last write time / file length. + fs.LastWriteTimeUtc = input.LastWriteTimeUtc; + fs.Length = input.Length; - protected PathToFileStateHashtable(SerializationInfo info, StreamingContext context) : base(info, context) { } + } + } + else // It's a new file. + { + // Don't futz the input, clone it + FileState fs2 = input.Clone(); + + // Mark that we've seen it + fs2.LastSeenVersion = version; + fs2.CreateVersion = version; + fs2.ChangeVersion = version; + + hash.Add(fs2); + } + } + + internal void Sweep(long version) + { + // Remove the records of files that have been deleted. + hash.RemoveWhere(x => x.LastSeenVersion != version); + } + + public IEnumerable Read() + { + foreach(var x in hash) + yield return x; + } + } + + internal class FileStateComparer : IEqualityComparer + { + // Equivalent if directory and path match. + public bool Equals(FileState x, FileState y) + { + return x.Directory == y.Directory && x.Path == y.Path; + } + + public int GetHashCode(FileState obj) => + HashCode.Combine(obj.Directory.GetHashCode() ^ obj.Path.GetHashCode()); } } From b792d86f3250d9fb4cfc01c33279ddde394557cf Mon Sep 17 00:00:00 2001 From: Alan Singfield Date: Sat, 8 Feb 2020 17:02:27 +0000 Subject: [PATCH 06/26] Refactoring to tidy up --- .../FileSystemState.cs | 143 ++++++++++++------ 1 file changed, 95 insertions(+), 48 deletions(-) diff --git a/YellowCounter.FileSystemState/FileSystemState.cs b/YellowCounter.FileSystemState/FileSystemState.cs index 606f6fd..a3aa618 100644 --- a/YellowCounter.FileSystemState/FileSystemState.cs +++ b/YellowCounter.FileSystemState/FileSystemState.cs @@ -47,17 +47,103 @@ public void SaveState(Stream stream) // This function walks all watched files, collects changes, and updates state public FileChangeList GetChanges() + { + // Get the raw file changes, either create, file change or removal. + var (creates, changes, removals) = getFileChanges(); + + // Match up the creates and removals to get the renames + var renames = matchRenames(creates, removals); + + // Convert to the output format. + var result = convertToFileChanges(creates, changes, removals, renames); + + + return result; + } + + + private void gatherChanges() { var enumerator = new FileSystemChangeEnumerator(this); while(enumerator.MoveNext()) { } + } + + private void acceptChanges() + { + // Clear out the files that have been removed or renamed from our state. + _state.Sweep(_version); + _version++; + } + + private FileChangeList convertToFileChanges( + IEnumerable creates, + IEnumerable changes, + IEnumerable removals, + IEnumerable<(FileState NewFile, FileState OldFile)> renames) + { + var createResults = creates + .Except(renames.Select(x => x.NewFile)) + .Select(x => new FileChange(x.Directory, x.Path, WatcherChangeTypes.Created)) + ; + + var changeResults = changes + .Select(x => new FileChange(x.Directory, x.Path, WatcherChangeTypes.Changed)) + ; + + var removeResults = removals + .Except(renames.Select(x => x.OldFile)) + .Select(x => new FileChange(x.Directory, x.Path, WatcherChangeTypes.Deleted)) + ; + + var renameResults = renames.Select(x => new FileChange( + x.NewFile.Directory, + x.NewFile.Path, + WatcherChangeTypes.Renamed, + x.OldFile.Directory, + x.OldFile.Path)) + ; + + var result = new FileChangeList(); + + result.AddRange(createResults); + result.AddRange(changeResults); + result.AddRange(removeResults); + result.AddRange(renameResults); + + return result; + } - var rawChanges = _state.Read().Where(x => x.LastSeenVersion == _version).ToList(); + private (IEnumerable creates, IEnumerable changes, IEnumerable removals) getFileChanges() + { + var creates = new List(); + var changes = new List(); + var removals = new List(); - var removals = _state.Read().Where(x => x.LastSeenVersion != _version).ToList(); + gatherChanges(); + foreach(var x in _state.Read()) + { + if(x.LastSeenVersion == _version) + { + if(x.CreateVersion == _version) + creates.Add(x); + else + changes.Add(x); + } + else + removals.Add(x); + } + + acceptChanges(); + + return (creates, changes, removals); + } - var createsByTime = rawChanges - .Where(x => x.CreateVersion == _version) + private static IEnumerable<(FileState NewFile, FileState OldFile)> matchRenames( + IEnumerable creates, + IEnumerable removals) + { + var createsByTime = creates .GroupBy(x => new { // Group by last write time, length and directory @@ -83,58 +169,19 @@ public FileChangeList GetChanges() // Join creates and removes by (time, length, directory), then filter to // only those matches which are unambiguous. - var renames = createsByTime.Join(removesByTime, + return createsByTime.Join(removesByTime, x => new { x.LastWriteTimeUtc, x.Length, x.Directory }, x => new { x.LastWriteTimeUtc, x.Length, x.Directory }, (x, y) => new { x.Creates, y.Removes } ) .Where(x => x.Creates.Count == 1 && x.Removes.Count == 1) - .Select(x => new - { - NewFile = x.Creates[0], - OldFile = x.Removes[0] - }) - .ToList(); - - var adds = rawChanges - .Where(x => x.CreateVersion == _version) - .Except(renames.Select(x => x.NewFile)) - .Select(x => new FileChange(x.Directory, x.Path, WatcherChangeTypes.Created)) - .ToList(); - - var changes = rawChanges - .Where(x => x.ChangeVersion == _version && x.CreateVersion != _version) - .Select(x => new FileChange(x.Directory, x.Path, WatcherChangeTypes.Changed)) - .ToList(); - - var removes = removals - .Except(renames.Select(x => x.OldFile)) - .Select(x => new FileChange(x.Directory, x.Path, WatcherChangeTypes.Deleted)) + .Select(x => ( + NewFile: x.Creates[0], + OldFile: x.Removes[0] + )) .ToList(); - - var renames2 = renames.Select(x => new FileChange( - x.NewFile.Directory, - x.NewFile.Path, - WatcherChangeTypes.Renamed, - x.OldFile.Directory, - x.OldFile.Path)) - .ToList(); - - // Clear out the files that have been removed or renamed from our state. - _state.Sweep(_version); - _version++; - - var result = new FileChangeList(); - - result.AddRange(adds); - result.AddRange(changes); - result.AddRange(removes); - result.AddRange(renames2); - - return result; } - protected internal virtual void DetermineChange(string directory, ref FileChangeList changes, ref FileSystemEntry file) { string path = file.FileName.ToString(); From 4f68c37f4370b478f3e27ad82e3771621658dee1 Mon Sep 17 00:00:00 2001 From: Alan Singfield Date: Sat, 8 Feb 2020 17:14:12 +0000 Subject: [PATCH 07/26] Added ability to see files moved from one directory to another. --- .../UnitTests.cs | 37 +++++++++++++++++++ .../FileSystemState.cs | 30 ++++++++++----- 2 files changed, 58 insertions(+), 9 deletions(-) diff --git a/YellowCounter.FileSystemState.Tests/UnitTests.cs b/YellowCounter.FileSystemState.Tests/UnitTests.cs index 37b4010..936c997 100644 --- a/YellowCounter.FileSystemState.Tests/UnitTests.cs +++ b/YellowCounter.FileSystemState.Tests/UnitTests.cs @@ -162,6 +162,43 @@ public static void FileSystemWatcher_Renamed_File() } } + [Fact] + public static void FileSystemWatcher_Renamed_Directory() + { + string currentDir = Utility.GetRandomDirectory(); + string fileName = Path.GetRandomFileName(); + string subDir = Path.Combine(currentDir, "subdir"); + string fullName = Path.Combine(currentDir, fileName); + string newName = Path.Combine(subDir, fileName); + + FileSystemState watcher = new FileSystemState(currentDir, options: new EnumerationOptions() { RecurseSubdirectories = true }); + + Directory.CreateDirectory(subDir); + + using(FileStream file = File.Create(fullName)) { } + watcher.LoadState(); + + File.Move(fullName, Path.Combine(currentDir, newName)); + + var changes = watcher.GetChanges(); + + try + { + Assert.Single(changes); + FileChange change = changes[0]; + Assert.Equal(WatcherChangeTypes.Renamed, change.ChangeType); + Assert.Equal(fileName, change.OldName); + Assert.Equal(currentDir, change.OldDirectory); + Assert.Equal(fileName, change.Name); + Assert.Equal(subDir, change.Directory); + } + finally + { + Directory.Delete(subDir, true); + Directory.Delete(currentDir, true); + } + } + [Fact] public static void FileSystemWatcher_Filter() { diff --git a/YellowCounter.FileSystemState/FileSystemState.cs b/YellowCounter.FileSystemState/FileSystemState.cs index a3aa618..25a520d 100644 --- a/YellowCounter.FileSystemState/FileSystemState.cs +++ b/YellowCounter.FileSystemState/FileSystemState.cs @@ -139,17 +139,29 @@ private FileChangeList convertToFileChanges( return (creates, changes, removals); } - private static IEnumerable<(FileState NewFile, FileState OldFile)> matchRenames( - IEnumerable creates, + private IEnumerable<(FileState NewFile, FileState OldFile)> matchRenames( + IEnumerable creates, IEnumerable removals) + { + // Want to match creates and removals to convert to renames either by: + // Same directory, different name + // or different directory, same name. + return matchRenames(creates, removals, false) + .Concat(matchRenames(creates, removals, true)); + } + + private IEnumerable<(FileState NewFile, FileState OldFile)> matchRenames( + IEnumerable creates, + IEnumerable removals, + bool byName) { var createsByTime = creates .GroupBy(x => new { - // Group by last write time, length and directory + // Group by last write time, length and directory or filename x.LastWriteTimeUtc, x.Length, - x.Directory + Name = byName ? x.Directory : x.Path }, (x, y) => new { @@ -157,21 +169,21 @@ private FileChangeList convertToFileChanges( // given (time, length, path) key x.LastWriteTimeUtc, x.Length, - x.Directory, + x.Name, Creates = y.ToList() }) .ToList(); var removesByTime = removals - .GroupBy(x => new { x.LastWriteTimeUtc, x.Length, x.Directory }, - (x, y) => new { x.LastWriteTimeUtc, x.Length, x.Directory, Removes = y.ToList() }) + .GroupBy(x => new { x.LastWriteTimeUtc, x.Length, Name = byName ? x.Directory : x.Path }, + (x, y) => new { x.LastWriteTimeUtc, x.Length, x.Name, Removes = y.ToList() }) .ToList(); // Join creates and removes by (time, length, directory), then filter to // only those matches which are unambiguous. return createsByTime.Join(removesByTime, - x => new { x.LastWriteTimeUtc, x.Length, x.Directory }, - x => new { x.LastWriteTimeUtc, x.Length, x.Directory }, + x => new { x.LastWriteTimeUtc, x.Length, x.Name }, + x => new { x.LastWriteTimeUtc, x.Length, x.Name }, (x, y) => new { x.Creates, y.Removes } ) .Where(x => x.Creates.Count == 1 && x.Removes.Count == 1) From 850c19e162c3c6a8ca42da577db933714f155fe9 Mon Sep 17 00:00:00 2001 From: Alan Singfield Date: Tue, 11 Feb 2020 08:33:31 +0000 Subject: [PATCH 08/26] Lifting FileSystemEntry through to the collection --- .../FileSystemChangesEnumerator.cs | 58 +++++++++++++------ .../FileSystemState.cs | 18 +++--- .../IAcceptFileSystemEntry.cs | 12 ++++ 3 files changed, 62 insertions(+), 26 deletions(-) create mode 100644 YellowCounter.FileSystemState/IAcceptFileSystemEntry.cs diff --git a/YellowCounter.FileSystemState/FileSystemChangesEnumerator.cs b/YellowCounter.FileSystemState/FileSystemChangesEnumerator.cs index 93444a4..f6557c9 100644 --- a/YellowCounter.FileSystemState/FileSystemChangesEnumerator.cs +++ b/YellowCounter.FileSystemState/FileSystemChangesEnumerator.cs @@ -2,46 +2,68 @@ // Licensed under the MIT license. See LICENSE file in the project root for full license information. using System; +using System.IO; using System.IO.Enumeration; +using System.Runtime.InteropServices; namespace YellowCounter.FileSystemState { - internal class FileSystemChangeEnumerator: FileSystemEnumerator + internal class FileSystemChangeEnumerator : FileSystemEnumerator { - private FileChangeList _changes = new FileChangeList(); - private string _currentDirectory; private FileSystemState _watcher; + private readonly string filter; + private IAcceptFileSystemEntry acceptFileSystemEntry; - public FileSystemChangeEnumerator(FileSystemState watcher) - : base(watcher.Path, watcher.EnumerationOptions) + private static bool ignoreCase; + + static FileSystemChangeEnumerator() { - _watcher = watcher; + ignoreCase = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) + || RuntimeInformation.IsOSPlatform(OSPlatform.OSX); } - public FileChangeList Changes => _changes; + public FileSystemChangeEnumerator( + string filter, + string path, + EnumerationOptions enumerationOptions) + : base(path, enumerationOptions) + { + this.filter = filter; + } - protected override void OnDirectoryFinished(ReadOnlySpan directory) - => _currentDirectory = null; + public void Scan(IAcceptFileSystemEntry acceptFileSystemEntry) + { + this.acceptFileSystemEntry = acceptFileSystemEntry; + + try + { + // Enumerating causes TransformEntry() to be called repeatedly + while(MoveNext()) { } + } + finally + { + this.acceptFileSystemEntry = null; + } + } protected override string TransformEntry(ref FileSystemEntry entry) { - _watcher.DetermineChange(_currentDirectory, ref _changes, ref entry); + acceptFileSystemEntry.Accept(ref entry); return null; } protected override bool ShouldIncludeEntry(ref FileSystemEntry entry) { - // Don't want to convert this to string every time - if (_currentDirectory == null) - _currentDirectory = entry.Directory.ToString(); + if(entry.IsDirectory) + return false; - return _watcher.ShouldIncludeEntry(ref entry); - } + if(FileSystemName.MatchesSimpleExpression(filter, entry.FileName, ignoreCase: ignoreCase)) + return true; - protected override bool ShouldRecurseIntoEntry(ref FileSystemEntry entry) - { - return _watcher.ShouldRecurseIntoEntry(ref entry); + return false; } + + protected override bool ShouldRecurseIntoEntry(ref FileSystemEntry entry) => true; } } diff --git a/YellowCounter.FileSystemState/FileSystemState.cs b/YellowCounter.FileSystemState/FileSystemState.cs index 25a520d..6903c23 100644 --- a/YellowCounter.FileSystemState/FileSystemState.cs +++ b/YellowCounter.FileSystemState/FileSystemState.cs @@ -8,7 +8,7 @@ namespace YellowCounter.FileSystemState { - public class FileSystemState + public class FileSystemState : IAcceptFileSystemEntry { private long _version = default; private PathToFileStateHashtable _state = new PathToFileStateHashtable(); @@ -64,8 +64,9 @@ public FileChangeList GetChanges() private void gatherChanges() { - var enumerator = new FileSystemChangeEnumerator(this); - while(enumerator.MoveNext()) { } + var enumerator = new FileSystemChangeEnumerator(this.Filter, this.Path, this.EnumerationOptions); + + enumerator.Scan(); } private void acceptChanges() @@ -194,15 +195,15 @@ private FileChangeList convertToFileChanges( .ToList(); } - protected internal virtual void DetermineChange(string directory, ref FileChangeList changes, ref FileSystemEntry file) + public void Accept(ref FileSystemEntry fileSystemEntry) { - string path = file.FileName.ToString(); + string path = fileSystemEntry.FileName.ToString(); FileState fs = new FileState(); - fs.Directory = directory; + fs.Directory = fileSystemEntry.Directory.ToString(); fs.Path = path; - fs.LastWriteTimeUtc = file.LastWriteTimeUtc; - fs.Length = file.Length; + fs.LastWriteTimeUtc = fileSystemEntry.LastWriteTimeUtc; + fs.Length = fileSystemEntry.Length; _state.Mark(fs, _version); @@ -220,5 +221,6 @@ protected internal virtual bool ShouldIncludeEntry(ref FileSystemEntry entry) } protected internal virtual bool ShouldRecurseIntoEntry(ref FileSystemEntry entry) => true; + } } diff --git a/YellowCounter.FileSystemState/IAcceptFileSystemEntry.cs b/YellowCounter.FileSystemState/IAcceptFileSystemEntry.cs new file mode 100644 index 0000000..7e395f9 --- /dev/null +++ b/YellowCounter.FileSystemState/IAcceptFileSystemEntry.cs @@ -0,0 +1,12 @@ +using System; +using System.Collections.Generic; +using System.IO.Enumeration; +using System.Text; + +namespace YellowCounter.FileSystemState +{ + public interface IAcceptFileSystemEntry + { + void Accept(ref FileSystemEntry fileSystemEntry); + } +} From befd0805ff615766f011a379a74a46737b38bfd6 Mon Sep 17 00:00:00 2001 From: Alan Singfield Date: Thu, 13 Feb 2020 08:08:39 +0000 Subject: [PATCH 09/26] Hash directory and file names --- .../ReadOnlySpanCharHashing.cs | 23 +++ .../FileSystemChangesEnumerator.cs | 23 +-- .../FileSystemState.cs | 42 +++-- .../PathToFileStateHashtable.cs | 174 +++++++++++++++--- .../ReadOnlySpanExtensions.cs | 28 +++ 5 files changed, 233 insertions(+), 57 deletions(-) create mode 100644 YellowCounter.FileSystemState.Tests/ReadOnlySpanCharHashing.cs create mode 100644 YellowCounter.FileSystemState/ReadOnlySpanExtensions.cs diff --git a/YellowCounter.FileSystemState.Tests/ReadOnlySpanCharHashing.cs b/YellowCounter.FileSystemState.Tests/ReadOnlySpanCharHashing.cs new file mode 100644 index 0000000..c42bd8a --- /dev/null +++ b/YellowCounter.FileSystemState.Tests/ReadOnlySpanCharHashing.cs @@ -0,0 +1,23 @@ +using System; +using System.Collections.Generic; +using System.Text; +using Xunit; +using System.Linq; +using YellowCounter.FileSystemState; + +namespace YellowCounter.FileSystemState.Tests +{ + public class ReadOnlySpanCharHashing + { + [Fact] + public void Test1() + { + var x = new ReadOnlySpan("Hello".ToCharArray()); + var y = new ReadOnlySpan("Hello".ToCharArray()); + + // Note that each run of the program gets a new key so we can't rely + // on a specific fixed value. + Assert.Equal(x.GetHashOfContents(), y.GetHashOfContents()); + } + } +} diff --git a/YellowCounter.FileSystemState/FileSystemChangesEnumerator.cs b/YellowCounter.FileSystemState/FileSystemChangesEnumerator.cs index f6557c9..00de751 100644 --- a/YellowCounter.FileSystemState/FileSystemChangesEnumerator.cs +++ b/YellowCounter.FileSystemState/FileSystemChangesEnumerator.cs @@ -8,7 +8,7 @@ namespace YellowCounter.FileSystemState { - internal class FileSystemChangeEnumerator : FileSystemEnumerator + internal class FileSystemChangeEnumerator : FileSystemEnumerator { private FileSystemState _watcher; private readonly string filter; @@ -25,28 +25,21 @@ static FileSystemChangeEnumerator() public FileSystemChangeEnumerator( string filter, string path, - EnumerationOptions enumerationOptions) + EnumerationOptions enumerationOptions, + IAcceptFileSystemEntry acceptFileSystemEntry) : base(path, enumerationOptions) { this.filter = filter; + this.acceptFileSystemEntry = acceptFileSystemEntry; } - public void Scan(IAcceptFileSystemEntry acceptFileSystemEntry) + public void Scan() { - this.acceptFileSystemEntry = acceptFileSystemEntry; - - try - { - // Enumerating causes TransformEntry() to be called repeatedly - while(MoveNext()) { } - } - finally - { - this.acceptFileSystemEntry = null; - } + // Enumerating causes TransformEntry() to be called repeatedly + while(MoveNext()) { } } - protected override string TransformEntry(ref FileSystemEntry entry) + protected override object TransformEntry(ref FileSystemEntry entry) { acceptFileSystemEntry.Accept(ref entry); diff --git a/YellowCounter.FileSystemState/FileSystemState.cs b/YellowCounter.FileSystemState/FileSystemState.cs index 6903c23..a038871 100644 --- a/YellowCounter.FileSystemState/FileSystemState.cs +++ b/YellowCounter.FileSystemState/FileSystemState.cs @@ -10,7 +10,7 @@ namespace YellowCounter.FileSystemState { public class FileSystemState : IAcceptFileSystemEntry { - private long _version = default; + private long _version = 0L; private PathToFileStateHashtable _state = new PathToFileStateHashtable(); public FileSystemState(string path, string filter = "*", EnumerationOptions options = null) @@ -57,18 +57,37 @@ public FileChangeList GetChanges() // Convert to the output format. var result = convertToFileChanges(creates, changes, removals, renames); - return result; } private void gatherChanges() { - var enumerator = new FileSystemChangeEnumerator(this.Filter, this.Path, this.EnumerationOptions); + var enumerator = new FileSystemChangeEnumerator( + this.Filter, + this.Path, + this.EnumerationOptions, + this); enumerator.Scan(); } + public void Accept(ref FileSystemEntry fileSystemEntry) + { + _state.Mark(ref fileSystemEntry, _version); + + //string path = fileSystemEntry.FileName.ToString(); + + //FileState fs = new FileState(); + //fs.Directory = fileSystemEntry.Directory.ToString(); + //fs.Path = path; + //fs.LastWriteTimeUtc = fileSystemEntry.LastWriteTimeUtc; + //fs.Length = fileSystemEntry.Length; + + //_state.Mark(fs, _version); + + } + private void acceptChanges() { // Clear out the files that have been removed or renamed from our state. @@ -114,7 +133,10 @@ private FileChangeList convertToFileChanges( return result; } - private (IEnumerable creates, IEnumerable changes, IEnumerable removals) getFileChanges() + private ( + IEnumerable creates, + IEnumerable changes, + IEnumerable removals) getFileChanges() { var creates = new List(); var changes = new List(); @@ -195,19 +217,7 @@ private FileChangeList convertToFileChanges( .ToList(); } - public void Accept(ref FileSystemEntry fileSystemEntry) - { - string path = fileSystemEntry.FileName.ToString(); - FileState fs = new FileState(); - fs.Directory = fileSystemEntry.Directory.ToString(); - fs.Path = path; - fs.LastWriteTimeUtc = fileSystemEntry.LastWriteTimeUtc; - fs.Length = fileSystemEntry.Length; - - _state.Mark(fs, _version); - - } protected internal virtual bool ShouldIncludeEntry(ref FileSystemEntry entry) { diff --git a/YellowCounter.FileSystemState/PathToFileStateHashtable.cs b/YellowCounter.FileSystemState/PathToFileStateHashtable.cs index fbafe80..684e02e 100644 --- a/YellowCounter.FileSystemState/PathToFileStateHashtable.cs +++ b/YellowCounter.FileSystemState/PathToFileStateHashtable.cs @@ -2,6 +2,7 @@ using System.Collections.Generic; using System.Runtime.Serialization; using System.Linq; +using System.IO.Enumeration; namespace YellowCounter.FileSystemState { @@ -9,57 +10,178 @@ namespace YellowCounter.FileSystemState internal class PathToFileStateHashtable { HashSet hash; + Dictionary> dict; public PathToFileStateHashtable() { hash = new HashSet(100, new FileStateComparer()); - } - public void Mark(FileState input, long version) + dict = new Dictionary>(); + } + + internal void Mark(ref FileSystemEntry input,long version) { - // Is the file already known to us? - if(hash.TryGetValue(input, out FileState fs)) + // Without allocating strings, calculate a hashcode based on the + // directory and filename. + int hashCode = HashCode.Combine( + input.Directory.GetHashOfContents(), + input.FileName.GetHashOfContents()); + + if(dict.TryGetValue(hashCode, out var fileStates)) { - // Mark that we've seen the file. - fs.LastSeenVersion = version; + bool found = false; - // Has it changed since we last saw it? - if(fs.LastWriteTimeUtc != input.LastWriteTimeUtc - || fs.Length != input.Length) + // Normally there will only be 1 but we could get a hash collision. + foreach(var existing in fileStates) { - // Mark that this version was a change - fs.ChangeVersion = version; + // We've only matched on hashcode so far, so there could be false + // matches in here. Do a proper comparision on filename/directory. - // Update the last write time / file length. - fs.LastWriteTimeUtc = input.LastWriteTimeUtc; - fs.Length = input.Length; + // Use Equals() to match to avoid allocating strings. + if(input.FileName.Equals(existing.Path, StringComparison.Ordinal) + && input.Directory.Equals(existing.Directory, StringComparison.Ordinal)) + { + // Found the file; compare to our existing record so we can + // detect if it has been modified. + markExisting(existing, input, version); + + found = true; + break; + } + } + // Hash collision! Add on the end of the list. + if(!found) + { + fileStates.Add(newFileState(input, version)); } } - else // It's a new file. + else { - // Don't futz the input, clone it - FileState fs2 = input.Clone(); + // Not seen before, create a 1-element list and add to the dictionary. + dict.Add(hashCode, new List() { newFileState(input, version) }); + } + } - // Mark that we've seen it - fs2.LastSeenVersion = version; - fs2.CreateVersion = version; - fs2.ChangeVersion = version; + private void markExisting(FileState fs, FileSystemEntry input, long version) + { + // Mark that we've seen the file. + fs.LastSeenVersion = version; - hash.Add(fs2); + // Has it changed since we last saw it? + if(fs.LastWriteTimeUtc != input.LastWriteTimeUtc + || fs.Length != input.Length) + { + // Mark that this version was a change + fs.ChangeVersion = version; + + // Update the last write time / file length. + fs.LastWriteTimeUtc = input.LastWriteTimeUtc; + fs.Length = input.Length; } } - internal void Sweep(long version) + private FileState newFileState(FileSystemEntry input, long version) { - // Remove the records of files that have been deleted. - hash.RemoveWhere(x => x.LastSeenVersion != version); + var fileState = new FileState(); + + fileState.LastSeenVersion = version; + fileState.CreateVersion = version; + fileState.ChangeVersion = version; + + // Here's where we're allocating the strings. Note we only do this when + // we first see a file, not on each subsequent scan for changes. + fileState.Directory = input.Directory.ToString(); + fileState.Path = input.FileName.ToString(); + + fileState.LastWriteTimeUtc = input.LastWriteTimeUtc; + fileState.Length = input.Length; + + return fileState; } public IEnumerable Read() { - foreach(var x in hash) + foreach(var x in dict.Values.SelectMany(y => y)) + { yield return x; + } + } + + public void Sweep(long version) + { + var toRemove = new List(); + + // Go through every list of filestates in our state dictionary + foreach(var (hash, list) in dict) + { + // Remove any item in the list which we didn't see on the last mark + // phase (every item that is seen gets the LastSeenVersion updated) + list.RemoveAll(x => x.LastSeenVersion != version); + + // In the normal case where there are no hash collisions, this will + // remove the one and only item from the list. We can then remove + // the hash entry from the dictionary. + // If there was a hash collision, the reduced-size list would remain. + if(list.Count == 0) + { + toRemove.Add(hash); + } + } + + // We can't remove the items while iterating so remove here instead. + foreach(var hash in toRemove) + { + dict.Remove(hash); + } } + + //public void Mark(FileState input, long version) + //{ + // // Is the file already known to us? + // if(hash.TryGetValue(input, out FileState fs)) + // { + // // Mark that we've seen the file. + // fs.LastSeenVersion = version; + + // // Has it changed since we last saw it? + // if(fs.LastWriteTimeUtc != input.LastWriteTimeUtc + // || fs.Length != input.Length) + // { + // // Mark that this version was a change + // fs.ChangeVersion = version; + + // // Update the last write time / file length. + // fs.LastWriteTimeUtc = input.LastWriteTimeUtc; + // fs.Length = input.Length; + + // } + // } + // else // It's a new file. + // { + // // Don't futz the input, clone it + // FileState fs2 = input.Clone(); + + // // Mark that we've seen it + // fs2.LastSeenVersion = version; + // fs2.CreateVersion = version; + // fs2.ChangeVersion = version; + + // hash.Add(fs2); + // } + //} + + //internal void Sweep(long version) + //{ + // // Remove the records of files that have been deleted. + // hash.RemoveWhere(x => x.LastSeenVersion != version); + //} + + //public IEnumerable Read() + //{ + // foreach(var x in hash) + // yield return x; + //} + } internal class FileStateComparer : IEqualityComparer diff --git a/YellowCounter.FileSystemState/ReadOnlySpanExtensions.cs b/YellowCounter.FileSystemState/ReadOnlySpanExtensions.cs new file mode 100644 index 0000000..7e72bc8 --- /dev/null +++ b/YellowCounter.FileSystemState/ReadOnlySpanExtensions.cs @@ -0,0 +1,28 @@ +using System; +using System.Collections.Generic; +using System.Text; + +namespace YellowCounter.FileSystemState +{ + public static class ReadOnlySpanExtensions + { + /// + /// Combine hashcodes of each element in the ReadOnlySpan + /// + /// + /// + /// + public static int GetHashOfContents(this ReadOnlySpan span) + { + // struct so allocated on stack + var hash = new HashCode(); + + foreach(var elem in span) + { + hash.Add(elem); + } + + return hash.ToHashCode(); + } + } +} From ef4d24e3cb028625dbe177d79f81b2d2a17cd29f Mon Sep 17 00:00:00 2001 From: Alan Singfield Date: Fri, 14 Feb 2020 07:22:17 +0000 Subject: [PATCH 10/26] Attempt to intern strings --- .../UnitTests.cs | 10 +++ .../FileSystemState.cs | 2 +- .../IStringInternPool.cs | 9 +++ .../PathToFileStateHashtable.cs | 77 +++---------------- .../StringInternPool.cs | 48 ++++++++++++ 5 files changed, 80 insertions(+), 66 deletions(-) create mode 100644 YellowCounter.FileSystemState/IStringInternPool.cs create mode 100644 YellowCounter.FileSystemState/StringInternPool.cs diff --git a/YellowCounter.FileSystemState.Tests/UnitTests.cs b/YellowCounter.FileSystemState.Tests/UnitTests.cs index 936c997..c0b5e49 100644 --- a/YellowCounter.FileSystemState.Tests/UnitTests.cs +++ b/YellowCounter.FileSystemState.Tests/UnitTests.cs @@ -296,4 +296,14 @@ public static void FileSystemWatcher_Recursive() Directory.Delete(currentDir, true); } } + + + [Fact] + public static void FileSystemWatcher_BigDir() + { + string currentDir = @"C:\Users\SpanWork\Documents"; + + FileSystemState watcher = new FileSystemState(currentDir, options: new EnumerationOptions { RecurseSubdirectories = true }); + watcher.LoadState(); + } } diff --git a/YellowCounter.FileSystemState/FileSystemState.cs b/YellowCounter.FileSystemState/FileSystemState.cs index a038871..5cf0a44 100644 --- a/YellowCounter.FileSystemState/FileSystemState.cs +++ b/YellowCounter.FileSystemState/FileSystemState.cs @@ -11,7 +11,7 @@ namespace YellowCounter.FileSystemState public class FileSystemState : IAcceptFileSystemEntry { private long _version = 0L; - private PathToFileStateHashtable _state = new PathToFileStateHashtable(); + private PathToFileStateHashtable _state = new PathToFileStateHashtable(new StringInternPool()); public FileSystemState(string path, string filter = "*", EnumerationOptions options = null) { diff --git a/YellowCounter.FileSystemState/IStringInternPool.cs b/YellowCounter.FileSystemState/IStringInternPool.cs new file mode 100644 index 0000000..1f43dc3 --- /dev/null +++ b/YellowCounter.FileSystemState/IStringInternPool.cs @@ -0,0 +1,9 @@ +using System; + +namespace YellowCounter.FileSystemState +{ + public interface IStringInternPool + { + string Intern(ref ReadOnlySpan span); + } +} \ No newline at end of file diff --git a/YellowCounter.FileSystemState/PathToFileStateHashtable.cs b/YellowCounter.FileSystemState/PathToFileStateHashtable.cs index 684e02e..b07a0ff 100644 --- a/YellowCounter.FileSystemState/PathToFileStateHashtable.cs +++ b/YellowCounter.FileSystemState/PathToFileStateHashtable.cs @@ -9,13 +9,13 @@ namespace YellowCounter.FileSystemState [Serializable] internal class PathToFileStateHashtable { - HashSet hash; Dictionary> dict; - public PathToFileStateHashtable() - { - hash = new HashSet(100, new FileStateComparer()); + private readonly IStringInternPool stringInternPool; + public PathToFileStateHashtable(IStringInternPool stringInternPool) + { dict = new Dictionary>(); + this.stringInternPool = stringInternPool; } internal void Mark(ref FileSystemEntry input,long version) @@ -90,8 +90,14 @@ private FileState newFileState(FileSystemEntry input, long version) // Here's where we're allocating the strings. Note we only do this when // we first see a file, not on each subsequent scan for changes. - fileState.Directory = input.Directory.ToString(); - fileState.Path = input.FileName.ToString(); + var dir = input.Directory; + var fn = input.FileName; + + fileState.Directory = stringInternPool.Intern(ref dir); + fileState.Path = stringInternPool.Intern(ref fn); + + //fileState.Directory = input.Directory.ToString(); + //fileState.Path = input.FileName.ToString(); fileState.LastWriteTimeUtc = input.LastWriteTimeUtc; fileState.Length = input.Length; @@ -134,65 +140,6 @@ public void Sweep(long version) dict.Remove(hash); } } - - //public void Mark(FileState input, long version) - //{ - // // Is the file already known to us? - // if(hash.TryGetValue(input, out FileState fs)) - // { - // // Mark that we've seen the file. - // fs.LastSeenVersion = version; - - // // Has it changed since we last saw it? - // if(fs.LastWriteTimeUtc != input.LastWriteTimeUtc - // || fs.Length != input.Length) - // { - // // Mark that this version was a change - // fs.ChangeVersion = version; - - // // Update the last write time / file length. - // fs.LastWriteTimeUtc = input.LastWriteTimeUtc; - // fs.Length = input.Length; - - // } - // } - // else // It's a new file. - // { - // // Don't futz the input, clone it - // FileState fs2 = input.Clone(); - - // // Mark that we've seen it - // fs2.LastSeenVersion = version; - // fs2.CreateVersion = version; - // fs2.ChangeVersion = version; - - // hash.Add(fs2); - // } - //} - - //internal void Sweep(long version) - //{ - // // Remove the records of files that have been deleted. - // hash.RemoveWhere(x => x.LastSeenVersion != version); - //} - - //public IEnumerable Read() - //{ - // foreach(var x in hash) - // yield return x; - //} - } - internal class FileStateComparer : IEqualityComparer - { - // Equivalent if directory and path match. - public bool Equals(FileState x, FileState y) - { - return x.Directory == y.Directory && x.Path == y.Path; - } - - public int GetHashCode(FileState obj) => - HashCode.Combine(obj.Directory.GetHashCode() ^ obj.Path.GetHashCode()); - } } diff --git a/YellowCounter.FileSystemState/StringInternPool.cs b/YellowCounter.FileSystemState/StringInternPool.cs new file mode 100644 index 0000000..15c9483 --- /dev/null +++ b/YellowCounter.FileSystemState/StringInternPool.cs @@ -0,0 +1,48 @@ +using System; +using System.Collections.Generic; +using System.Text; + +namespace YellowCounter.FileSystemState +{ + /// + /// Not thread-safe string interning. + /// Probably needs a garbage collector at some point? + /// + public class StringInternPool : IStringInternPool + { + public Dictionary> dict = new Dictionary>(); + + public string Intern(ref ReadOnlySpan span) + { + int hash = span.GetHashOfContents(); + + if(dict.TryGetValue(hash, out var strings)) + { + foreach(var s in strings) + { + // Interned case - found existing string which matches. + if(span.Equals(s, StringComparison.Ordinal)) + return s; + } + + // Hash collision + string newString = span.ToString(); + strings.Add(newString); + + return newString; + } + else + { + // Add new item + string newString = span.ToString(); + + var newList = new List(); + newList.Add(newString); + + dict.Add(hash, newList); + + return newString; + } + } + } +} From 41509910abab8d0bf0e1e37cd5d367c331ddcb40 Mon Sep 17 00:00:00 2001 From: Alan Singfield Date: Fri, 28 Feb 2020 07:43:06 +0000 Subject: [PATCH 11/26] Reduce string length by cutting off root folder --- .../UnitTests.cs | 8 ++-- ...YellowCounter.FileSystemState.Tests.csproj | 2 + YellowCounter.FileSystemState/FileState.cs | 4 +- .../FileSystemChangesEnumerator.cs | 13 ++++- .../FileSystemState.cs | 47 ++++++++++--------- .../PathToFileStateHashtable.cs | 28 ++++++----- 6 files changed, 64 insertions(+), 38 deletions(-) diff --git a/YellowCounter.FileSystemState.Tests/UnitTests.cs b/YellowCounter.FileSystemState.Tests/UnitTests.cs index c0b5e49..cfe05db 100644 --- a/YellowCounter.FileSystemState.Tests/UnitTests.cs +++ b/YellowCounter.FileSystemState.Tests/UnitTests.cs @@ -11,7 +11,7 @@ public static void FileSystemWatcher_ctor_Defaults() string path = Environment.CurrentDirectory; var watcher = new FileSystemState(path); - Assert.Equal(path, watcher.Path); + Assert.Equal(path, watcher.RootDir); Assert.Equal("*", watcher.Filter); Assert.NotNull(watcher.EnumerationOptions); } @@ -23,7 +23,7 @@ public static void FileSystemWatcher_ctor_OptionalParams() const string filter = "*.csv"; var watcher = new FileSystemState(currentDir, filter, new EnumerationOptions { RecurseSubdirectories = true }); - Assert.Equal(currentDir, watcher.Path); + Assert.Equal(currentDir, watcher.RootDir); Assert.Equal(filter, watcher.Filter); Assert.True(watcher.EnumerationOptions.RecurseSubdirectories); } @@ -32,7 +32,7 @@ public static void FileSystemWatcher_ctor_OptionalParams() public static void FileSystemWatcher_ctor_Null() { // Not valid - Assert.Throws("path", () => new FileSystemState(null)); + Assert.Throws("rootDir", () => new FileSystemState(null)); Assert.Throws("filter", () => new FileSystemState(Environment.CurrentDirectory, null)); // Valid @@ -305,5 +305,7 @@ public static void FileSystemWatcher_BigDir() FileSystemState watcher = new FileSystemState(currentDir, options: new EnumerationOptions { RecurseSubdirectories = true }); watcher.LoadState(); + + var q = watcher.GetChanges(); } } diff --git a/YellowCounter.FileSystemState.Tests/YellowCounter.FileSystemState.Tests.csproj b/YellowCounter.FileSystemState.Tests/YellowCounter.FileSystemState.Tests.csproj index d42f712..a544178 100644 --- a/YellowCounter.FileSystemState.Tests/YellowCounter.FileSystemState.Tests.csproj +++ b/YellowCounter.FileSystemState.Tests/YellowCounter.FileSystemState.Tests.csproj @@ -8,6 +8,8 @@ + + diff --git a/YellowCounter.FileSystemState/FileState.cs b/YellowCounter.FileSystemState/FileState.cs index 79ed4c5..9a99185 100644 --- a/YellowCounter.FileSystemState/FileState.cs +++ b/YellowCounter.FileSystemState/FileState.cs @@ -16,8 +16,8 @@ internal class FileState [NonSerialized] public long ChangeVersion; - public string Directory; - public string Path; + public string RelativeDir; + public string FileName; public DateTimeOffset LastWriteTimeUtc; public long Length; diff --git a/YellowCounter.FileSystemState/FileSystemChangesEnumerator.cs b/YellowCounter.FileSystemState/FileSystemChangesEnumerator.cs index 00de751..03ad864 100644 --- a/YellowCounter.FileSystemState/FileSystemChangesEnumerator.cs +++ b/YellowCounter.FileSystemState/FileSystemChangesEnumerator.cs @@ -2,6 +2,7 @@ // Licensed under the MIT license. See LICENSE file in the project root for full license information. using System; +using System.Collections.Generic; using System.IO; using System.IO.Enumeration; using System.Runtime.InteropServices; @@ -10,9 +11,9 @@ namespace YellowCounter.FileSystemState { internal class FileSystemChangeEnumerator : FileSystemEnumerator { - private FileSystemState _watcher; private readonly string filter; private IAcceptFileSystemEntry acceptFileSystemEntry; + //private string currentDirectory; private static bool ignoreCase; @@ -39,6 +40,13 @@ public void Scan() while(MoveNext()) { } } + protected override void OnDirectoryFinished(ReadOnlySpan directory) + { + //currentDirectory = null; + + base.OnDirectoryFinished(directory); + } + protected override object TransformEntry(ref FileSystemEntry entry) { acceptFileSystemEntry.Accept(ref entry); @@ -48,6 +56,9 @@ protected override object TransformEntry(ref FileSystemEntry entry) protected override bool ShouldIncludeEntry(ref FileSystemEntry entry) { + //if(currentDirectory == null) + // currentDirectory = entry.Directory.ToString(); + if(entry.IsDirectory) return false; diff --git a/YellowCounter.FileSystemState/FileSystemState.cs b/YellowCounter.FileSystemState/FileSystemState.cs index 5cf0a44..2d6ca84 100644 --- a/YellowCounter.FileSystemState/FileSystemState.cs +++ b/YellowCounter.FileSystemState/FileSystemState.cs @@ -11,38 +11,43 @@ namespace YellowCounter.FileSystemState public class FileSystemState : IAcceptFileSystemEntry { private long _version = 0L; - private PathToFileStateHashtable _state = new PathToFileStateHashtable(new StringInternPool()); + private PathToFileStateHashtable _state; - public FileSystemState(string path, string filter = "*", EnumerationOptions options = null) + public FileSystemState(string rootDir, string filter = "*", EnumerationOptions options = null) { - Path = path ?? throw new ArgumentNullException(nameof(path)); - Filter = filter ?? throw new ArgumentNullException(nameof(filter)); + this.RootDir = rootDir ?? throw new ArgumentNullException(nameof(rootDir)); + this.Filter = filter ?? throw new ArgumentNullException(nameof(filter)); - if (!Directory.Exists(path)) + if (!Directory.Exists(rootDir)) throw new DirectoryNotFoundException(); EnumerationOptions = options ?? new EnumerationOptions(); + + _state = new PathToFileStateHashtable(new StringInternPool(), this.RootDir.Length); } - public string Path { get; set; } + public string RootDir { get; set; } public string Filter { get; set; } public EnumerationOptions EnumerationOptions { get; set; } public void LoadState() { - GetChanges(); + // Set initial baseline by reading current directory state without returning + // every file as a change. + gatherChanges(); + acceptChanges(); } public void LoadState(Stream stream) { - BinaryFormatter serializer = new BinaryFormatter(); - _state = (PathToFileStateHashtable)serializer.Deserialize(stream); + //BinaryFormatter serializer = new BinaryFormatter(); + //_state = (PathToFileStateHashtable)serializer.Deserialize(stream); } public void SaveState(Stream stream) { - BinaryFormatter serializer = new BinaryFormatter(); - serializer.Serialize(stream, _state); + //BinaryFormatter serializer = new BinaryFormatter(); + //serializer.Serialize(stream, _state); } // This function walks all watched files, collects changes, and updates state @@ -65,7 +70,7 @@ private void gatherChanges() { var enumerator = new FileSystemChangeEnumerator( this.Filter, - this.Path, + this.RootDir, this.EnumerationOptions, this); @@ -103,24 +108,24 @@ private FileChangeList convertToFileChanges( { var createResults = creates .Except(renames.Select(x => x.NewFile)) - .Select(x => new FileChange(x.Directory, x.Path, WatcherChangeTypes.Created)) + .Select(x => new FileChange(this.RootDir + x.RelativeDir, x.FileName, WatcherChangeTypes.Created)) ; var changeResults = changes - .Select(x => new FileChange(x.Directory, x.Path, WatcherChangeTypes.Changed)) + .Select(x => new FileChange(this.RootDir + x.RelativeDir, x.FileName, WatcherChangeTypes.Changed)) ; var removeResults = removals .Except(renames.Select(x => x.OldFile)) - .Select(x => new FileChange(x.Directory, x.Path, WatcherChangeTypes.Deleted)) + .Select(x => new FileChange(this.RootDir + x.RelativeDir, x.FileName, WatcherChangeTypes.Deleted)) ; var renameResults = renames.Select(x => new FileChange( - x.NewFile.Directory, - x.NewFile.Path, + this.RootDir + x.NewFile.RelativeDir, + x.NewFile.FileName, WatcherChangeTypes.Renamed, - x.OldFile.Directory, - x.OldFile.Path)) + this.RootDir + x.OldFile.RelativeDir, + x.OldFile.FileName)) ; var result = new FileChangeList(); @@ -184,7 +189,7 @@ private FileChangeList convertToFileChanges( // Group by last write time, length and directory or filename x.LastWriteTimeUtc, x.Length, - Name = byName ? x.Directory : x.Path + Name = byName ? x.RelativeDir : x.FileName }, (x, y) => new { @@ -198,7 +203,7 @@ private FileChangeList convertToFileChanges( .ToList(); var removesByTime = removals - .GroupBy(x => new { x.LastWriteTimeUtc, x.Length, Name = byName ? x.Directory : x.Path }, + .GroupBy(x => new { x.LastWriteTimeUtc, x.Length, Name = byName ? x.RelativeDir : x.FileName }, (x, y) => new { x.LastWriteTimeUtc, x.Length, x.Name, Removes = y.ToList() }) .ToList(); diff --git a/YellowCounter.FileSystemState/PathToFileStateHashtable.cs b/YellowCounter.FileSystemState/PathToFileStateHashtable.cs index b07a0ff..a3d1a38 100644 --- a/YellowCounter.FileSystemState/PathToFileStateHashtable.cs +++ b/YellowCounter.FileSystemState/PathToFileStateHashtable.cs @@ -11,19 +11,26 @@ internal class PathToFileStateHashtable { Dictionary> dict; private readonly IStringInternPool stringInternPool; + private readonly int truncate; - public PathToFileStateHashtable(IStringInternPool stringInternPool) + public PathToFileStateHashtable(IStringInternPool stringInternPool, int truncate = 0) { dict = new Dictionary>(); this.stringInternPool = stringInternPool; + this.truncate = truncate; } - + internal void Mark(ref FileSystemEntry input,long version) { + // If we are scanning folder c:\verylongdirectoryname\ there is no need to store + // the same text c:\verylongdirectoryname\ over and over again so we remove the + // root from the directory name leaving the relative path + var relativeDir = input.Directory.Slice(truncate); + // Without allocating strings, calculate a hashcode based on the // directory and filename. int hashCode = HashCode.Combine( - input.Directory.GetHashOfContents(), + relativeDir.GetHashOfContents(), input.FileName.GetHashOfContents()); if(dict.TryGetValue(hashCode, out var fileStates)) @@ -37,8 +44,8 @@ internal void Mark(ref FileSystemEntry input,long version) // matches in here. Do a proper comparision on filename/directory. // Use Equals() to match to avoid allocating strings. - if(input.FileName.Equals(existing.Path, StringComparison.Ordinal) - && input.Directory.Equals(existing.Directory, StringComparison.Ordinal)) + if(input.FileName.Equals(existing.FileName, StringComparison.Ordinal) + && relativeDir.Equals(existing.RelativeDir, StringComparison.Ordinal)) { // Found the file; compare to our existing record so we can // detect if it has been modified. @@ -52,13 +59,13 @@ internal void Mark(ref FileSystemEntry input,long version) // Hash collision! Add on the end of the list. if(!found) { - fileStates.Add(newFileState(input, version)); + fileStates.Add(newFileState(input, ref relativeDir, version)); } } else { // Not seen before, create a 1-element list and add to the dictionary. - dict.Add(hashCode, new List() { newFileState(input, version) }); + dict.Add(hashCode, new List() { newFileState(input, ref relativeDir, version) }); } } @@ -80,7 +87,7 @@ private void markExisting(FileState fs, FileSystemEntry input, long version) } } - private FileState newFileState(FileSystemEntry input, long version) + private FileState newFileState(FileSystemEntry input, ref ReadOnlySpan relativeDir, long version) { var fileState = new FileState(); @@ -90,11 +97,10 @@ private FileState newFileState(FileSystemEntry input, long version) // Here's where we're allocating the strings. Note we only do this when // we first see a file, not on each subsequent scan for changes. - var dir = input.Directory; var fn = input.FileName; - fileState.Directory = stringInternPool.Intern(ref dir); - fileState.Path = stringInternPool.Intern(ref fn); + fileState.RelativeDir = stringInternPool.Intern(ref relativeDir); + fileState.FileName = stringInternPool.Intern(ref fn); //fileState.Directory = input.Directory.ToString(); //fileState.Path = input.FileName.ToString(); From dabe8428a66dfbfa7acb3ca47fd51387f6098887 Mon Sep 17 00:00:00 2001 From: Alan Singfield Date: Fri, 28 Feb 2020 17:31:05 +0000 Subject: [PATCH 12/26] PathRedux first trial version --- PathReduxTests/PathRedux/PathStorageTests.cs | 36 ++++++++ PathReduxTests/PathReduxTests.csproj | 22 +++++ ...YellowCounter.FileSystemState.Tests.csproj | 1 + YellowCounter.FileSystemState.sln | 12 ++- .../PathRedux/PathStorage.cs | 86 +++++++++++++++++++ 5 files changed, 154 insertions(+), 3 deletions(-) create mode 100644 PathReduxTests/PathRedux/PathStorageTests.cs create mode 100644 PathReduxTests/PathReduxTests.csproj create mode 100644 YellowCounter.FileSystemState/PathRedux/PathStorage.cs diff --git a/PathReduxTests/PathRedux/PathStorageTests.cs b/PathReduxTests/PathRedux/PathStorageTests.cs new file mode 100644 index 0000000..b413bdd --- /dev/null +++ b/PathReduxTests/PathRedux/PathStorageTests.cs @@ -0,0 +1,36 @@ +using Microsoft.VisualStudio.TestTools.UnitTesting; +using System; +using System.Collections.Generic; +using System.Text; +using YellowCounter.FileSystemState.PathRedux; +using Shouldly; + +namespace YellowCounter.FileSystemState.Tests.PathRedux +{ + [TestClass] + public class PathStorageTests + { + [TestMethod] + public void PathStorage1() + { + var pathStorage = new PathStorage(); + + int idx1 = pathStorage.Store("Hello"); + int idx2 = pathStorage.Store("World"); + + pathStorage.Retrieve(idx1).ToString().ShouldBe("Hello"); + pathStorage.Retrieve(idx2).ToString().ShouldBe("World"); + } + + [TestMethod] + public void PathStorage2() + { + var pathStorage = new PathStorage(); + + int idx1 = pathStorage.Store("Hello"); + int idx2 = pathStorage.Store("World"); + + pathStorage.Retrieve(new[] { idx1, idx2 }).ToString().ShouldBe("HelloWorld"); + } + } +} diff --git a/PathReduxTests/PathReduxTests.csproj b/PathReduxTests/PathReduxTests.csproj new file mode 100644 index 0000000..15913c5 --- /dev/null +++ b/PathReduxTests/PathReduxTests.csproj @@ -0,0 +1,22 @@ + + + + netcoreapp3.1 + + false + + + + + + + + + + + + + + + + diff --git a/YellowCounter.FileSystemState.Tests/YellowCounter.FileSystemState.Tests.csproj b/YellowCounter.FileSystemState.Tests/YellowCounter.FileSystemState.Tests.csproj index a544178..757f763 100644 --- a/YellowCounter.FileSystemState.Tests/YellowCounter.FileSystemState.Tests.csproj +++ b/YellowCounter.FileSystemState.Tests/YellowCounter.FileSystemState.Tests.csproj @@ -8,6 +8,7 @@ + diff --git a/YellowCounter.FileSystemState.sln b/YellowCounter.FileSystemState.sln index a4f00e3..a038a13 100644 --- a/YellowCounter.FileSystemState.sln +++ b/YellowCounter.FileSystemState.sln @@ -1,12 +1,14 @@  Microsoft Visual Studio Solution File, Format Version 12.00 -# Visual Studio 15 -VisualStudioVersion = 15.0.27703.2026 +# Visual Studio Version 16 +VisualStudioVersion = 16.0.29728.190 MinimumVisualStudioVersion = 10.0.40219.1 -Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "YellowCounter.FileSystemState", "YellowCounter.FileSystemState\YellowCounter.FileSystemState.csproj", "{8C085D5D-AC6F-48D9-A547-B6C92E18D2FB}" +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "YellowCounter.FileSystemState", "YellowCounter.FileSystemState\YellowCounter.FileSystemState.csproj", "{8C085D5D-AC6F-48D9-A547-B6C92E18D2FB}" EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "YellowCounter.FileSystemState.Tests", "YellowCounter.FileSystemState.Tests\YellowCounter.FileSystemState.Tests.csproj", "{EE22E810-4ADC-4399-9C72-B2B70831EB05}" EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "PathReduxTests", "PathReduxTests\PathReduxTests.csproj", "{33F0288C-B927-4145-84E1-321BD5AD8996}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -21,6 +23,10 @@ Global {EE22E810-4ADC-4399-9C72-B2B70831EB05}.Debug|Any CPU.Build.0 = Debug|Any CPU {EE22E810-4ADC-4399-9C72-B2B70831EB05}.Release|Any CPU.ActiveCfg = Release|Any CPU {EE22E810-4ADC-4399-9C72-B2B70831EB05}.Release|Any CPU.Build.0 = Release|Any CPU + {33F0288C-B927-4145-84E1-321BD5AD8996}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {33F0288C-B927-4145-84E1-321BD5AD8996}.Debug|Any CPU.Build.0 = Debug|Any CPU + {33F0288C-B927-4145-84E1-321BD5AD8996}.Release|Any CPU.ActiveCfg = Release|Any CPU + {33F0288C-B927-4145-84E1-321BD5AD8996}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/YellowCounter.FileSystemState/PathRedux/PathStorage.cs b/YellowCounter.FileSystemState/PathRedux/PathStorage.cs new file mode 100644 index 0000000..54adfe8 --- /dev/null +++ b/YellowCounter.FileSystemState/PathRedux/PathStorage.cs @@ -0,0 +1,86 @@ +using System; +using System.Buffers; +using System.Collections.Generic; +using System.Text; + +namespace YellowCounter.FileSystemState.PathRedux +{ + public class PathStorage + { + private Memory buffer = new char[100000]; + private int pos; + + public void Init() + { + } + + public int Store(ReadOnlySpan input) + { + var bufSpan = buffer.Span; + + var result = pos; + + input.CopyTo(bufSpan.Slice(pos, input.Length)); + pos += input.Length; + + bufSpan[pos] = '\0'; + pos++; + + return result; + } + + public ReadOnlySpan Retrieve(int index) + { + var bufSpan = buffer.Span; + + var begin = bufSpan.Slice(index); + + int len = begin.IndexOf('\0'); + + return begin.Slice(0, len); + } + + public ReadOnlySequence Retrieve(IEnumerable indices) + { + Segment root = null; + Segment current = null; + + int len = 0; + + foreach(var idx in indices) + { + var tail = buffer.Slice(idx); + len = tail.Span.IndexOf('\0'); + var text = tail.Slice(0, len); + + if(root == null) + { + root = new Segment(text); + current = root; + } + else + { + current = current.Add(text); + } + } + + return new ReadOnlySequence(root, 0, current, len); + } + + class Segment : ReadOnlySequenceSegment + { + public Segment(ReadOnlyMemory memory) + => Memory = memory; + public Segment Add(ReadOnlyMemory mem) + { + var segment = new Segment(mem); + segment.RunningIndex = RunningIndex + + Memory.Length; + Next = segment; + return segment; + } + } + + } + +} From 14616e686c08a94e204d2c80713bc76bd5d88702 Mon Sep 17 00:00:00 2001 From: Alan Singfield Date: Fri, 6 Mar 2020 12:54:22 +0000 Subject: [PATCH 13/26] Attempt to do chained lookup --- .../PathRedux/ChainedLookupTests.cs | 95 +++++++++ PathReduxTests/PathRedux/CharBufferTests.cs | 79 ++++++++ .../PathRedux/HashedCharBufferTests.cs | 29 +++ PathReduxTests/PathRedux/PathStorageTests.cs | 36 ---- .../PathRedux/ChainedLookup.cs | 75 ++++++++ .../PathRedux/CharBuffer.cs | 180 ++++++++++++++++++ .../PathRedux/HashedCharBuffer.cs | 108 +++++++++++ .../PathRedux/PathStorage.cs | 86 --------- 8 files changed, 566 insertions(+), 122 deletions(-) create mode 100644 PathReduxTests/PathRedux/ChainedLookupTests.cs create mode 100644 PathReduxTests/PathRedux/CharBufferTests.cs create mode 100644 PathReduxTests/PathRedux/HashedCharBufferTests.cs delete mode 100644 PathReduxTests/PathRedux/PathStorageTests.cs create mode 100644 YellowCounter.FileSystemState/PathRedux/ChainedLookup.cs create mode 100644 YellowCounter.FileSystemState/PathRedux/CharBuffer.cs create mode 100644 YellowCounter.FileSystemState/PathRedux/HashedCharBuffer.cs delete mode 100644 YellowCounter.FileSystemState/PathRedux/PathStorage.cs diff --git a/PathReduxTests/PathRedux/ChainedLookupTests.cs b/PathReduxTests/PathRedux/ChainedLookupTests.cs new file mode 100644 index 0000000..a089376 --- /dev/null +++ b/PathReduxTests/PathRedux/ChainedLookupTests.cs @@ -0,0 +1,95 @@ +using Microsoft.VisualStudio.TestTools.UnitTesting; +using System; +using System.Collections.Generic; +using System.Text; +using YellowCounter.FileSystemState.PathRedux; +using Shouldly; + +namespace PathReduxTests.PathRedux +{ + [TestClass] + public class ChainedLookupTests + { + [TestMethod] + public void ChainedLookupStoreRetrieve() + { + var m = new ChainedLookup(2, 2); + + m.Store(0, 123456).ShouldBe(true); + m.Store(0, 765432).ShouldBe(true); + + var result = m.Retrieve(0); + + result.ToArray().ShouldBe(new[] { 123456, 765432 }); + } + + [TestMethod] + public void ChainedLookupStoreFlowpast() + { + var m = new ChainedLookup(2, 2); + + m.Store(1, 123456).ShouldBe(true); + m.Store(1, 765432).ShouldBe(false); + + var result = m.Retrieve(1); + + result.ToArray().ShouldBe(new[] { 123456 }); + } + + [TestMethod] + public void ChainedLookupStoreZero() + { + var m = new ChainedLookup(2, 2); + + // It can store a zero + m.Store(0, 0).ShouldBe(true); + + var result = m.Retrieve(0); + result.ToArray().ShouldBe(new[] { 0 }); + } + + [TestMethod] + public void ChainedLookupChainLimit() + { + var m = new ChainedLookup(8, 2); + + m.Store(0, 100).ShouldBe(true); + m.Store(0, 200).ShouldBe(true); + m.Store(0, 300).ShouldBe(false); + + var result = m.Retrieve(0); + + result.ToArray().ShouldBe(new[] { 100, 200 }); + } + + [TestMethod] + public void ChainedLookupOverlap() + { + var m = new ChainedLookup(8, 8); + + // The values are going to overlap. + m.Store(0, 100).ShouldBe(true); + m.Store(1, 200).ShouldBe(true); + m.Store(0, 300).ShouldBe(true); + + var result = m.Retrieve(0); + + result.ToArray().ShouldBe(new[] { 100, 200, 300 }); + } + + [TestMethod] + public void ChainedLookupOverlapLimited() + { + var m = new ChainedLookup(8, 2); + + // If we set the max chain to a lower value then the overlap + // won't occur. + m.Store(0, 100).ShouldBe(true); + m.Store(1, 200).ShouldBe(true); + m.Store(0, 300).ShouldBe(false); + + m.Retrieve(0).ToArray().ShouldBe(new[] { 100, 200 }); + m.Retrieve(1).ToArray().ShouldBe(new[] { 200 }); + } + } +} diff --git a/PathReduxTests/PathRedux/CharBufferTests.cs b/PathReduxTests/PathRedux/CharBufferTests.cs new file mode 100644 index 0000000..ca6f8e3 --- /dev/null +++ b/PathReduxTests/PathRedux/CharBufferTests.cs @@ -0,0 +1,79 @@ +using Microsoft.VisualStudio.TestTools.UnitTesting; +using System; +using System.Collections.Generic; +using System.Text; +using YellowCounter.FileSystemState.PathRedux; +using Shouldly; + +namespace YellowCounter.FileSystemState.Tests.PathRedux +{ + [TestClass] + public class CharBufferTests + { + [TestMethod] + public void CharBuffer1() + { + var charBuffer = new CharBuffer(100); + + int idx1 = charBuffer.Store("Hello"); + int idx2 = charBuffer.Store("World"); + + charBuffer.Retrieve(idx1).ToString().ShouldBe("Hello"); + charBuffer.Retrieve(idx2).ToString().ShouldBe("World"); + } + + [TestMethod] + public void CharBuffer2() + { + var charBuffer = new CharBuffer(100); + + int idx1 = charBuffer.Store("Hello"); + int idx2 = charBuffer.Store("World"); + + charBuffer.Retrieve(new[] { idx1, idx2 }).ToString().ShouldBe("HelloWorld"); + } + + [TestMethod] + public void CharBufferRealloc() + { + var charBuffer = new CharBuffer(13); + + int idx1 = charBuffer.Store("Hello"); + int idx2 = charBuffer.Store("World"); + + var helloSpan = charBuffer.Retrieve(idx1); + + var worldSpan = charBuffer.Retrieve(idx2); + + charBuffer.Resize(25); + + // These spans are still pointing at the old buffer - how does it avoid + // freeing up the memory? + helloSpan.ToString().ShouldBe("Hello"); + worldSpan.ToString().ShouldBe("World"); + + var hello2Span = charBuffer.Retrieve(idx1); + var world2Span = charBuffer.Retrieve(idx2); + + hello2Span.ToString().ShouldBe("Hello"); + world2Span.ToString().ShouldBe("World"); + } + + [TestMethod] + public void CharBufferEnumerate() + { + var charBuffer = new CharBuffer(100); + + int idx1 = charBuffer.Store("Hello"); + int idx2 = charBuffer.Store("World"); + + var results = new List(); + foreach(var item in charBuffer) + { + results.Add(item.Span.ToString()); + } + + results.ShouldBe(new[] { "Hello", "World" }); + } + } +} diff --git a/PathReduxTests/PathRedux/HashedCharBufferTests.cs b/PathReduxTests/PathRedux/HashedCharBufferTests.cs new file mode 100644 index 0000000..51a3a07 --- /dev/null +++ b/PathReduxTests/PathRedux/HashedCharBufferTests.cs @@ -0,0 +1,29 @@ +using Microsoft.VisualStudio.TestTools.UnitTesting; +using Shouldly; +using System; +using System.Collections.Generic; +using System.Text; +using YellowCounter.FileSystemState.PathRedux; + +namespace PathReduxTests.PathRedux +{ + [TestClass] + public class HashedCharBufferTests + { + + [TestMethod] + public void HashedCharBufferAddAndRetrieve() + { + var buf = new HashedCharBuffer(20, 16, 3); + + buf.Store("Hello"); + buf.Store("World"); + + buf.Find("Hello").ShouldBe(0); + buf.Find("World").ShouldBe(6); + + buf.Retrieve(0).ToString().ShouldBe("Hello"); + buf.Retrieve(6).ToString().ShouldBe("World"); + } + } +} diff --git a/PathReduxTests/PathRedux/PathStorageTests.cs b/PathReduxTests/PathRedux/PathStorageTests.cs deleted file mode 100644 index b413bdd..0000000 --- a/PathReduxTests/PathRedux/PathStorageTests.cs +++ /dev/null @@ -1,36 +0,0 @@ -using Microsoft.VisualStudio.TestTools.UnitTesting; -using System; -using System.Collections.Generic; -using System.Text; -using YellowCounter.FileSystemState.PathRedux; -using Shouldly; - -namespace YellowCounter.FileSystemState.Tests.PathRedux -{ - [TestClass] - public class PathStorageTests - { - [TestMethod] - public void PathStorage1() - { - var pathStorage = new PathStorage(); - - int idx1 = pathStorage.Store("Hello"); - int idx2 = pathStorage.Store("World"); - - pathStorage.Retrieve(idx1).ToString().ShouldBe("Hello"); - pathStorage.Retrieve(idx2).ToString().ShouldBe("World"); - } - - [TestMethod] - public void PathStorage2() - { - var pathStorage = new PathStorage(); - - int idx1 = pathStorage.Store("Hello"); - int idx2 = pathStorage.Store("World"); - - pathStorage.Retrieve(new[] { idx1, idx2 }).ToString().ShouldBe("HelloWorld"); - } - } -} diff --git a/YellowCounter.FileSystemState/PathRedux/ChainedLookup.cs b/YellowCounter.FileSystemState/PathRedux/ChainedLookup.cs new file mode 100644 index 0000000..83b623f --- /dev/null +++ b/YellowCounter.FileSystemState/PathRedux/ChainedLookup.cs @@ -0,0 +1,75 @@ +using System; +using System.Collections; +using System.Collections.Generic; +using System.Collections.Specialized; +using System.Text; + +namespace YellowCounter.FileSystemState.PathRedux +{ + public class ChainedLookup + { + private Memory mem; + private readonly int capacity; + private readonly int maxChain; + private BitArray usage; + + public ChainedLookup(int capacity, int maxChain) + { + mem = new int[capacity]; + usage = new BitArray(capacity); + this.capacity = capacity; + this.maxChain = maxChain; + } + + public int Capacity => mem.Length; + + public bool Store(int hash, int value) + { + int key = keyFromHash(hash); + + var span = mem.Span; + int chainLen = 0; + + // Look for an empty slot in our buffer + for(int i = key; i < capacity; i++) + { + if(!usage[i]) + { + span[i] = value; + usage[i] = true; + + return true; + } + + chainLen++; + + // Don't build up too long a chain of values - we'll build a new + // buffer instead. + if(chainLen >= maxChain) + return false; + } + + return false; + } + + private int keyFromHash(int hash) => (int)unchecked((uint)hash % (uint)Capacity); + + public ReadOnlySpan Retrieve(int hash) + { + int key = keyFromHash(hash); + + var span = mem.Span; + int chainLen = 0; + + for(int i = key; i < capacity && chainLen <= maxChain; i++) + { + if(!usage[i]) + break; + + chainLen++; + } + + return mem.Span.Slice(key, chainLen); + } + } +} diff --git a/YellowCounter.FileSystemState/PathRedux/CharBuffer.cs b/YellowCounter.FileSystemState/PathRedux/CharBuffer.cs new file mode 100644 index 0000000..43e4dd9 --- /dev/null +++ b/YellowCounter.FileSystemState/PathRedux/CharBuffer.cs @@ -0,0 +1,180 @@ +using System; +using System.Buffers; +using System.Collections; +using System.Collections.Generic; +using System.Text; + +namespace YellowCounter.FileSystemState.PathRedux +{ + public class CharBuffer + { + + private Memory buffer; + private int pos; + + public CharBuffer(int capacity) + { + buffer = new char[capacity]; + } + + public int Capacity => buffer.Length; + + public void Resize(int capacity) + { + if(capacity < pos) + throw new Exception("Cannot resize because data truncation would occur"); + + var newBuffer = new char[capacity]; + + this.buffer.CopyTo(newBuffer); + + this.buffer = newBuffer; + } + + public int Store(ReadOnlySpan input) + { + if(input.Length + pos + 1 >= buffer.Length) + return -1; + + var bufSpan = buffer.Span; + + // Return current buffer start position as the result. + var result = pos; + + // Write the text into our buffer + input.CopyTo(bufSpan.Slice(pos, input.Length)); + pos += input.Length; + + // Null terminate + bufSpan[pos] = '\0'; + pos++; + + return result; + } + + public int Match(ReadOnlySpan arg, ReadOnlySpan indices) + { + var bufSpan = buffer.Span; + + foreach(int idx in indices) + { + if(bufSpan.Slice(idx, arg.Length).SequenceEqual(arg)) + { + // Check for null terminator so we don't match to a + // longer string. + if(bufSpan[idx + arg.Length] == '\0') + return idx; + } + } + + // -1 for not found. + return -1; + } + + public ReadOnlySpan Retrieve(int index) + { + var bufSpan = buffer.Span; + + var begin = bufSpan.Slice(index); + + int len = begin.IndexOf('\0'); + + return begin.Slice(0, len); + } + + + + public Enumerator GetEnumerator() + { + var bufSpan = buffer.Span; + + return new Enumerator(bufSpan); + } + + public ref struct Enumerator + { + private int pos; + private int len; + ReadOnlySpan bufSpan; + Item current; + + public Enumerator(ReadOnlySpan bufSpan) + { + pos = -1; + len = 0; + this.bufSpan = bufSpan; + current = new Item(); + } + + public readonly Item Current => current; + public bool MoveNext() + { + // Advance past zero terminator and previous string. + pos += 1 + len; + + var tail = bufSpan.Slice(pos); + + // Reached the end? End enumerating. + if(tail[0] == '\0') + return false; + + len = tail.IndexOf('\0'); + + this.current.Span = tail.Slice(0, len); + this.current.Pos = pos; + + return true; + } + } + + public ref struct Item + { + public ReadOnlySpan Span; + public int Pos; + } + + + public ReadOnlySequence Retrieve(ReadOnlySpan indices) + { + Segment root = null; + Segment current = null; + + int len = 0; + + foreach(var idx in indices) + { + var tail = buffer.Slice(idx); + len = tail.Span.IndexOf('\0'); + var text = tail.Slice(0, len); + + if(root == null) + { + root = new Segment(text); + current = root; + } + else + { + current = current.Add(text); + } + } + + return new ReadOnlySequence(root, 0, current, len); + } + + class Segment : ReadOnlySequenceSegment + { + public Segment(ReadOnlyMemory memory) + => Memory = memory; + public Segment Add(ReadOnlyMemory mem) + { + var segment = new Segment(mem); + segment.RunningIndex = RunningIndex + + Memory.Length; + Next = segment; + return segment; + } + } + + } + +} diff --git a/YellowCounter.FileSystemState/PathRedux/HashedCharBuffer.cs b/YellowCounter.FileSystemState/PathRedux/HashedCharBuffer.cs new file mode 100644 index 0000000..fe95059 --- /dev/null +++ b/YellowCounter.FileSystemState/PathRedux/HashedCharBuffer.cs @@ -0,0 +1,108 @@ +using System; +using System.Collections.Generic; +using System.Text; + +namespace YellowCounter.FileSystemState.PathRedux +{ + public class HashedCharBuffer + { + private readonly int linearSearchLimit; + private CharBuffer charBuffer; + private ChainedLookup chainedLookup; + + public HashedCharBuffer(int initialCharCapacity, int initialHashCapacity, int linearSearchLimit) + { + charBuffer = new CharBuffer(initialCharCapacity); + chainedLookup = new ChainedLookup(initialHashCapacity, linearSearchLimit); + this.linearSearchLimit = linearSearchLimit; + } + + /// + /// Returns index position + /// + /// + /// + public int Store(ReadOnlySpan text) + { + int hash = text.GetHashOfContents(); + int foundPos = findByHash(hash, text); + + if(foundPos != -1) + return foundPos; + + int pos = charBuffer.Store(text); + if(pos == -1) + { + int newSize = charBuffer.Capacity * 2; + if(newSize < text.Length + charBuffer.Capacity) + newSize = charBuffer.Capacity + text.Length; + + charBuffer.Resize(newSize); + + pos = charBuffer.Store(text); + } + + if(!chainedLookup.Store(hash, pos)) + { + rebuildLookup(); + chainedLookup.Store(hash, pos); + } + + return pos; + } + + public ReadOnlySpan Retrieve(int pos) + { + return charBuffer.Retrieve(pos); + } + + public int Find(ReadOnlySpan text) + { + int hash = text.GetHashOfContents(); + return findByHash(hash, text); + } + + private int findByHash(int hash, ReadOnlySpan text) + { + var indices = chainedLookup.Retrieve(hash); + return charBuffer.Match(text, indices); + } + + private void rebuildLookup() + { + // Doubling capacity will halve the number of hash collisions + var newLookup = new ChainedLookup(chainedLookup.Capacity * 2, linearSearchLimit); + + // Populate a new lookup from our existing data. + foreach(var itm in charBuffer) + { + if(!newLookup.Store(itm.Span.GetHashOfContents(), itm.Pos)) + throw new Exception("Oops"); + } + + // Use the new lookup + chainedLookup = newLookup; + } + } +} + + + + + +// Split by backslash / slash + +// Starting at the longest sequence, +// e.g. C:\abc\cde\efg\ghi\ +// then going backwards as +// C:\abc\cde\efg\ +// C:\abc\cde\ +// C:\abc\ + +// Generate the hashcode of the text. +// Look up the hashcode in the dictionary +// If we found it, we will get two things: +// Index of the tail entry +// Index of the parent + +// Create a new record \ No newline at end of file diff --git a/YellowCounter.FileSystemState/PathRedux/PathStorage.cs b/YellowCounter.FileSystemState/PathRedux/PathStorage.cs deleted file mode 100644 index 54adfe8..0000000 --- a/YellowCounter.FileSystemState/PathRedux/PathStorage.cs +++ /dev/null @@ -1,86 +0,0 @@ -using System; -using System.Buffers; -using System.Collections.Generic; -using System.Text; - -namespace YellowCounter.FileSystemState.PathRedux -{ - public class PathStorage - { - private Memory buffer = new char[100000]; - private int pos; - - public void Init() - { - } - - public int Store(ReadOnlySpan input) - { - var bufSpan = buffer.Span; - - var result = pos; - - input.CopyTo(bufSpan.Slice(pos, input.Length)); - pos += input.Length; - - bufSpan[pos] = '\0'; - pos++; - - return result; - } - - public ReadOnlySpan Retrieve(int index) - { - var bufSpan = buffer.Span; - - var begin = bufSpan.Slice(index); - - int len = begin.IndexOf('\0'); - - return begin.Slice(0, len); - } - - public ReadOnlySequence Retrieve(IEnumerable indices) - { - Segment root = null; - Segment current = null; - - int len = 0; - - foreach(var idx in indices) - { - var tail = buffer.Slice(idx); - len = tail.Span.IndexOf('\0'); - var text = tail.Slice(0, len); - - if(root == null) - { - root = new Segment(text); - current = root; - } - else - { - current = current.Add(text); - } - } - - return new ReadOnlySequence(root, 0, current, len); - } - - class Segment : ReadOnlySequenceSegment - { - public Segment(ReadOnlyMemory memory) - => Memory = memory; - public Segment Add(ReadOnlyMemory mem) - { - var segment = new Segment(mem); - segment.RunningIndex = RunningIndex + - Memory.Length; - Next = segment; - return segment; - } - } - - } - -} From 41fdd2cee45da02d35a09d9afee10b911389292d Mon Sep 17 00:00:00 2001 From: Alan Singfield Date: Sat, 7 Mar 2020 09:52:47 +0000 Subject: [PATCH 14/26] Attempt on hash bucket storage --- PathReduxTests/PathRedux/FixedHashFunction.cs | 28 +++++ .../PathRedux/HashedCharBufferTests.cs | 101 +++++++++++++++++- PathReduxTests/PathReduxTests.csproj | 1 + .../PathRedux/ChainedLookup.cs | 5 + .../PathRedux/HashFunction.cs | 11 ++ .../PathRedux/HashedCharBuffer.cs | 28 +++-- .../PathRedux/HashedCharBufferOptions.cs | 14 +++ .../PathRedux/IHashFunction.cs | 9 ++ 8 files changed, 186 insertions(+), 11 deletions(-) create mode 100644 PathReduxTests/PathRedux/FixedHashFunction.cs create mode 100644 YellowCounter.FileSystemState/PathRedux/HashFunction.cs create mode 100644 YellowCounter.FileSystemState/PathRedux/HashedCharBufferOptions.cs create mode 100644 YellowCounter.FileSystemState/PathRedux/IHashFunction.cs diff --git a/PathReduxTests/PathRedux/FixedHashFunction.cs b/PathReduxTests/PathRedux/FixedHashFunction.cs new file mode 100644 index 0000000..25adc4a --- /dev/null +++ b/PathReduxTests/PathRedux/FixedHashFunction.cs @@ -0,0 +1,28 @@ +using System; +using System.Collections.Generic; +using System.Text; +using YellowCounter.FileSystemState.PathRedux; + +namespace PathReduxTests.PathRedux +{ + internal class FixedHashFunction : IHashFunction + { + private Dictionary _force = new Dictionary(); + + public FixedHashFunction Fix(string arg, int value) + { + _force[arg] = value; + return this; + } + + public int HashSequence(ReadOnlySpan arg) + { + // Yes I know we are allocating a string here; this code is for testing + // not performance. + if(_force.TryGetValue(arg.ToString(), out int forcedHash)) + return forcedHash; + + throw new Exception($"Need a fixed hash value for {arg.ToString()}"); + } + } +} diff --git a/PathReduxTests/PathRedux/HashedCharBufferTests.cs b/PathReduxTests/PathRedux/HashedCharBufferTests.cs index 51a3a07..152a985 100644 --- a/PathReduxTests/PathRedux/HashedCharBufferTests.cs +++ b/PathReduxTests/PathRedux/HashedCharBufferTests.cs @@ -1,4 +1,5 @@ using Microsoft.VisualStudio.TestTools.UnitTesting; +using NSubstitute; using Shouldly; using System; using System.Collections.Generic; @@ -12,9 +13,46 @@ public class HashedCharBufferTests { [TestMethod] - public void HashedCharBufferAddAndRetrieve() + public void HashedCharBufferAddAndRetrieveNoClash() { - var buf = new HashedCharBuffer(20, 16, 3); + // Fix the hash codes. + var hasher = new FixedHashFunction() + .Fix("Hello", 1) + .Fix("World", 2); + + var buf = new HashedCharBuffer(new HashedCharBufferOptions() + { + HashFunction = hasher, + InitialCharCapacity = 20, + InitialHashCapacity = 16, + LinearSearchLimit = 3 + }); + + buf.Store("Hello"); + buf.Store("World"); + + buf.Find("Hello").ShouldBe(0); + buf.Find("World").ShouldBe(6); + + buf.Retrieve(0).ToString().ShouldBe("Hello"); + buf.Retrieve(6).ToString().ShouldBe("World"); + } + + [TestMethod] + public void HashedCharBufferAddAndRetrieveClash() + { + // Fix the hash codes to the same value + var hasher = new FixedHashFunction() + .Fix("Hello", 1) + .Fix("World", 1); + + var buf = new HashedCharBuffer(new HashedCharBufferOptions() + { + HashFunction = hasher, + InitialCharCapacity = 20, + InitialHashCapacity = 16, + LinearSearchLimit = 3 + }); buf.Store("Hello"); buf.Store("World"); @@ -25,5 +63,64 @@ public void HashedCharBufferAddAndRetrieve() buf.Retrieve(0).ToString().ShouldBe("Hello"); buf.Retrieve(6).ToString().ShouldBe("World"); } + + [TestMethod] + public void HashedCharBufferHashCollision() + { + // Fix the hash codes to the same value + var hasher = new FixedHashFunction() + .Fix("Hello", 1) + .Fix("World", 1); + + // Allow only 1 item in the linear search phase + var buf = new HashedCharBuffer(new HashedCharBufferOptions() + { + HashFunction = hasher, + InitialCharCapacity = 20, + InitialHashCapacity = 16, + LinearSearchLimit = 1 + }); + + buf.Store("Hello"); + + Should.Throw(() => + { + buf.Store("World"); + }, typeof(Exception)).Message.ShouldBe("Too many hash collisions. Increase LinearSearchLimit to overcome."); + } + + [TestMethod] + public void HashedCharBufferAddAndRetrieveClashRunOutX() + { + // Fix the hash codes to the same value modulo 16 + var hasher = new FixedHashFunction() + .Fix("Hello", 1) + .Fix("World", 17); + + // Allow 1 items in the linear search phase + var buf = new HashedCharBuffer(new HashedCharBufferOptions() + { + HashFunction = hasher, + InitialCharCapacity = 20, + InitialHashCapacity = 16, + LinearSearchLimit = 1 + }); + + buf.HashCapacity.ShouldBe(16); + + buf.Store("Hello"); + buf.Store("World"); + + buf.Find("Hello").ShouldBe(0); + buf.Find("World").ShouldBe(6); + + buf.Retrieve(0).ToString().ShouldBe("Hello"); + buf.Retrieve(6).ToString().ShouldBe("World"); + + // Hash capacity will have doubled to avoid clash of hashes + // 1 % 16 and 17 % 16 + // Once we double, we get 32 hash buckets so clash avoided. + buf.HashCapacity.ShouldBe(32); + } } } diff --git a/PathReduxTests/PathReduxTests.csproj b/PathReduxTests/PathReduxTests.csproj index 15913c5..082d74e 100644 --- a/PathReduxTests/PathReduxTests.csproj +++ b/PathReduxTests/PathReduxTests.csproj @@ -11,6 +11,7 @@ + diff --git a/YellowCounter.FileSystemState/PathRedux/ChainedLookup.cs b/YellowCounter.FileSystemState/PathRedux/ChainedLookup.cs index 83b623f..99b6611 100644 --- a/YellowCounter.FileSystemState/PathRedux/ChainedLookup.cs +++ b/YellowCounter.FileSystemState/PathRedux/ChainedLookup.cs @@ -52,6 +52,11 @@ public bool Store(int hash, int value) return false; } + /// + /// Modulo divide the hash by our capacity + /// + /// + /// private int keyFromHash(int hash) => (int)unchecked((uint)hash % (uint)Capacity); public ReadOnlySpan Retrieve(int hash) diff --git a/YellowCounter.FileSystemState/PathRedux/HashFunction.cs b/YellowCounter.FileSystemState/PathRedux/HashFunction.cs new file mode 100644 index 0000000..f858fd9 --- /dev/null +++ b/YellowCounter.FileSystemState/PathRedux/HashFunction.cs @@ -0,0 +1,11 @@ +using System; +using System.Collections.Generic; +using System.Text; + +namespace YellowCounter.FileSystemState.PathRedux +{ + public class HashFunction : IHashFunction + { + public int HashSequence(ReadOnlySpan arg) => arg.GetHashOfContents(); + } +} diff --git a/YellowCounter.FileSystemState/PathRedux/HashedCharBuffer.cs b/YellowCounter.FileSystemState/PathRedux/HashedCharBuffer.cs index fe95059..11a0666 100644 --- a/YellowCounter.FileSystemState/PathRedux/HashedCharBuffer.cs +++ b/YellowCounter.FileSystemState/PathRedux/HashedCharBuffer.cs @@ -9,14 +9,22 @@ public class HashedCharBuffer private readonly int linearSearchLimit; private CharBuffer charBuffer; private ChainedLookup chainedLookup; + private IHashFunction hashFunction; - public HashedCharBuffer(int initialCharCapacity, int initialHashCapacity, int linearSearchLimit) + public HashedCharBuffer(HashedCharBufferOptions options) { - charBuffer = new CharBuffer(initialCharCapacity); - chainedLookup = new ChainedLookup(initialHashCapacity, linearSearchLimit); - this.linearSearchLimit = linearSearchLimit; + charBuffer = new CharBuffer(options.InitialCharCapacity); + chainedLookup = new ChainedLookup(options.InitialHashCapacity, options.LinearSearchLimit); + + this.hashFunction = options.HashFunction; + this.linearSearchLimit = options.LinearSearchLimit; } + public int LinearSearchLimit => this.linearSearchLimit; + public int CharCapacity => charBuffer.Capacity; + public int HashCapacity => chainedLookup.Capacity; + public IHashFunction HashFunction => hashFunction; + /// /// Returns index position /// @@ -24,7 +32,7 @@ public HashedCharBuffer(int initialCharCapacity, int initialHashCapacity, int li /// public int Store(ReadOnlySpan text) { - int hash = text.GetHashOfContents(); + int hash = hashSequence(text); int foundPos = findByHash(hash, text); if(foundPos != -1) @@ -58,7 +66,7 @@ public ReadOnlySpan Retrieve(int pos) public int Find(ReadOnlySpan text) { - int hash = text.GetHashOfContents(); + int hash = hashSequence(text); return findByHash(hash, text); } @@ -68,16 +76,18 @@ private int findByHash(int hash, ReadOnlySpan text) return charBuffer.Match(text, indices); } + private int hashSequence(ReadOnlySpan text) => hashFunction.HashSequence(text); + private void rebuildLookup() { - // Doubling capacity will halve the number of hash collisions + // Doubling capacity will halve the number of moduloed hash collisions var newLookup = new ChainedLookup(chainedLookup.Capacity * 2, linearSearchLimit); // Populate a new lookup from our existing data. foreach(var itm in charBuffer) { - if(!newLookup.Store(itm.Span.GetHashOfContents(), itm.Pos)) - throw new Exception("Oops"); + if(!newLookup.Store(hashSequence(itm.Span), itm.Pos)) + throw new Exception($"Too many hash collisions. Increase {nameof(LinearSearchLimit)} to overcome."); } // Use the new lookup diff --git a/YellowCounter.FileSystemState/PathRedux/HashedCharBufferOptions.cs b/YellowCounter.FileSystemState/PathRedux/HashedCharBufferOptions.cs new file mode 100644 index 0000000..4c4bed2 --- /dev/null +++ b/YellowCounter.FileSystemState/PathRedux/HashedCharBufferOptions.cs @@ -0,0 +1,14 @@ +using System; +using System.Collections.Generic; +using System.Text; + +namespace YellowCounter.FileSystemState.PathRedux +{ + public class HashedCharBufferOptions + { + public IHashFunction HashFunction { get; set; } + public int InitialCharCapacity { get; set; } + public int InitialHashCapacity { get; set; } + public int LinearSearchLimit { get; set; } + } +} diff --git a/YellowCounter.FileSystemState/PathRedux/IHashFunction.cs b/YellowCounter.FileSystemState/PathRedux/IHashFunction.cs new file mode 100644 index 0000000..e088c9f --- /dev/null +++ b/YellowCounter.FileSystemState/PathRedux/IHashFunction.cs @@ -0,0 +1,9 @@ +using System; + +namespace YellowCounter.FileSystemState.PathRedux +{ + public interface IHashFunction + { + int HashSequence(ReadOnlySpan arg); + } +} \ No newline at end of file From 95859bc0c3b83f0baa7647ed991c0ef5ee10614f Mon Sep 17 00:00:00 2001 From: Alan Singfield Date: Sat, 7 Mar 2020 10:40:24 +0000 Subject: [PATCH 15/26] Created hashed character buffer --- .../PathRedux/ChainedLookupTests.cs | 26 +++---- .../PathRedux/ControllableHashFunction.cs | 24 +++++++ .../PathRedux/DeterministicHashFunction.cs | 32 +++++++++ PathReduxTests/PathRedux/FixedHashFunction.cs | 28 -------- .../PathRedux/HashedCharBufferTests.cs | 71 ++++++++----------- .../{ChainedLookup.cs => HashBucket.cs} | 4 +- .../PathRedux/HashedCharBuffer.cs | 6 +- 7 files changed, 103 insertions(+), 88 deletions(-) create mode 100644 PathReduxTests/PathRedux/ControllableHashFunction.cs create mode 100644 PathReduxTests/PathRedux/DeterministicHashFunction.cs delete mode 100644 PathReduxTests/PathRedux/FixedHashFunction.cs rename YellowCounter.FileSystemState/PathRedux/{ChainedLookup.cs => HashBucket.cs} (95%) diff --git a/PathReduxTests/PathRedux/ChainedLookupTests.cs b/PathReduxTests/PathRedux/ChainedLookupTests.cs index a089376..0dff570 100644 --- a/PathReduxTests/PathRedux/ChainedLookupTests.cs +++ b/PathReduxTests/PathRedux/ChainedLookupTests.cs @@ -8,12 +8,12 @@ namespace PathReduxTests.PathRedux { [TestClass] - public class ChainedLookupTests + public class HashBucketTests { [TestMethod] - public void ChainedLookupStoreRetrieve() + public void HashBucketStoreRetrieve() { - var m = new ChainedLookup(2, 2); + var m = new HashBucket(2, 2); m.Store(0, 123456).ShouldBe(true); m.Store(0, 765432).ShouldBe(true); @@ -24,9 +24,9 @@ public void ChainedLookupStoreRetrieve() } [TestMethod] - public void ChainedLookupStoreFlowpast() + public void HashBucketStoreFlowpast() { - var m = new ChainedLookup(2, 2); + var m = new HashBucket(2, 2); m.Store(1, 123456).ShouldBe(true); m.Store(1, 765432).ShouldBe(false); @@ -37,9 +37,9 @@ public void ChainedLookupStoreFlowpast() } [TestMethod] - public void ChainedLookupStoreZero() + public void HashBucketStoreZero() { - var m = new ChainedLookup(2, 2); + var m = new HashBucket(2, 2); // It can store a zero m.Store(0, 0).ShouldBe(true); @@ -49,9 +49,9 @@ public void ChainedLookupStoreZero() } [TestMethod] - public void ChainedLookupChainLimit() + public void HashBucketChainLimit() { - var m = new ChainedLookup(8, 2); + var m = new HashBucket(8, 2); m.Store(0, 100).ShouldBe(true); m.Store(0, 200).ShouldBe(true); @@ -63,9 +63,9 @@ public void ChainedLookupChainLimit() } [TestMethod] - public void ChainedLookupOverlap() + public void HashBucketOverlap() { - var m = new ChainedLookup(8, 8); + var m = new HashBucket(8, 8); // The values are going to overlap. m.Store(0, 100).ShouldBe(true); @@ -78,9 +78,9 @@ public void ChainedLookupOverlap() } [TestMethod] - public void ChainedLookupOverlapLimited() + public void HashBucketOverlapLimited() { - var m = new ChainedLookup(8, 2); + var m = new HashBucket(8, 2); // If we set the max chain to a lower value then the overlap // won't occur. diff --git a/PathReduxTests/PathRedux/ControllableHashFunction.cs b/PathReduxTests/PathRedux/ControllableHashFunction.cs new file mode 100644 index 0000000..a11fa70 --- /dev/null +++ b/PathReduxTests/PathRedux/ControllableHashFunction.cs @@ -0,0 +1,24 @@ +using System; +using System.Collections.Generic; +using System.Text; +using YellowCounter.FileSystemState.PathRedux; + +namespace PathReduxTests.PathRedux +{ + public class ControllableHashFunction : IHashFunction + { + public int HashSequence(ReadOnlySpan arg) + { + // Use comma as delimiter between desired hash number and remaining text. + int commaPos = arg.IndexOf(','); + + if(commaPos == -1) + throw new Exception($"{nameof(ControllableHashFunction)} requires , in each string"); + + if(int.TryParse(arg.Slice(0, commaPos), out int result)) + return result; + + throw new Exception("Text before , must be an integer"); + } + } +} diff --git a/PathReduxTests/PathRedux/DeterministicHashFunction.cs b/PathReduxTests/PathRedux/DeterministicHashFunction.cs new file mode 100644 index 0000000..6c1bc22 --- /dev/null +++ b/PathReduxTests/PathRedux/DeterministicHashFunction.cs @@ -0,0 +1,32 @@ +using System; +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.Text; +using YellowCounter.FileSystemState.PathRedux; + +namespace PathReduxTests.PathRedux +{ + public class DeterministicHashFunction : IHashFunction + { + // Want a deterministic hash function so our tests are repeatable. + // https://andrewlock.net/why-is-string-gethashcode-different-each-time-i-run-my-program-in-net-core/ + public int HashSequence(ReadOnlySpan str) + { + unchecked + { + int hash1 = (5381 << 16) + 5381; + int hash2 = hash1; + + for(int i = 0; i < str.Length; i += 2) + { + hash1 = ((hash1 << 5) + hash1) ^ str[i]; + if(i == str.Length - 1) + break; + hash2 = ((hash2 << 5) + hash2) ^ str[i + 1]; + } + + return hash1 + (hash2 * 1566083941); + } + } + } +} diff --git a/PathReduxTests/PathRedux/FixedHashFunction.cs b/PathReduxTests/PathRedux/FixedHashFunction.cs deleted file mode 100644 index 25adc4a..0000000 --- a/PathReduxTests/PathRedux/FixedHashFunction.cs +++ /dev/null @@ -1,28 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Text; -using YellowCounter.FileSystemState.PathRedux; - -namespace PathReduxTests.PathRedux -{ - internal class FixedHashFunction : IHashFunction - { - private Dictionary _force = new Dictionary(); - - public FixedHashFunction Fix(string arg, int value) - { - _force[arg] = value; - return this; - } - - public int HashSequence(ReadOnlySpan arg) - { - // Yes I know we are allocating a string here; this code is for testing - // not performance. - if(_force.TryGetValue(arg.ToString(), out int forcedHash)) - return forcedHash; - - throw new Exception($"Need a fixed hash value for {arg.ToString()}"); - } - } -} diff --git a/PathReduxTests/PathRedux/HashedCharBufferTests.cs b/PathReduxTests/PathRedux/HashedCharBufferTests.cs index 152a985..0a56f59 100644 --- a/PathReduxTests/PathRedux/HashedCharBufferTests.cs +++ b/PathReduxTests/PathRedux/HashedCharBufferTests.cs @@ -15,14 +15,9 @@ public class HashedCharBufferTests [TestMethod] public void HashedCharBufferAddAndRetrieveNoClash() { - // Fix the hash codes. - var hasher = new FixedHashFunction() - .Fix("Hello", 1) - .Fix("World", 2); - var buf = new HashedCharBuffer(new HashedCharBufferOptions() { - HashFunction = hasher, + HashFunction = new DeterministicHashFunction(), InitialCharCapacity = 20, InitialHashCapacity = 16, LinearSearchLimit = 3 @@ -41,86 +36,78 @@ public void HashedCharBufferAddAndRetrieveNoClash() [TestMethod] public void HashedCharBufferAddAndRetrieveClash() { - // Fix the hash codes to the same value - var hasher = new FixedHashFunction() - .Fix("Hello", 1) - .Fix("World", 1); - var buf = new HashedCharBuffer(new HashedCharBufferOptions() { - HashFunction = hasher, + HashFunction = new ControllableHashFunction(), InitialCharCapacity = 20, InitialHashCapacity = 16, LinearSearchLimit = 3 }); - buf.Store("Hello"); - buf.Store("World"); + buf.Store("1,Hello"); + buf.Store("1,World"); - buf.Find("Hello").ShouldBe(0); - buf.Find("World").ShouldBe(6); + // Confirm that both strings the same hashcode. + buf.HashFunction.HashSequence("1,Hello").ShouldBe(1); + buf.HashFunction.HashSequence("1,World").ShouldBe(1); - buf.Retrieve(0).ToString().ShouldBe("Hello"); - buf.Retrieve(6).ToString().ShouldBe("World"); + buf.Find("1,Hello").ShouldBe(0); + buf.Find("1,World").ShouldBe(8); + + buf.Retrieve(0).ToString().ShouldBe("1,Hello"); + buf.Retrieve(8).ToString().ShouldBe("1,World"); } [TestMethod] public void HashedCharBufferHashCollision() { - // Fix the hash codes to the same value - var hasher = new FixedHashFunction() - .Fix("Hello", 1) - .Fix("World", 1); - // Allow only 1 item in the linear search phase var buf = new HashedCharBuffer(new HashedCharBufferOptions() { - HashFunction = hasher, + HashFunction = new ControllableHashFunction(), InitialCharCapacity = 20, InitialHashCapacity = 16, LinearSearchLimit = 1 }); - buf.Store("Hello"); + buf.Store("1,Hello"); Should.Throw(() => { - buf.Store("World"); + buf.Store("1,World"); }, typeof(Exception)).Message.ShouldBe("Too many hash collisions. Increase LinearSearchLimit to overcome."); } [TestMethod] public void HashedCharBufferAddAndRetrieveClashRunOutX() { - // Fix the hash codes to the same value modulo 16 - var hasher = new FixedHashFunction() - .Fix("Hello", 1) - .Fix("World", 17); // Allow 1 items in the linear search phase var buf = new HashedCharBuffer(new HashedCharBufferOptions() { - HashFunction = hasher, + HashFunction = new ControllableHashFunction(), InitialCharCapacity = 20, - InitialHashCapacity = 16, + InitialHashCapacity = 8, LinearSearchLimit = 1 }); - buf.HashCapacity.ShouldBe(16); + buf.HashCapacity.ShouldBe(8); - buf.Store("Hello"); - buf.Store("World"); + // Fix the hash codes to the same value modulo 8 - buf.Find("Hello").ShouldBe(0); - buf.Find("World").ShouldBe(6); + buf.Store("1,Hello"); + buf.Store("9,World"); - buf.Retrieve(0).ToString().ShouldBe("Hello"); - buf.Retrieve(6).ToString().ShouldBe("World"); + buf.Find("1,Hello").ShouldBe(0); + buf.Find("9,World").ShouldBe(8); + + buf.Retrieve(0).ToString().ShouldBe("1,Hello"); + buf.Retrieve(8).ToString().ShouldBe("9,World"); // Hash capacity will have doubled to avoid clash of hashes - // 1 % 16 and 17 % 16 - // Once we double, we get 32 hash buckets so clash avoided. - buf.HashCapacity.ShouldBe(32); + // 1 % 8 and 9 % 8 + // Once we double, we get 16 hash buckets so clash avoided. + buf.HashCapacity.ShouldBe(16); } } } diff --git a/YellowCounter.FileSystemState/PathRedux/ChainedLookup.cs b/YellowCounter.FileSystemState/PathRedux/HashBucket.cs similarity index 95% rename from YellowCounter.FileSystemState/PathRedux/ChainedLookup.cs rename to YellowCounter.FileSystemState/PathRedux/HashBucket.cs index 99b6611..98d464e 100644 --- a/YellowCounter.FileSystemState/PathRedux/ChainedLookup.cs +++ b/YellowCounter.FileSystemState/PathRedux/HashBucket.cs @@ -6,14 +6,14 @@ namespace YellowCounter.FileSystemState.PathRedux { - public class ChainedLookup + public class HashBucket { private Memory mem; private readonly int capacity; private readonly int maxChain; private BitArray usage; - public ChainedLookup(int capacity, int maxChain) + public HashBucket(int capacity, int maxChain) { mem = new int[capacity]; usage = new BitArray(capacity); diff --git a/YellowCounter.FileSystemState/PathRedux/HashedCharBuffer.cs b/YellowCounter.FileSystemState/PathRedux/HashedCharBuffer.cs index 11a0666..15477c1 100644 --- a/YellowCounter.FileSystemState/PathRedux/HashedCharBuffer.cs +++ b/YellowCounter.FileSystemState/PathRedux/HashedCharBuffer.cs @@ -8,13 +8,13 @@ public class HashedCharBuffer { private readonly int linearSearchLimit; private CharBuffer charBuffer; - private ChainedLookup chainedLookup; + private HashBucket chainedLookup; private IHashFunction hashFunction; public HashedCharBuffer(HashedCharBufferOptions options) { charBuffer = new CharBuffer(options.InitialCharCapacity); - chainedLookup = new ChainedLookup(options.InitialHashCapacity, options.LinearSearchLimit); + chainedLookup = new HashBucket(options.InitialHashCapacity, options.LinearSearchLimit); this.hashFunction = options.HashFunction; this.linearSearchLimit = options.LinearSearchLimit; @@ -81,7 +81,7 @@ private int findByHash(int hash, ReadOnlySpan text) private void rebuildLookup() { // Doubling capacity will halve the number of moduloed hash collisions - var newLookup = new ChainedLookup(chainedLookup.Capacity * 2, linearSearchLimit); + var newLookup = new HashBucket(chainedLookup.Capacity * 2, linearSearchLimit); // Populate a new lookup from our existing data. foreach(var itm in charBuffer) From 7116d729244c25c284ab9b1920329a9b34f6ea79 Mon Sep 17 00:00:00 2001 From: Alan Singfield Date: Sat, 7 Mar 2020 14:00:19 +0000 Subject: [PATCH 16/26] Got compressed hash buffer working --- PathReduxTests/PathRedux/PathStorageTests.cs | 31 ++++ .../PathRedux/CharBuffer.cs | 44 ++++++ .../PathRedux/HashedCharBuffer.cs | 2 + .../PathRedux/PathStorage.cs | 138 ++++++++++++++++++ 4 files changed, 215 insertions(+) create mode 100644 PathReduxTests/PathRedux/PathStorageTests.cs create mode 100644 YellowCounter.FileSystemState/PathRedux/PathStorage.cs diff --git a/PathReduxTests/PathRedux/PathStorageTests.cs b/PathReduxTests/PathRedux/PathStorageTests.cs new file mode 100644 index 0000000..ae77e3c --- /dev/null +++ b/PathReduxTests/PathRedux/PathStorageTests.cs @@ -0,0 +1,31 @@ +using Microsoft.VisualStudio.TestTools.UnitTesting; +using System; +using System.Collections.Generic; +using System.Text; +using YellowCounter.FileSystemState.PathRedux; +using Shouldly; + +namespace PathReduxTests.PathRedux +{ + [TestClass] + public class PathStorageTests + { + [TestMethod] + public void PathStorage1() + { + var ps = new PathStorage(); + + var results = new List(); + + results.Add(ps.Store(@"C:\abc")); + results.Add(ps.Store(@"C:\abc\xyz")); + results.Add(ps.Store(@"C:\abc\cde")); + results.Add(ps.Store(@"C:\mmm\cde")); + + ps.CreateString(results[0]).ShouldBe(@"C:\abc"); + ps.CreateString(results[1]).ShouldBe(@"C:\abc\xyz"); + ps.CreateString(results[2]).ShouldBe(@"C:\abc\cde"); + ps.CreateString(results[3]).ShouldBe(@"C:\mmm\cde"); + } + } +} diff --git a/YellowCounter.FileSystemState/PathRedux/CharBuffer.cs b/YellowCounter.FileSystemState/PathRedux/CharBuffer.cs index 43e4dd9..25a95d7 100644 --- a/YellowCounter.FileSystemState/PathRedux/CharBuffer.cs +++ b/YellowCounter.FileSystemState/PathRedux/CharBuffer.cs @@ -82,7 +82,51 @@ public ReadOnlySpan Retrieve(int index) return begin.Slice(0, len); } + public string CreateString(IEnumerable indices) + { + int totalLen = 0; + var posLens = new List(); + // Gather up pos / lens + + var bufSpan = buffer.Span; + + foreach(var idx in indices) + { + var tail = bufSpan.Slice(idx); + var len = tail.IndexOf('\0'); + + totalLen += len; + posLens.Add(new PosLen(idx, len)); + //var text = tail.Slice(0, len); + } + + return String.Create(totalLen, (buffer, posLens, totalLen), + (chars, state) => + { + var span = state.buffer.Span; + var pos = state.totalLen; + + foreach(var posLen in posLens) + { + var text = span.Slice(posLen.Pos, posLen.Len); + + pos -= posLen.Len; + + text.CopyTo(chars.Slice(pos, posLen.Len)); + } + }); + } + private readonly struct PosLen + { + public PosLen(int pos, int len) + { + this.Pos = pos; + this.Len = len; + } + public int Pos { get; } + public int Len { get; } + } public Enumerator GetEnumerator() { diff --git a/YellowCounter.FileSystemState/PathRedux/HashedCharBuffer.cs b/YellowCounter.FileSystemState/PathRedux/HashedCharBuffer.cs index 15477c1..ccd073e 100644 --- a/YellowCounter.FileSystemState/PathRedux/HashedCharBuffer.cs +++ b/YellowCounter.FileSystemState/PathRedux/HashedCharBuffer.cs @@ -64,6 +64,8 @@ public ReadOnlySpan Retrieve(int pos) return charBuffer.Retrieve(pos); } + public string CreateString(IEnumerable indices) => charBuffer.CreateString(indices); + public int Find(ReadOnlySpan text) { int hash = hashSequence(text); diff --git a/YellowCounter.FileSystemState/PathRedux/PathStorage.cs b/YellowCounter.FileSystemState/PathRedux/PathStorage.cs new file mode 100644 index 0000000..7300a20 --- /dev/null +++ b/YellowCounter.FileSystemState/PathRedux/PathStorage.cs @@ -0,0 +1,138 @@ +using System; +using System.Collections.Generic; +using System.Text; + +namespace YellowCounter.FileSystemState.PathRedux +{ + public class PathStorage + { + private HashedCharBuffer buf; + private HashBucket buckets; + private List entries; + + public PathStorage() + { + buf = new HashedCharBuffer(new HashedCharBufferOptions() + { + HashFunction = new HashFunction(), + InitialCharCapacity = 1024, + InitialHashCapacity = 256, + LinearSearchLimit = 128 + }); + + buckets = new HashBucket(128, 16); + + entries = new List(); + + // Create a root entry so 0 is not a valid index + entries.Add(new Entry(-1,-1)); + } + + public int Store(ReadOnlySpan arg) + { + var hash = arg.GetHashOfContents(); + + foreach(var idx in buckets.Retrieve(hash)) + { + if(match(idx, arg)) + return idx; + } + + // Find a slash or backslash. + int slashPos = arg.LastIndexOfAny(new[] { '\\', '/' }); + + int parentIdx; + int textRef; + + // No more slash delimiters, so store a root entry (parent index 0). + if(slashPos == -1) + { + parentIdx = 0; + textRef = buf.Store(arg); + } + else + { + // Recursively call back to ourselves to store all text + // up to the parent directory name. This might find an + // existing entry or need to create one. + parentIdx = this.Store(arg.Slice(0, slashPos)); + + // Store the text from the slash onwards as our entry. + textRef = buf.Store(arg.Slice(slashPos)); + } + + int result = entries.Count; + entries.Add(new Entry(textRef, parentIdx)); + + return result; + } + + public string CreateString(int idx) + { + return buf.CreateString(chain(idx)); + } + + private IEnumerable chain(int idx) + { + int cursorIdx = idx; + + while(cursorIdx != 0) + { + var entry = entries[cursorIdx]; + + yield return entry.TextRef; + cursorIdx = entry.ParentIdx; + } + } + + private bool match(int idx, ReadOnlySpan arg) + { + int argStart = arg.Length; + int cursorIdx = idx; + + while(true) + { + var entry = entries[cursorIdx]; + + var text = buf.Retrieve(entry.TextRef); + + argStart -= text.Length; + + if(argStart < 0) + return false; + + var argSlice = arg.Slice(argStart, text.Length); + + if(!text.SequenceEqual(argSlice)) + return false; + + // Loop round to our parent entry + cursorIdx = entry.ParentIdx; + + if(cursorIdx == 0) + { + // If the target has no parent, and we've examined all of arg + // then we've got a correct match + if(argStart == 0) + return true; + + return false; + } + } + + } + + private readonly struct Entry + { + public Entry(int textRef, int parentIdx) + { + this.TextRef = textRef; + this.ParentIdx = parentIdx; + } + + public int TextRef { get; } + public int ParentIdx { get; } + } + + } +} From 67f3c79416c3a3402910a4e08767dccde7ba0e3a Mon Sep 17 00:00:00 2001 From: Alan Singfield Date: Tue, 10 Mar 2020 07:53:21 +0000 Subject: [PATCH 17/26] Renamed to HashBucket --- YellowCounter.FileSystemState/PathRedux/HashBucket.cs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/YellowCounter.FileSystemState/PathRedux/HashBucket.cs b/YellowCounter.FileSystemState/PathRedux/HashBucket.cs index 98d464e..1706db2 100644 --- a/YellowCounter.FileSystemState/PathRedux/HashBucket.cs +++ b/YellowCounter.FileSystemState/PathRedux/HashBucket.cs @@ -25,13 +25,13 @@ public HashBucket(int capacity, int maxChain) public bool Store(int hash, int value) { - int key = keyFromHash(hash); + int bucket = bucketFromHash(hash); var span = mem.Span; int chainLen = 0; // Look for an empty slot in our buffer - for(int i = key; i < capacity; i++) + for(int i = bucket; i < capacity; i++) { if(!usage[i]) { @@ -57,11 +57,11 @@ public bool Store(int hash, int value) /// /// /// - private int keyFromHash(int hash) => (int)unchecked((uint)hash % (uint)Capacity); + private int bucketFromHash(int hash) => (int)unchecked((uint)hash % (uint)Capacity); public ReadOnlySpan Retrieve(int hash) { - int key = keyFromHash(hash); + int key = bucketFromHash(hash); var span = mem.Span; int chainLen = 0; From bbceb2bfbc9b37268eac2d69a974f731c92ae8e7 Mon Sep 17 00:00:00 2001 From: Alan Singfield Date: Thu, 12 Mar 2020 08:01:43 +0000 Subject: [PATCH 18/26] Removed RelativeDir thingy --- YellowCounter.FileSystemState/FileState.cs | 2 +- .../FileSystemState.cs | 16 +++++++------- .../PathRedux/IPathStorage.cs | 10 +++++++++ .../PathRedux/PathStorage.cs | 4 ++-- .../PathToFileStateHashtable.cs | 22 +++++++------------ 5 files changed, 29 insertions(+), 25 deletions(-) create mode 100644 YellowCounter.FileSystemState/PathRedux/IPathStorage.cs diff --git a/YellowCounter.FileSystemState/FileState.cs b/YellowCounter.FileSystemState/FileState.cs index 9a99185..1f74121 100644 --- a/YellowCounter.FileSystemState/FileState.cs +++ b/YellowCounter.FileSystemState/FileState.cs @@ -16,7 +16,7 @@ internal class FileState [NonSerialized] public long ChangeVersion; - public string RelativeDir; + public string Directory; public string FileName; public DateTimeOffset LastWriteTimeUtc; public long Length; diff --git a/YellowCounter.FileSystemState/FileSystemState.cs b/YellowCounter.FileSystemState/FileSystemState.cs index 2d6ca84..ae22a1f 100644 --- a/YellowCounter.FileSystemState/FileSystemState.cs +++ b/YellowCounter.FileSystemState/FileSystemState.cs @@ -23,7 +23,7 @@ public FileSystemState(string rootDir, string filter = "*", EnumerationOptions o EnumerationOptions = options ?? new EnumerationOptions(); - _state = new PathToFileStateHashtable(new StringInternPool(), this.RootDir.Length); + _state = new PathToFileStateHashtable(new StringInternPool()); } public string RootDir { get; set; } @@ -108,23 +108,23 @@ private FileChangeList convertToFileChanges( { var createResults = creates .Except(renames.Select(x => x.NewFile)) - .Select(x => new FileChange(this.RootDir + x.RelativeDir, x.FileName, WatcherChangeTypes.Created)) + .Select(x => new FileChange(x.Directory, x.FileName, WatcherChangeTypes.Created)) ; var changeResults = changes - .Select(x => new FileChange(this.RootDir + x.RelativeDir, x.FileName, WatcherChangeTypes.Changed)) + .Select(x => new FileChange(x.Directory, x.FileName, WatcherChangeTypes.Changed)) ; var removeResults = removals .Except(renames.Select(x => x.OldFile)) - .Select(x => new FileChange(this.RootDir + x.RelativeDir, x.FileName, WatcherChangeTypes.Deleted)) + .Select(x => new FileChange(x.Directory, x.FileName, WatcherChangeTypes.Deleted)) ; var renameResults = renames.Select(x => new FileChange( - this.RootDir + x.NewFile.RelativeDir, + x.NewFile.Directory, x.NewFile.FileName, WatcherChangeTypes.Renamed, - this.RootDir + x.OldFile.RelativeDir, + x.OldFile.Directory, x.OldFile.FileName)) ; @@ -189,7 +189,7 @@ private FileChangeList convertToFileChanges( // Group by last write time, length and directory or filename x.LastWriteTimeUtc, x.Length, - Name = byName ? x.RelativeDir : x.FileName + Name = byName ? x.Directory : x.FileName }, (x, y) => new { @@ -203,7 +203,7 @@ private FileChangeList convertToFileChanges( .ToList(); var removesByTime = removals - .GroupBy(x => new { x.LastWriteTimeUtc, x.Length, Name = byName ? x.RelativeDir : x.FileName }, + .GroupBy(x => new { x.LastWriteTimeUtc, x.Length, Name = byName ? x.Directory : x.FileName }, (x, y) => new { x.LastWriteTimeUtc, x.Length, x.Name, Removes = y.ToList() }) .ToList(); diff --git a/YellowCounter.FileSystemState/PathRedux/IPathStorage.cs b/YellowCounter.FileSystemState/PathRedux/IPathStorage.cs new file mode 100644 index 0000000..b39cc01 --- /dev/null +++ b/YellowCounter.FileSystemState/PathRedux/IPathStorage.cs @@ -0,0 +1,10 @@ +using System; + +namespace YellowCounter.FileSystemState.PathRedux +{ + public interface IPathStorage + { + string CreateString(int idx); + int Store(ReadOnlySpan arg); + } +} \ No newline at end of file diff --git a/YellowCounter.FileSystemState/PathRedux/PathStorage.cs b/YellowCounter.FileSystemState/PathRedux/PathStorage.cs index 7300a20..6fd6dcd 100644 --- a/YellowCounter.FileSystemState/PathRedux/PathStorage.cs +++ b/YellowCounter.FileSystemState/PathRedux/PathStorage.cs @@ -4,7 +4,7 @@ namespace YellowCounter.FileSystemState.PathRedux { - public class PathStorage + public class PathStorage : IPathStorage { private HashedCharBuffer buf; private HashBucket buckets; @@ -25,7 +25,7 @@ public PathStorage() entries = new List(); // Create a root entry so 0 is not a valid index - entries.Add(new Entry(-1,-1)); + entries.Add(new Entry(-1, -1)); } public int Store(ReadOnlySpan arg) diff --git a/YellowCounter.FileSystemState/PathToFileStateHashtable.cs b/YellowCounter.FileSystemState/PathToFileStateHashtable.cs index a3d1a38..d7f5310 100644 --- a/YellowCounter.FileSystemState/PathToFileStateHashtable.cs +++ b/YellowCounter.FileSystemState/PathToFileStateHashtable.cs @@ -11,26 +11,19 @@ internal class PathToFileStateHashtable { Dictionary> dict; private readonly IStringInternPool stringInternPool; - private readonly int truncate; - public PathToFileStateHashtable(IStringInternPool stringInternPool, int truncate = 0) + public PathToFileStateHashtable(IStringInternPool stringInternPool) { dict = new Dictionary>(); this.stringInternPool = stringInternPool; - this.truncate = truncate; } internal void Mark(ref FileSystemEntry input,long version) { - // If we are scanning folder c:\verylongdirectoryname\ there is no need to store - // the same text c:\verylongdirectoryname\ over and over again so we remove the - // root from the directory name leaving the relative path - var relativeDir = input.Directory.Slice(truncate); - // Without allocating strings, calculate a hashcode based on the // directory and filename. int hashCode = HashCode.Combine( - relativeDir.GetHashOfContents(), + input.Directory.GetHashOfContents(), input.FileName.GetHashOfContents()); if(dict.TryGetValue(hashCode, out var fileStates)) @@ -45,7 +38,7 @@ internal void Mark(ref FileSystemEntry input,long version) // Use Equals() to match to avoid allocating strings. if(input.FileName.Equals(existing.FileName, StringComparison.Ordinal) - && relativeDir.Equals(existing.RelativeDir, StringComparison.Ordinal)) + && input.Directory.Equals(existing.Directory, StringComparison.Ordinal)) { // Found the file; compare to our existing record so we can // detect if it has been modified. @@ -59,13 +52,13 @@ internal void Mark(ref FileSystemEntry input,long version) // Hash collision! Add on the end of the list. if(!found) { - fileStates.Add(newFileState(input, ref relativeDir, version)); + fileStates.Add(newFileState(input, version)); } } else { // Not seen before, create a 1-element list and add to the dictionary. - dict.Add(hashCode, new List() { newFileState(input, ref relativeDir, version) }); + dict.Add(hashCode, new List() { newFileState(input, version) }); } } @@ -87,7 +80,7 @@ private void markExisting(FileState fs, FileSystemEntry input, long version) } } - private FileState newFileState(FileSystemEntry input, ref ReadOnlySpan relativeDir, long version) + private FileState newFileState(FileSystemEntry input, long version) { var fileState = new FileState(); @@ -98,8 +91,9 @@ private FileState newFileState(FileSystemEntry input, ref ReadOnlySpan rel // Here's where we're allocating the strings. Note we only do this when // we first see a file, not on each subsequent scan for changes. var fn = input.FileName; + var dir = input.Directory; - fileState.RelativeDir = stringInternPool.Intern(ref relativeDir); + fileState.Directory = stringInternPool.Intern(ref dir); fileState.FileName = stringInternPool.Intern(ref fn); //fileState.Directory = input.Directory.ToString(); From 6b207bb137c0890c307341b9c8b7951f3f8b5aab Mon Sep 17 00:00:00 2001 From: Alan Singfield Date: Sat, 14 Mar 2020 16:58:23 +0000 Subject: [PATCH 19/26] Removed use of StringIntern --- YellowCounter.FileSystemState/FileState.cs | 4 +- .../FileSystemState.cs | 54 +++++++++++--- .../PathRedux/PathStorage.cs | 31 ++++++++ .../PathToFileStateHashtable.cs | 70 +++++++++---------- 4 files changed, 111 insertions(+), 48 deletions(-) diff --git a/YellowCounter.FileSystemState/FileState.cs b/YellowCounter.FileSystemState/FileState.cs index 1f74121..082f7b3 100644 --- a/YellowCounter.FileSystemState/FileState.cs +++ b/YellowCounter.FileSystemState/FileState.cs @@ -16,8 +16,8 @@ internal class FileState [NonSerialized] public long ChangeVersion; - public string Directory; - public string FileName; + public int DirectoryRef; + public int FilenameRef; public DateTimeOffset LastWriteTimeUtc; public long Length; diff --git a/YellowCounter.FileSystemState/FileSystemState.cs b/YellowCounter.FileSystemState/FileSystemState.cs index ae22a1f..334ef42 100644 --- a/YellowCounter.FileSystemState/FileSystemState.cs +++ b/YellowCounter.FileSystemState/FileSystemState.cs @@ -5,6 +5,7 @@ using System.Runtime.InteropServices; using System.Runtime.Serialization.Formatters.Binary; using System.Linq; +using YellowCounter.FileSystemState.PathRedux; namespace YellowCounter.FileSystemState { @@ -23,13 +24,16 @@ public FileSystemState(string rootDir, string filter = "*", EnumerationOptions o EnumerationOptions = options ?? new EnumerationOptions(); - _state = new PathToFileStateHashtable(new StringInternPool()); + this.pathStorage = new PathStorage(); + _state = new PathToFileStateHashtable(this.pathStorage); } public string RootDir { get; set; } public string Filter { get; set; } public EnumerationOptions EnumerationOptions { get; set; } + private readonly PathStorage pathStorage; + public void LoadState() { // Set initial baseline by reading current directory state without returning @@ -108,24 +112,24 @@ private FileChangeList convertToFileChanges( { var createResults = creates .Except(renames.Select(x => x.NewFile)) - .Select(x => new FileChange(x.Directory, x.FileName, WatcherChangeTypes.Created)) + .Select(x => newFileChange(x.DirectoryRef, x.FilenameRef, WatcherChangeTypes.Created)) ; var changeResults = changes - .Select(x => new FileChange(x.Directory, x.FileName, WatcherChangeTypes.Changed)) + .Select(x => newFileChange(x.DirectoryRef, x.FilenameRef, WatcherChangeTypes.Changed)) ; var removeResults = removals .Except(renames.Select(x => x.OldFile)) - .Select(x => new FileChange(x.Directory, x.FileName, WatcherChangeTypes.Deleted)) + .Select(x => newFileChange(x.DirectoryRef, x.FilenameRef, WatcherChangeTypes.Deleted)) ; - var renameResults = renames.Select(x => new FileChange( - x.NewFile.Directory, - x.NewFile.FileName, + var renameResults = renames.Select(x => newFileChange2( + x.NewFile.DirectoryRef, + x.NewFile.FilenameRef, WatcherChangeTypes.Renamed, - x.OldFile.Directory, - x.OldFile.FileName)) + x.OldFile.DirectoryRef, + x.OldFile.FilenameRef)) ; var result = new FileChangeList(); @@ -136,6 +140,34 @@ private FileChangeList convertToFileChanges( result.AddRange(renameResults); return result; + + FileChange newFileChange( + int directoryRef, + int filenameRef, + WatcherChangeTypes changeType) + { + return new FileChange( + pathStorage.CreateString(directoryRef), + pathStorage.CreateString(filenameRef), + changeType); + } + + FileChange newFileChange2( + int newDirectoryRef, + int newFilenameRef, + WatcherChangeTypes changeType, + int oldDirectoryRef, + int oldFilenameRef + ) + { + return new FileChange( + pathStorage.CreateString(newDirectoryRef), + pathStorage.CreateString(newFilenameRef), + changeType, + pathStorage.CreateString(oldDirectoryRef), + pathStorage.CreateString(oldFilenameRef) + ); + } } private ( @@ -189,7 +221,7 @@ private FileChangeList convertToFileChanges( // Group by last write time, length and directory or filename x.LastWriteTimeUtc, x.Length, - Name = byName ? x.Directory : x.FileName + Name = byName ? x.DirectoryRef : x.FilenameRef }, (x, y) => new { @@ -203,7 +235,7 @@ private FileChangeList convertToFileChanges( .ToList(); var removesByTime = removals - .GroupBy(x => new { x.LastWriteTimeUtc, x.Length, Name = byName ? x.Directory : x.FileName }, + .GroupBy(x => new { x.LastWriteTimeUtc, x.Length, Name = byName ? x.DirectoryRef : x.FilenameRef }, (x, y) => new { x.LastWriteTimeUtc, x.Length, x.Name, Removes = y.ToList() }) .ToList(); diff --git a/YellowCounter.FileSystemState/PathRedux/PathStorage.cs b/YellowCounter.FileSystemState/PathRedux/PathStorage.cs index 6fd6dcd..bc096a7 100644 --- a/YellowCounter.FileSystemState/PathRedux/PathStorage.cs +++ b/YellowCounter.FileSystemState/PathRedux/PathStorage.cs @@ -64,9 +64,40 @@ public int Store(ReadOnlySpan arg) int result = entries.Count; entries.Add(new Entry(textRef, parentIdx)); + if(!buckets.Store(hash, result)) + { + // Rebuild buckets from List twice as big + rebuildBuckets(); + + if(!buckets.Store(hash, result)) + throw new Exception("Run out..."); + } + return result; } + private void rebuildBuckets() + { + var newBuckets = new HashBucket(buckets.Capacity * 2, buckets.MaxChain); + + for(int idx = 0; idx < entries.Count; idx++) + { + var h = new HashCode(); + + foreach(var textRef in chain(idx)) + { + var text = buf.Retrieve(textRef); + h.Add(text.GetHashOfContents()); + } + + int hash = h.ToHashCode(); + + newBuckets.Store(hash, idx); + } + + this.buckets = newBuckets; + } + public string CreateString(int idx) { return buf.CreateString(chain(idx)); diff --git a/YellowCounter.FileSystemState/PathToFileStateHashtable.cs b/YellowCounter.FileSystemState/PathToFileStateHashtable.cs index d7f5310..4cc5336 100644 --- a/YellowCounter.FileSystemState/PathToFileStateHashtable.cs +++ b/YellowCounter.FileSystemState/PathToFileStateHashtable.cs @@ -3,6 +3,7 @@ using System.Runtime.Serialization; using System.Linq; using System.IO.Enumeration; +using YellowCounter.FileSystemState.PathRedux; namespace YellowCounter.FileSystemState { @@ -10,21 +11,27 @@ namespace YellowCounter.FileSystemState internal class PathToFileStateHashtable { Dictionary> dict; - private readonly IStringInternPool stringInternPool; + private readonly IPathStorage pathStorage; - public PathToFileStateHashtable(IStringInternPool stringInternPool) + public PathToFileStateHashtable(IPathStorage pathStorage) { dict = new Dictionary>(); - this.stringInternPool = stringInternPool; + + this.pathStorage = pathStorage; } internal void Mark(ref FileSystemEntry input,long version) { - // Without allocating strings, calculate a hashcode based on the - // directory and filename. - int hashCode = HashCode.Combine( - input.Directory.GetHashOfContents(), - input.FileName.GetHashOfContents()); + int dirRef = pathStorage.Store(input.Directory); + int filenameRef = pathStorage.Store(input.FileName); + + int hashCode = HashCode.Combine(dirRef.GetHashCode(), filenameRef.GetHashCode()); + + //// Without allocating strings, calculate a hashcode based on the + //// directory and filename. + //int hashCode = HashCode.Combine( + // input.Directory.GetHashOfContents(), + // input.FileName.GetHashOfContents()); if(dict.TryGetValue(hashCode, out var fileStates)) { @@ -37,8 +44,7 @@ internal void Mark(ref FileSystemEntry input,long version) // matches in here. Do a proper comparision on filename/directory. // Use Equals() to match to avoid allocating strings. - if(input.FileName.Equals(existing.FileName, StringComparison.Ordinal) - && input.Directory.Equals(existing.Directory, StringComparison.Ordinal)) + if(existing.FilenameRef == filenameRef && existing.DirectoryRef == dirRef) { // Found the file; compare to our existing record so we can // detect if it has been modified. @@ -52,13 +58,30 @@ internal void Mark(ref FileSystemEntry input,long version) // Hash collision! Add on the end of the list. if(!found) { - fileStates.Add(newFileState(input, version)); + fileStates.Add(newFileState(input)); } } else { // Not seen before, create a 1-element list and add to the dictionary. - dict.Add(hashCode, new List() { newFileState(input, version) }); + dict.Add(hashCode, new List() { newFileState(input) }); + } + + FileState newFileState(FileSystemEntry input) + { + var fileState = new FileState(); + + fileState.LastSeenVersion = version; + fileState.CreateVersion = version; + fileState.ChangeVersion = version; + + fileState.DirectoryRef = dirRef; + fileState.FilenameRef = filenameRef; + + fileState.LastWriteTimeUtc = input.LastWriteTimeUtc; + fileState.Length = input.Length; + + return fileState; } } @@ -80,30 +103,7 @@ private void markExisting(FileState fs, FileSystemEntry input, long version) } } - private FileState newFileState(FileSystemEntry input, long version) - { - var fileState = new FileState(); - - fileState.LastSeenVersion = version; - fileState.CreateVersion = version; - fileState.ChangeVersion = version; - - // Here's where we're allocating the strings. Note we only do this when - // we first see a file, not on each subsequent scan for changes. - var fn = input.FileName; - var dir = input.Directory; - fileState.Directory = stringInternPool.Intern(ref dir); - fileState.FileName = stringInternPool.Intern(ref fn); - - //fileState.Directory = input.Directory.ToString(); - //fileState.Path = input.FileName.ToString(); - - fileState.LastWriteTimeUtc = input.LastWriteTimeUtc; - fileState.Length = input.Length; - - return fileState; - } public IEnumerable Read() { From 9dbd61d5d72796e8776faea86d5447586ebd2d50 Mon Sep 17 00:00:00 2001 From: Alan Singfield Date: Sat, 14 Mar 2020 17:33:26 +0000 Subject: [PATCH 20/26] Allow wraparound on hashbucket. --- ...ainedLookupTests.cs => HashBucketTests.cs} | 15 ++++- .../PathRedux/HashBucket.cs | 55 ++++++++++++------- .../PathRedux/PathStorage.cs | 10 ++-- 3 files changed, 52 insertions(+), 28 deletions(-) rename PathReduxTests/PathRedux/{ChainedLookupTests.cs => HashBucketTests.cs} (85%) diff --git a/PathReduxTests/PathRedux/ChainedLookupTests.cs b/PathReduxTests/PathRedux/HashBucketTests.cs similarity index 85% rename from PathReduxTests/PathRedux/ChainedLookupTests.cs rename to PathReduxTests/PathRedux/HashBucketTests.cs index 0dff570..19a906a 100644 --- a/PathReduxTests/PathRedux/ChainedLookupTests.cs +++ b/PathReduxTests/PathRedux/HashBucketTests.cs @@ -29,11 +29,11 @@ public void HashBucketStoreFlowpast() var m = new HashBucket(2, 2); m.Store(1, 123456).ShouldBe(true); - m.Store(1, 765432).ShouldBe(false); + m.Store(1, 765432).ShouldBe(true); var result = m.Retrieve(1); - result.ToArray().ShouldBe(new[] { 123456 }); + result.ToArray().ShouldBe(new[] { 123456, 765432 }); } [TestMethod] @@ -91,5 +91,16 @@ public void HashBucketOverlapLimited() m.Retrieve(0).ToArray().ShouldBe(new[] { 100, 200 }); m.Retrieve(1).ToArray().ShouldBe(new[] { 200 }); } + + [TestMethod] + public void HashBucketWraparound() + { + var m = new HashBucket(4, 2); + + m.Store(3, 100).ShouldBe(true); + m.Store(3, 200).ShouldBe(true); + + m.Retrieve(3).ToArray().ShouldBe(new[] { 100, 200 }); + } } } diff --git a/YellowCounter.FileSystemState/PathRedux/HashBucket.cs b/YellowCounter.FileSystemState/PathRedux/HashBucket.cs index 1706db2..e1bb2b1 100644 --- a/YellowCounter.FileSystemState/PathRedux/HashBucket.cs +++ b/YellowCounter.FileSystemState/PathRedux/HashBucket.cs @@ -1,4 +1,5 @@ using System; +using System.Buffers; using System.Collections; using System.Collections.Generic; using System.Collections.Specialized; @@ -15,38 +16,47 @@ public class HashBucket public HashBucket(int capacity, int maxChain) { - mem = new int[capacity]; + mem = new int[capacity + maxChain]; usage = new BitArray(capacity); + this.capacity = capacity; this.maxChain = maxChain; } - public int Capacity => mem.Length; + public int Capacity => this.capacity; + public int MaxChain => this.maxChain; public bool Store(int hash, int value) { int bucket = bucketFromHash(hash); var span = mem.Span; - int chainLen = 0; - // Look for an empty slot in our buffer - for(int i = bucket; i < capacity; i++) + for(int c = 0; c < maxChain; c++) { - if(!usage[i]) + int i = bucket + c; + int j = i % capacity; + + bool wrapAround = i != j; + + if(!usage[j]) { - span[i] = value; - usage[i] = true; + span[j] = value; + usage[j] = true; + + // If wrapping around we have two copies of the values, + // one at the normal position and one in the runoff area + // at the end of the memory buffer. + // This so we have a contiguous span to slice for the + // return. + if(wrapAround) + { + span[i] = value; + } return true; } - chainLen++; - - // Don't build up too long a chain of values - we'll build a new - // buffer instead. - if(chainLen >= maxChain) - return false; } return false; @@ -59,22 +69,27 @@ public bool Store(int hash, int value) /// private int bucketFromHash(int hash) => (int)unchecked((uint)hash % (uint)Capacity); + public ReadOnlySpan Retrieve(int hash) { - int key = bucketFromHash(hash); + int bucket = bucketFromHash(hash); var span = mem.Span; - int chainLen = 0; - for(int i = key; i < capacity && chainLen <= maxChain; i++) + int c = 0; + + while(c < maxChain) { - if(!usage[i]) + int j = (bucket + c) % capacity; + + if(!usage[j]) break; - chainLen++; + c++; } - return mem.Span.Slice(key, chainLen); + return span.Slice(bucket, c); } + } } diff --git a/YellowCounter.FileSystemState/PathRedux/PathStorage.cs b/YellowCounter.FileSystemState/PathRedux/PathStorage.cs index bc096a7..952e9d4 100644 --- a/YellowCounter.FileSystemState/PathRedux/PathStorage.cs +++ b/YellowCounter.FileSystemState/PathRedux/PathStorage.cs @@ -9,6 +9,7 @@ public class PathStorage : IPathStorage private HashedCharBuffer buf; private HashBucket buckets; private List entries; + private const int Root = -1; public PathStorage() { @@ -23,9 +24,6 @@ public PathStorage() buckets = new HashBucket(128, 16); entries = new List(); - - // Create a root entry so 0 is not a valid index - entries.Add(new Entry(-1, -1)); } public int Store(ReadOnlySpan arg) @@ -47,7 +45,7 @@ public int Store(ReadOnlySpan arg) // No more slash delimiters, so store a root entry (parent index 0). if(slashPos == -1) { - parentIdx = 0; + parentIdx = Root; textRef = buf.Store(arg); } else @@ -107,7 +105,7 @@ private IEnumerable chain(int idx) { int cursorIdx = idx; - while(cursorIdx != 0) + while(cursorIdx != Root) { var entry = entries[cursorIdx]; @@ -140,7 +138,7 @@ private bool match(int idx, ReadOnlySpan arg) // Loop round to our parent entry cursorIdx = entry.ParentIdx; - if(cursorIdx == 0) + if(cursorIdx == Root) { // If the target has no parent, and we've examined all of arg // then we've got a correct match From 00c535f50600186b2dadef81ee5bdb705dc685f3 Mon Sep 17 00:00:00 2001 From: Alan Singfield Date: Sun, 15 Mar 2020 09:57:56 +0000 Subject: [PATCH 21/26] Comment changes --- PathReduxTests/PathRedux/CharBufferTests.cs | 30 +++++++++++++++++++ .../PathRedux/ControllableHashFunction.cs | 12 ++++++++ .../FileSystemState.cs | 11 ------- .../PathRedux/CharBuffer.cs | 5 +++- .../PathRedux/HashedCharBuffer.cs | 2 ++ .../PathRedux/PathStorage.cs | 6 ++-- .../PathToFileStateHashtable.cs | 2 -- 7 files changed, 51 insertions(+), 17 deletions(-) diff --git a/PathReduxTests/PathRedux/CharBufferTests.cs b/PathReduxTests/PathRedux/CharBufferTests.cs index ca6f8e3..2a7d9d1 100644 --- a/PathReduxTests/PathRedux/CharBufferTests.cs +++ b/PathReduxTests/PathRedux/CharBufferTests.cs @@ -75,5 +75,35 @@ public void CharBufferEnumerate() results.ShouldBe(new[] { "Hello", "World" }); } + + [TestMethod] + public void CharBufferMaxCapacity() + { + // To store the text "Hello" without expanding, we need 5 chars for Hello, + // 1 char for the null terminator of Hello, and 1 char for the null terminator + // of the overall buffer. + var charBuffer = new CharBuffer(7); + + int idx1 = charBuffer.Store("Hello"); + idx1.ShouldNotBe(-1); + charBuffer.Capacity.ShouldBe(7); + + charBuffer.Retrieve(idx1).ToString().ShouldBe("Hello"); + + int c = 0; + foreach(var itm in charBuffer) + { + if(c == 0) + { + itm.Pos.ShouldBe(0); + itm.Span.ToString().ShouldBe("Hello"); + } + else + { + throw new Exception("Should only have one item"); + } + c++; + } + } } } diff --git a/PathReduxTests/PathRedux/ControllableHashFunction.cs b/PathReduxTests/PathRedux/ControllableHashFunction.cs index a11fa70..495a5a4 100644 --- a/PathReduxTests/PathRedux/ControllableHashFunction.cs +++ b/PathReduxTests/PathRedux/ControllableHashFunction.cs @@ -5,6 +5,18 @@ namespace PathReduxTests.PathRedux { + + /// + /// This hash function allows us to fix the hashcode to known values, based on + /// the number before the comma. + /// + /// The string must be in the format: + /// "99999,Something" + /// "99999,Another thing" + /// Both these will get a hashcode of 99999. + /// + /// Using this we can deliberately create hash collisions. + /// public class ControllableHashFunction : IHashFunction { public int HashSequence(ReadOnlySpan arg) diff --git a/YellowCounter.FileSystemState/FileSystemState.cs b/YellowCounter.FileSystemState/FileSystemState.cs index 334ef42..e613ccb 100644 --- a/YellowCounter.FileSystemState/FileSystemState.cs +++ b/YellowCounter.FileSystemState/FileSystemState.cs @@ -84,17 +84,6 @@ private void gatherChanges() public void Accept(ref FileSystemEntry fileSystemEntry) { _state.Mark(ref fileSystemEntry, _version); - - //string path = fileSystemEntry.FileName.ToString(); - - //FileState fs = new FileState(); - //fs.Directory = fileSystemEntry.Directory.ToString(); - //fs.Path = path; - //fs.LastWriteTimeUtc = fileSystemEntry.LastWriteTimeUtc; - //fs.Length = fileSystemEntry.Length; - - //_state.Mark(fs, _version); - } private void acceptChanges() diff --git a/YellowCounter.FileSystemState/PathRedux/CharBuffer.cs b/YellowCounter.FileSystemState/PathRedux/CharBuffer.cs index 25a95d7..06d23c3 100644 --- a/YellowCounter.FileSystemState/PathRedux/CharBuffer.cs +++ b/YellowCounter.FileSystemState/PathRedux/CharBuffer.cs @@ -33,6 +33,8 @@ public void Resize(int capacity) public int Store(ReadOnlySpan input) { + // We need space for our text, our null terminator, and an extra + // null terminator for the end of the buffer. if(input.Length + pos + 1 >= buffer.Length) return -1; @@ -158,7 +160,8 @@ public bool MoveNext() var tail = bufSpan.Slice(pos); - // Reached the end? End enumerating. + // Reached the end? End enumerating. The end of the buffer + // has a double null terminator \0\0. if(tail[0] == '\0') return false; diff --git a/YellowCounter.FileSystemState/PathRedux/HashedCharBuffer.cs b/YellowCounter.FileSystemState/PathRedux/HashedCharBuffer.cs index ccd073e..e30aff9 100644 --- a/YellowCounter.FileSystemState/PathRedux/HashedCharBuffer.cs +++ b/YellowCounter.FileSystemState/PathRedux/HashedCharBuffer.cs @@ -48,6 +48,8 @@ public int Store(ReadOnlySpan text) charBuffer.Resize(newSize); pos = charBuffer.Store(text); + if(pos == -1) + throw new Exception("Resizing charBuffer didn't give us enough space"); } if(!chainedLookup.Store(hash, pos)) diff --git a/YellowCounter.FileSystemState/PathRedux/PathStorage.cs b/YellowCounter.FileSystemState/PathRedux/PathStorage.cs index 952e9d4..3458643 100644 --- a/YellowCounter.FileSystemState/PathRedux/PathStorage.cs +++ b/YellowCounter.FileSystemState/PathRedux/PathStorage.cs @@ -9,7 +9,7 @@ public class PathStorage : IPathStorage private HashedCharBuffer buf; private HashBucket buckets; private List entries; - private const int Root = -1; + private const int Root = -1; // The root entry's ParentIdx is set to this. public PathStorage() { @@ -42,7 +42,7 @@ public int Store(ReadOnlySpan arg) int parentIdx; int textRef; - // No more slash delimiters, so store a root entry (parent index 0). + // No more slash delimiters, so store a root entry (parent index -1). if(slashPos == -1) { parentIdx = Root; @@ -68,7 +68,7 @@ public int Store(ReadOnlySpan arg) rebuildBuckets(); if(!buckets.Store(hash, result)) - throw new Exception("Run out..."); + throw new Exception($"Too many hash collisions in {nameof(PathStorage)}"); } return result; diff --git a/YellowCounter.FileSystemState/PathToFileStateHashtable.cs b/YellowCounter.FileSystemState/PathToFileStateHashtable.cs index 4cc5336..34df876 100644 --- a/YellowCounter.FileSystemState/PathToFileStateHashtable.cs +++ b/YellowCounter.FileSystemState/PathToFileStateHashtable.cs @@ -42,8 +42,6 @@ internal void Mark(ref FileSystemEntry input,long version) { // We've only matched on hashcode so far, so there could be false // matches in here. Do a proper comparision on filename/directory. - - // Use Equals() to match to avoid allocating strings. if(existing.FilenameRef == filenameRef && existing.DirectoryRef == dirRef) { // Found the file; compare to our existing record so we can From cb1cb36ff734945e12c913cef1f5a8be22328973 Mon Sep 17 00:00:00 2001 From: Alan Singfield Date: Mon, 6 Apr 2020 09:55:54 +0100 Subject: [PATCH 22/26] Working on rebuild of buffers --- PathReduxTests/PathRedux/PathStorageTests.cs | 11 ++++- PathReduxTests/Watcher/WatcherTests.cs | 45 +++++++++++++++++++ .../PathRedux/HashedCharBuffer.cs | 4 +- .../PathRedux/PathStorage.cs | 17 ++++--- .../PathRedux/PathStorageOptions.cs | 16 +++++++ 5 files changed, 83 insertions(+), 10 deletions(-) create mode 100644 PathReduxTests/Watcher/WatcherTests.cs create mode 100644 YellowCounter.FileSystemState/PathRedux/PathStorageOptions.cs diff --git a/PathReduxTests/PathRedux/PathStorageTests.cs b/PathReduxTests/PathRedux/PathStorageTests.cs index ae77e3c..ddb2518 100644 --- a/PathReduxTests/PathRedux/PathStorageTests.cs +++ b/PathReduxTests/PathRedux/PathStorageTests.cs @@ -13,7 +13,16 @@ public class PathStorageTests [TestMethod] public void PathStorage1() { - var ps = new PathStorage(); + // Trying to trigger it rebuilding the text -> character buffer + var ps = new PathStorage(new PathStorageOptions() + { + HashFunction = new DeterministicHashFunction(), + InitialCharCapacity = 4, + InitialHashCapacity = 2, + LinearSearchLimit = 128, + HashBucketMaxChain = 128, + HashBucketInitialCapacity = 2, + }); var results = new List(); diff --git a/PathReduxTests/Watcher/WatcherTests.cs b/PathReduxTests/Watcher/WatcherTests.cs new file mode 100644 index 0000000..31796d9 --- /dev/null +++ b/PathReduxTests/Watcher/WatcherTests.cs @@ -0,0 +1,45 @@ +using Microsoft.VisualStudio.TestTools.UnitTesting; +using System; +using System.Collections.Generic; +using System.IO; +using System.Text; +using YellowCounter.FileSystemState; +using Shouldly; + +namespace PathReduxTests.Watcher +{ + [TestClass] + public class WatcherTests + { + [TestMethod] + public void FileSystemWatcherNoChange() + { + var dir = GetRandomDirectory(); + + try + { + + File.WriteAllText(Path.Combine(dir, "text1.txt"), "Hello"); + File.WriteAllText(Path.Combine(dir, "blah.txt"), "Hello"); + + var watcher = new FileSystemState(dir, options: new EnumerationOptions { RecurseSubdirectories = true }); + watcher.LoadState(); + + var q = watcher.GetChanges(); + q.Count.ShouldBe(0); + + } + finally + { + Directory.Delete(dir, true); + } + } + + private string GetRandomDirectory() + { + var path = Path.Combine(Path.GetTempPath(), Path.GetRandomFileName()); + Directory.CreateDirectory(path); + return path; + } + } +} diff --git a/YellowCounter.FileSystemState/PathRedux/HashedCharBuffer.cs b/YellowCounter.FileSystemState/PathRedux/HashedCharBuffer.cs index e30aff9..ed13015 100644 --- a/YellowCounter.FileSystemState/PathRedux/HashedCharBuffer.cs +++ b/YellowCounter.FileSystemState/PathRedux/HashedCharBuffer.cs @@ -42,8 +42,8 @@ public int Store(ReadOnlySpan text) if(pos == -1) { int newSize = charBuffer.Capacity * 2; - if(newSize < text.Length + charBuffer.Capacity) - newSize = charBuffer.Capacity + text.Length; + if(newSize < text.Length + charBuffer.Capacity + 2) // Allow 2 for null terminators + newSize = charBuffer.Capacity + text.Length + 2; charBuffer.Resize(newSize); diff --git a/YellowCounter.FileSystemState/PathRedux/PathStorage.cs b/YellowCounter.FileSystemState/PathRedux/PathStorage.cs index 3458643..1eb09ae 100644 --- a/YellowCounter.FileSystemState/PathRedux/PathStorage.cs +++ b/YellowCounter.FileSystemState/PathRedux/PathStorage.cs @@ -1,5 +1,6 @@ using System; using System.Collections.Generic; +using System.Linq; using System.Text; namespace YellowCounter.FileSystemState.PathRedux @@ -11,17 +12,19 @@ public class PathStorage : IPathStorage private List entries; private const int Root = -1; // The root entry's ParentIdx is set to this. - public PathStorage() + public PathStorage(PathStorageOptions options) { buf = new HashedCharBuffer(new HashedCharBufferOptions() { - HashFunction = new HashFunction(), - InitialCharCapacity = 1024, - InitialHashCapacity = 256, - LinearSearchLimit = 128 + HashFunction = options.HashFunction, + InitialCharCapacity = options.InitialCharCapacity, + InitialHashCapacity = options.InitialHashCapacity, + LinearSearchLimit = options.LinearSearchLimit }); - buckets = new HashBucket(128, 16); + buckets = new HashBucket( + options.HashBucketInitialCapacity, + options.HashBucketMaxChain); entries = new List(); } @@ -82,7 +85,7 @@ private void rebuildBuckets() { var h = new HashCode(); - foreach(var textRef in chain(idx)) + foreach(var textRef in chain(idx).Reverse()) { var text = buf.Retrieve(textRef); h.Add(text.GetHashOfContents()); diff --git a/YellowCounter.FileSystemState/PathRedux/PathStorageOptions.cs b/YellowCounter.FileSystemState/PathRedux/PathStorageOptions.cs new file mode 100644 index 0000000..435089e --- /dev/null +++ b/YellowCounter.FileSystemState/PathRedux/PathStorageOptions.cs @@ -0,0 +1,16 @@ +using System; +using System.Collections.Generic; +using System.Text; + +namespace YellowCounter.FileSystemState.PathRedux +{ + public class PathStorageOptions + { + public int HashBucketInitialCapacity { get; set; } + public int HashBucketMaxChain { get; set; } + public IHashFunction HashFunction { get; set; } + public int InitialCharCapacity { get; set; } + public int InitialHashCapacity { get; set; } + public int LinearSearchLimit { get; set; } + } +} From 67a5e6017a6bc8d521f490038ec723bc7ff5ef6f Mon Sep 17 00:00:00 2001 From: Alan Singfield Date: Wed, 8 Apr 2020 09:39:12 +0100 Subject: [PATCH 23/26] Working on hashing ReadOnlySequence --- PathReduxTests/PathRedux/PathStorageTests.cs | 26 +++++++++++++++++++ .../UnitTests.cs | 5 ++-- YellowCounter.FileSystemState/FileState.cs | 10 +++++++ .../FileSystemState.cs | 19 ++++++++++---- .../PathRedux/CharBuffer.cs | 4 ++- .../PathRedux/HashedCharBuffer.cs | 7 +++-- .../PathRedux/IHashFunction.cs | 1 + .../PathRedux/PathStorage.cs | 25 ++++++++++++++++-- .../PathToFileStateHashtable.cs | 18 ++++++++++++- 9 files changed, 100 insertions(+), 15 deletions(-) diff --git a/PathReduxTests/PathRedux/PathStorageTests.cs b/PathReduxTests/PathRedux/PathStorageTests.cs index ddb2518..a1d5f6f 100644 --- a/PathReduxTests/PathRedux/PathStorageTests.cs +++ b/PathReduxTests/PathRedux/PathStorageTests.cs @@ -30,11 +30,37 @@ public void PathStorage1() results.Add(ps.Store(@"C:\abc\xyz")); results.Add(ps.Store(@"C:\abc\cde")); results.Add(ps.Store(@"C:\mmm\cde")); + results.Add(ps.Store(@"C:\abc")); ps.CreateString(results[0]).ShouldBe(@"C:\abc"); ps.CreateString(results[1]).ShouldBe(@"C:\abc\xyz"); ps.CreateString(results[2]).ShouldBe(@"C:\abc\cde"); ps.CreateString(results[3]).ShouldBe(@"C:\mmm\cde"); + results[4].ShouldBe(results[0]); + } + + [TestMethod] + public void PathStorage2() + { + // Trying to trigger it rebuilding the text -> character buffer + var ps = new PathStorage(new PathStorageOptions() + { + HashFunction = new DeterministicHashFunction(), + InitialCharCapacity = 4, + InitialHashCapacity = 2, + LinearSearchLimit = 128, + HashBucketMaxChain = 128, + HashBucketInitialCapacity = 2, + }); + + var results = new List(); + + results.Add(ps.Store(@"C:\abc")); + results.Add(ps.Store(@"C:\abc\xyz")); + results.Add(ps.Store(@"C:\abc")); + + ps.CreateString(results[0]).ShouldBe(@"C:\abc"); + results[2].ShouldBe(results[0]); } } } diff --git a/YellowCounter.FileSystemState.Tests/UnitTests.cs b/YellowCounter.FileSystemState.Tests/UnitTests.cs index cfe05db..e0d216f 100644 --- a/YellowCounter.FileSystemState.Tests/UnitTests.cs +++ b/YellowCounter.FileSystemState.Tests/UnitTests.cs @@ -301,11 +301,12 @@ public static void FileSystemWatcher_Recursive() [Fact] public static void FileSystemWatcher_BigDir() { - string currentDir = @"C:\Users\SpanWork\Documents"; - + //string currentDir = @"C:\Users\SpanWork\Documents"; + string currentDir = @"C:\Users\SpanWork\Documents\Olleco\Scrapbook\DBAzure"; FileSystemState watcher = new FileSystemState(currentDir, options: new EnumerationOptions { RecurseSubdirectories = true }); watcher.LoadState(); var q = watcher.GetChanges(); + Assert.Empty(q); } } diff --git a/YellowCounter.FileSystemState/FileState.cs b/YellowCounter.FileSystemState/FileState.cs index 082f7b3..5df7944 100644 --- a/YellowCounter.FileSystemState/FileState.cs +++ b/YellowCounter.FileSystemState/FileState.cs @@ -16,6 +16,7 @@ internal class FileState [NonSerialized] public long ChangeVersion; + public FileStateFlags Flags; public int DirectoryRef; public int FilenameRef; public DateTimeOffset LastWriteTimeUtc; @@ -23,4 +24,13 @@ internal class FileState internal FileState Clone() => (FileState)this.MemberwiseClone(); } + + [Flags] + public enum FileStateFlags : byte + { + None = 0, + Seen = 1, + Created = 2, + Changed = 4, + } } diff --git a/YellowCounter.FileSystemState/FileSystemState.cs b/YellowCounter.FileSystemState/FileSystemState.cs index e613ccb..a7f1d01 100644 --- a/YellowCounter.FileSystemState/FileSystemState.cs +++ b/YellowCounter.FileSystemState/FileSystemState.cs @@ -19,12 +19,21 @@ public FileSystemState(string rootDir, string filter = "*", EnumerationOptions o this.RootDir = rootDir ?? throw new ArgumentNullException(nameof(rootDir)); this.Filter = filter ?? throw new ArgumentNullException(nameof(filter)); - if (!Directory.Exists(rootDir)) + if(!Directory.Exists(rootDir)) throw new DirectoryNotFoundException(); EnumerationOptions = options ?? new EnumerationOptions(); - this.pathStorage = new PathStorage(); + this.pathStorage = new PathStorage(new PathStorageOptions() + { + HashFunction = new HashFunction(), + InitialCharCapacity = 1024, + InitialHashCapacity = 256, + LinearSearchLimit = 128, + HashBucketMaxChain = 128, + HashBucketInitialCapacity = 64 + }); + _state = new PathToFileStateHashtable(this.pathStorage); } @@ -172,11 +181,11 @@ int oldFilenameRef foreach(var x in _state.Read()) { - if(x.LastSeenVersion == _version) + if(x.Flags.HasFlag(FileStateFlags.Seen)) { - if(x.CreateVersion == _version) + if(x.Flags.HasFlag(FileStateFlags.Created)) creates.Add(x); - else + else if(x.Flags.HasFlag(FileStateFlags.Changed)) changes.Add(x); } else diff --git a/YellowCounter.FileSystemState/PathRedux/CharBuffer.cs b/YellowCounter.FileSystemState/PathRedux/CharBuffer.cs index 06d23c3..f2c87c9 100644 --- a/YellowCounter.FileSystemState/PathRedux/CharBuffer.cs +++ b/YellowCounter.FileSystemState/PathRedux/CharBuffer.cs @@ -102,6 +102,8 @@ public string CreateString(IEnumerable indices) //var text = tail.Slice(0, len); } + // String in REVERSE ORDER of indices - this is because we start at + // the end and then point back to the parent, grandparent etc. return String.Create(totalLen, (buffer, posLens, totalLen), (chars, state) => { @@ -181,7 +183,7 @@ public ref struct Item } - public ReadOnlySequence Retrieve(ReadOnlySpan indices) + public ReadOnlySequence Retrieve(IEnumerable indices) { Segment root = null; Segment current = null; diff --git a/YellowCounter.FileSystemState/PathRedux/HashedCharBuffer.cs b/YellowCounter.FileSystemState/PathRedux/HashedCharBuffer.cs index ed13015..53a96d9 100644 --- a/YellowCounter.FileSystemState/PathRedux/HashedCharBuffer.cs +++ b/YellowCounter.FileSystemState/PathRedux/HashedCharBuffer.cs @@ -1,4 +1,5 @@ using System; +using System.Buffers; using System.Collections.Generic; using System.Text; @@ -61,10 +62,8 @@ public int Store(ReadOnlySpan text) return pos; } - public ReadOnlySpan Retrieve(int pos) - { - return charBuffer.Retrieve(pos); - } + public ReadOnlySpan Retrieve(int pos) => charBuffer.Retrieve(pos); + public ReadOnlySequence Retrieve(IEnumerable indices) => charBuffer.Retrieve(indices); public string CreateString(IEnumerable indices) => charBuffer.CreateString(indices); diff --git a/YellowCounter.FileSystemState/PathRedux/IHashFunction.cs b/YellowCounter.FileSystemState/PathRedux/IHashFunction.cs index e088c9f..d98f8fc 100644 --- a/YellowCounter.FileSystemState/PathRedux/IHashFunction.cs +++ b/YellowCounter.FileSystemState/PathRedux/IHashFunction.cs @@ -1,4 +1,5 @@ using System; +using System.Collections.Generic; namespace YellowCounter.FileSystemState.PathRedux { diff --git a/YellowCounter.FileSystemState/PathRedux/PathStorage.cs b/YellowCounter.FileSystemState/PathRedux/PathStorage.cs index 1eb09ae..30b09d7 100644 --- a/YellowCounter.FileSystemState/PathRedux/PathStorage.cs +++ b/YellowCounter.FileSystemState/PathRedux/PathStorage.cs @@ -1,5 +1,6 @@ using System; using System.Collections.Generic; +using System.Diagnostics; using System.Linq; using System.Text; @@ -7,6 +8,7 @@ namespace YellowCounter.FileSystemState.PathRedux { public class PathStorage : IPathStorage { + private IHashFunction hashFunction; private HashedCharBuffer buf; private HashBucket buckets; private List entries; @@ -14,6 +16,8 @@ public class PathStorage : IPathStorage public PathStorage(PathStorageOptions options) { + this.hashFunction = options.HashFunction; + buf = new HashedCharBuffer(new HashedCharBufferOptions() { HashFunction = options.HashFunction, @@ -31,12 +35,15 @@ public PathStorage(PathStorageOptions options) public int Store(ReadOnlySpan arg) { - var hash = arg.GetHashOfContents(); + var hash = hashFunction.HashSequence(arg); foreach(var idx in buckets.Retrieve(hash)) { if(match(idx, arg)) + { + Debug.WriteLine($"Found match for {arg.ToString()} ({hash})= {idx}"); return idx; + } } // Find a slash or backslash. @@ -67,13 +74,20 @@ public int Store(ReadOnlySpan arg) if(!buckets.Store(hash, result)) { + + Debug.WriteLine($"Start rebuildBuckets"); + // Rebuild buckets from List twice as big rebuildBuckets(); + Debug.WriteLine($"End rebuildBuckets"); + if(!buckets.Store(hash, result)) throw new Exception($"Too many hash collisions in {nameof(PathStorage)}"); } + Debug.WriteLine($"Created {arg.ToString()} ({hash})= {result}"); + return result; } @@ -88,7 +102,7 @@ private void rebuildBuckets() foreach(var textRef in chain(idx).Reverse()) { var text = buf.Retrieve(textRef); - h.Add(text.GetHashOfContents()); + h.Add(hashFunction.HashSequence(text)); } int hash = h.ToHashCode(); @@ -99,6 +113,13 @@ private void rebuildBuckets() this.buckets = newBuckets; } + public int HashEntry(int idx) + { + var text = buf.Retrieve(chain(idx)); + + return 0; + } + public string CreateString(int idx) { return buf.CreateString(chain(idx)); diff --git a/YellowCounter.FileSystemState/PathToFileStateHashtable.cs b/YellowCounter.FileSystemState/PathToFileStateHashtable.cs index 34df876..c2c504e 100644 --- a/YellowCounter.FileSystemState/PathToFileStateHashtable.cs +++ b/YellowCounter.FileSystemState/PathToFileStateHashtable.cs @@ -4,6 +4,7 @@ using System.Linq; using System.IO.Enumeration; using YellowCounter.FileSystemState.PathRedux; +using System.Diagnostics; namespace YellowCounter.FileSystemState { @@ -69,6 +70,8 @@ FileState newFileState(FileSystemEntry input) { var fileState = new FileState(); + fileState.Flags = FileStateFlags.Created | FileStateFlags.Seen; + fileState.LastSeenVersion = version; fileState.CreateVersion = version; fileState.ChangeVersion = version; @@ -85,6 +88,9 @@ FileState newFileState(FileSystemEntry input) private void markExisting(FileState fs, FileSystemEntry input, long version) { + // Mark that we've seen the file. + fs.Flags |= FileStateFlags.Seen; + // Mark that we've seen the file. fs.LastSeenVersion = version; @@ -92,6 +98,8 @@ private void markExisting(FileState fs, FileSystemEntry input, long version) if(fs.LastWriteTimeUtc != input.LastWriteTimeUtc || fs.Length != input.Length) { + fs.Flags |= FileStateFlags.Changed; + // Mark that this version was a change fs.ChangeVersion = version; @@ -120,7 +128,9 @@ public void Sweep(long version) { // Remove any item in the list which we didn't see on the last mark // phase (every item that is seen gets the LastSeenVersion updated) - list.RemoveAll(x => x.LastSeenVersion != version); + //list.RemoveAll(x => x.LastSeenVersion != version); + + list.RemoveAll(x => !x.Flags.HasFlag(FileStateFlags.Seen)); // In the normal case where there are no hash collisions, this will // remove the one and only item from the list. We can then remove @@ -130,6 +140,12 @@ public void Sweep(long version) { toRemove.Add(hash); } + + // Clear the flags on all remaining items. + foreach(var x in list) + { + x.Flags = FileStateFlags.None; + } } // We can't remove the items while iterating so remove here instead. From 1786b624184bb2a219d9fdb5f93bcd7f671ca935 Mon Sep 17 00:00:00 2001 From: Alan Singfield Date: Sat, 11 Apr 2020 11:24:09 +0100 Subject: [PATCH 24/26] Switched over from HashFunction to HashCode --- .../HashCodes/ControllableHashCode.cs | 44 +++++++++++++++ .../HashCodes/DeterministicHashCode.cs | 55 +++++++++++++++++++ .../PathRedux/ControllableHashFunction.cs | 36 ------------ .../PathRedux/DeterministicHashFunction.cs | 32 ----------- .../PathRedux/HashedCharBufferTests.cs | 15 ++--- PathReduxTests/PathRedux/PathStorageTests.cs | 5 +- .../UnitTests.cs | 4 +- .../FileSystemState.cs | 3 +- .../HashCodes/HashCodeExtensions.cs | 33 +++++++++++ .../HashCodes/IHashCode.cs | 12 ++++ .../HashCodes/StandardHashCode.cs | 17 ++++++ .../PathRedux/HashFunction.cs | 11 ---- .../PathRedux/HashedCharBuffer.cs | 8 +-- .../PathRedux/HashedCharBufferOptions.cs | 3 +- .../PathRedux/IHashFunction.cs | 10 ---- .../PathRedux/PathStorage.cs | 27 +++++---- .../PathRedux/PathStorageOptions.cs | 3 +- 17 files changed, 200 insertions(+), 118 deletions(-) create mode 100644 PathReduxTests/HashCodes/ControllableHashCode.cs create mode 100644 PathReduxTests/HashCodes/DeterministicHashCode.cs delete mode 100644 PathReduxTests/PathRedux/ControllableHashFunction.cs delete mode 100644 PathReduxTests/PathRedux/DeterministicHashFunction.cs create mode 100644 YellowCounter.FileSystemState/HashCodes/HashCodeExtensions.cs create mode 100644 YellowCounter.FileSystemState/HashCodes/IHashCode.cs create mode 100644 YellowCounter.FileSystemState/HashCodes/StandardHashCode.cs delete mode 100644 YellowCounter.FileSystemState/PathRedux/HashFunction.cs delete mode 100644 YellowCounter.FileSystemState/PathRedux/IHashFunction.cs diff --git a/PathReduxTests/HashCodes/ControllableHashCode.cs b/PathReduxTests/HashCodes/ControllableHashCode.cs new file mode 100644 index 0000000..13e80d5 --- /dev/null +++ b/PathReduxTests/HashCodes/ControllableHashCode.cs @@ -0,0 +1,44 @@ +using System; +using System.Collections.Generic; +using System.Text; +using YellowCounter.FileSystemState.HashCodes; + +namespace PathReduxTests.HashCodes +{ + public class ControllableHashCode : IHashCode + { + private StringBuilder stringBuilder = new StringBuilder(); + private bool dead = false; + + public void Add(char value) + { + stringBuilder.Append(value); + } + + public int ToHashCode() + { + deadCheck(); + + string arg = stringBuilder.ToString(); + + // Use comma as delimiter between desired hash number and remaining text. + int commaPos = arg.IndexOf(','); + + if(commaPos == -1) + throw new Exception($"{nameof(ControllableHashCode)} requires , in each string"); + + if(int.TryParse(arg.Substring(0, commaPos), out int result)) + return result; + + throw new Exception("Text before , must be an integer"); + } + + private void deadCheck() + { + if(dead) + throw new Exception("Cannot call ToHashCode() twice"); + + dead = true; + } + } +} diff --git a/PathReduxTests/HashCodes/DeterministicHashCode.cs b/PathReduxTests/HashCodes/DeterministicHashCode.cs new file mode 100644 index 0000000..43cf462 --- /dev/null +++ b/PathReduxTests/HashCodes/DeterministicHashCode.cs @@ -0,0 +1,55 @@ +using System; +using System.Collections.Generic; +using System.Text; +using YellowCounter.FileSystemState.HashCodes; + +namespace PathReduxTests.HashCodes +{ + // Want a deterministic hash function so our tests are repeatable. + // https://andrewlock.net/why-is-string-gethashcode-different-each-time-i-run-my-program-in-net-core/ + + public class DeterministicHashCode : IHashCode + { + private bool dead = false; + private bool odd = false; + private int hash1 = 352654597; //(5381 << 16) + 5381; + private int hash2 = 352654597; + + public void Add(char value) + { + unchecked + { + if(!odd) + { + hash1 = ((hash1 << 5) + hash1) ^ value; + + } + else + { + hash2 = ((hash2 << 5) + hash2) ^ value; + + } + } + + odd = !odd; + } + + public int ToHashCode() + { + deadCheck(); + + unchecked + { + return hash1 + (hash2 * 1566083941); + } + } + + private void deadCheck() + { + if(dead) + throw new Exception("Cannot call ToHashCode() twice"); + + dead = true; + } + } +} diff --git a/PathReduxTests/PathRedux/ControllableHashFunction.cs b/PathReduxTests/PathRedux/ControllableHashFunction.cs deleted file mode 100644 index 495a5a4..0000000 --- a/PathReduxTests/PathRedux/ControllableHashFunction.cs +++ /dev/null @@ -1,36 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Text; -using YellowCounter.FileSystemState.PathRedux; - -namespace PathReduxTests.PathRedux -{ - - /// - /// This hash function allows us to fix the hashcode to known values, based on - /// the number before the comma. - /// - /// The string must be in the format: - /// "99999,Something" - /// "99999,Another thing" - /// Both these will get a hashcode of 99999. - /// - /// Using this we can deliberately create hash collisions. - /// - public class ControllableHashFunction : IHashFunction - { - public int HashSequence(ReadOnlySpan arg) - { - // Use comma as delimiter between desired hash number and remaining text. - int commaPos = arg.IndexOf(','); - - if(commaPos == -1) - throw new Exception($"{nameof(ControllableHashFunction)} requires , in each string"); - - if(int.TryParse(arg.Slice(0, commaPos), out int result)) - return result; - - throw new Exception("Text before , must be an integer"); - } - } -} diff --git a/PathReduxTests/PathRedux/DeterministicHashFunction.cs b/PathReduxTests/PathRedux/DeterministicHashFunction.cs deleted file mode 100644 index 6c1bc22..0000000 --- a/PathReduxTests/PathRedux/DeterministicHashFunction.cs +++ /dev/null @@ -1,32 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Diagnostics.CodeAnalysis; -using System.Text; -using YellowCounter.FileSystemState.PathRedux; - -namespace PathReduxTests.PathRedux -{ - public class DeterministicHashFunction : IHashFunction - { - // Want a deterministic hash function so our tests are repeatable. - // https://andrewlock.net/why-is-string-gethashcode-different-each-time-i-run-my-program-in-net-core/ - public int HashSequence(ReadOnlySpan str) - { - unchecked - { - int hash1 = (5381 << 16) + 5381; - int hash2 = hash1; - - for(int i = 0; i < str.Length; i += 2) - { - hash1 = ((hash1 << 5) + hash1) ^ str[i]; - if(i == str.Length - 1) - break; - hash2 = ((hash2 << 5) + hash2) ^ str[i + 1]; - } - - return hash1 + (hash2 * 1566083941); - } - } - } -} diff --git a/PathReduxTests/PathRedux/HashedCharBufferTests.cs b/PathReduxTests/PathRedux/HashedCharBufferTests.cs index 0a56f59..d435c3b 100644 --- a/PathReduxTests/PathRedux/HashedCharBufferTests.cs +++ b/PathReduxTests/PathRedux/HashedCharBufferTests.cs @@ -1,5 +1,6 @@ using Microsoft.VisualStudio.TestTools.UnitTesting; using NSubstitute; +using PathReduxTests.HashCodes; using Shouldly; using System; using System.Collections.Generic; @@ -17,7 +18,7 @@ public void HashedCharBufferAddAndRetrieveNoClash() { var buf = new HashedCharBuffer(new HashedCharBufferOptions() { - HashFunction = new DeterministicHashFunction(), + NewHashCode = () => new DeterministicHashCode(), InitialCharCapacity = 20, InitialHashCapacity = 16, LinearSearchLimit = 3 @@ -38,7 +39,7 @@ public void HashedCharBufferAddAndRetrieveClash() { var buf = new HashedCharBuffer(new HashedCharBufferOptions() { - HashFunction = new ControllableHashFunction(), + NewHashCode = () => new ControllableHashCode(), InitialCharCapacity = 20, InitialHashCapacity = 16, LinearSearchLimit = 3 @@ -47,9 +48,9 @@ public void HashedCharBufferAddAndRetrieveClash() buf.Store("1,Hello"); buf.Store("1,World"); - // Confirm that both strings the same hashcode. - buf.HashFunction.HashSequence("1,Hello").ShouldBe(1); - buf.HashFunction.HashSequence("1,World").ShouldBe(1); + //// Confirm that both strings the same hashcode. + //buf.HashFunction.HashSequence("1,Hello").ShouldBe(1); + //buf.HashFunction.HashSequence("1,World").ShouldBe(1); buf.Find("1,Hello").ShouldBe(0); buf.Find("1,World").ShouldBe(8); @@ -64,7 +65,7 @@ public void HashedCharBufferHashCollision() // Allow only 1 item in the linear search phase var buf = new HashedCharBuffer(new HashedCharBufferOptions() { - HashFunction = new ControllableHashFunction(), + NewHashCode = () => new ControllableHashCode(), InitialCharCapacity = 20, InitialHashCapacity = 16, LinearSearchLimit = 1 @@ -85,7 +86,7 @@ public void HashedCharBufferAddAndRetrieveClashRunOutX() // Allow 1 items in the linear search phase var buf = new HashedCharBuffer(new HashedCharBufferOptions() { - HashFunction = new ControllableHashFunction(), + NewHashCode = () => new ControllableHashCode(), InitialCharCapacity = 20, InitialHashCapacity = 8, LinearSearchLimit = 1 diff --git a/PathReduxTests/PathRedux/PathStorageTests.cs b/PathReduxTests/PathRedux/PathStorageTests.cs index a1d5f6f..f8496a4 100644 --- a/PathReduxTests/PathRedux/PathStorageTests.cs +++ b/PathReduxTests/PathRedux/PathStorageTests.cs @@ -4,6 +4,7 @@ using System.Text; using YellowCounter.FileSystemState.PathRedux; using Shouldly; +using PathReduxTests.HashCodes; namespace PathReduxTests.PathRedux { @@ -16,7 +17,7 @@ public void PathStorage1() // Trying to trigger it rebuilding the text -> character buffer var ps = new PathStorage(new PathStorageOptions() { - HashFunction = new DeterministicHashFunction(), + NewHashCode = () => new DeterministicHashCode(), InitialCharCapacity = 4, InitialHashCapacity = 2, LinearSearchLimit = 128, @@ -45,7 +46,7 @@ public void PathStorage2() // Trying to trigger it rebuilding the text -> character buffer var ps = new PathStorage(new PathStorageOptions() { - HashFunction = new DeterministicHashFunction(), + NewHashCode = () => new DeterministicHashCode(), InitialCharCapacity = 4, InitialHashCapacity = 2, LinearSearchLimit = 128, diff --git a/YellowCounter.FileSystemState.Tests/UnitTests.cs b/YellowCounter.FileSystemState.Tests/UnitTests.cs index e0d216f..7044f41 100644 --- a/YellowCounter.FileSystemState.Tests/UnitTests.cs +++ b/YellowCounter.FileSystemState.Tests/UnitTests.cs @@ -301,8 +301,8 @@ public static void FileSystemWatcher_Recursive() [Fact] public static void FileSystemWatcher_BigDir() { - //string currentDir = @"C:\Users\SpanWork\Documents"; - string currentDir = @"C:\Users\SpanWork\Documents\Olleco\Scrapbook\DBAzure"; + string currentDir = @"C:\Users\SpanWork\Documents"; + //string currentDir = @"C:\Users\SpanWork\Documents\Olleco\Scrapbook\DBAzure"; FileSystemState watcher = new FileSystemState(currentDir, options: new EnumerationOptions { RecurseSubdirectories = true }); watcher.LoadState(); diff --git a/YellowCounter.FileSystemState/FileSystemState.cs b/YellowCounter.FileSystemState/FileSystemState.cs index a7f1d01..b5bfd12 100644 --- a/YellowCounter.FileSystemState/FileSystemState.cs +++ b/YellowCounter.FileSystemState/FileSystemState.cs @@ -6,6 +6,7 @@ using System.Runtime.Serialization.Formatters.Binary; using System.Linq; using YellowCounter.FileSystemState.PathRedux; +using YellowCounter.FileSystemState.HashCodes; namespace YellowCounter.FileSystemState { @@ -26,7 +27,7 @@ public FileSystemState(string rootDir, string filter = "*", EnumerationOptions o this.pathStorage = new PathStorage(new PathStorageOptions() { - HashFunction = new HashFunction(), + NewHashCode = () => new StandardHashCode(), InitialCharCapacity = 1024, InitialHashCapacity = 256, LinearSearchLimit = 128, diff --git a/YellowCounter.FileSystemState/HashCodes/HashCodeExtensions.cs b/YellowCounter.FileSystemState/HashCodes/HashCodeExtensions.cs new file mode 100644 index 0000000..fc24afa --- /dev/null +++ b/YellowCounter.FileSystemState/HashCodes/HashCodeExtensions.cs @@ -0,0 +1,33 @@ +using System; +using System.Buffers; +using System.Collections.Generic; +using System.Text; + +namespace YellowCounter.FileSystemState.HashCodes +{ + public static class HashCodeExtensions + { + public static int HashSequence(this IHashCode hashCode, ReadOnlySpan span) + { + foreach(var elem in span) + { + hashCode.Add(elem); + } + + return hashCode.ToHashCode(); + } + + public static int HashSequence(this IHashCode hashCode, ReadOnlySequence seq) + { + foreach(var mem in seq) + { + foreach(var elem in mem.Span) + { + hashCode.Add(elem); + } + } + + return hashCode.ToHashCode(); + } + } +} diff --git a/YellowCounter.FileSystemState/HashCodes/IHashCode.cs b/YellowCounter.FileSystemState/HashCodes/IHashCode.cs new file mode 100644 index 0000000..00104dc --- /dev/null +++ b/YellowCounter.FileSystemState/HashCodes/IHashCode.cs @@ -0,0 +1,12 @@ +using System; +using System.Collections.Generic; +using System.Text; + +namespace YellowCounter.FileSystemState.HashCodes +{ + public interface IHashCode + { + void Add(char value); + int ToHashCode(); + } +} diff --git a/YellowCounter.FileSystemState/HashCodes/StandardHashCode.cs b/YellowCounter.FileSystemState/HashCodes/StandardHashCode.cs new file mode 100644 index 0000000..70c8672 --- /dev/null +++ b/YellowCounter.FileSystemState/HashCodes/StandardHashCode.cs @@ -0,0 +1,17 @@ +using System; +using System.Collections.Generic; +using System.Text; + +namespace YellowCounter.FileSystemState.HashCodes +{ + public struct StandardHashCode : IHashCode + { + private HashCode hashCode; + public void Add(char value) + { + hashCode.Add(value); + } + + public int ToHashCode() => hashCode.ToHashCode(); + } +} diff --git a/YellowCounter.FileSystemState/PathRedux/HashFunction.cs b/YellowCounter.FileSystemState/PathRedux/HashFunction.cs deleted file mode 100644 index f858fd9..0000000 --- a/YellowCounter.FileSystemState/PathRedux/HashFunction.cs +++ /dev/null @@ -1,11 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Text; - -namespace YellowCounter.FileSystemState.PathRedux -{ - public class HashFunction : IHashFunction - { - public int HashSequence(ReadOnlySpan arg) => arg.GetHashOfContents(); - } -} diff --git a/YellowCounter.FileSystemState/PathRedux/HashedCharBuffer.cs b/YellowCounter.FileSystemState/PathRedux/HashedCharBuffer.cs index 53a96d9..c057930 100644 --- a/YellowCounter.FileSystemState/PathRedux/HashedCharBuffer.cs +++ b/YellowCounter.FileSystemState/PathRedux/HashedCharBuffer.cs @@ -2,6 +2,7 @@ using System.Buffers; using System.Collections.Generic; using System.Text; +using YellowCounter.FileSystemState.HashCodes; namespace YellowCounter.FileSystemState.PathRedux { @@ -10,21 +11,20 @@ public class HashedCharBuffer private readonly int linearSearchLimit; private CharBuffer charBuffer; private HashBucket chainedLookup; - private IHashFunction hashFunction; + private readonly Func newHashCode; public HashedCharBuffer(HashedCharBufferOptions options) { charBuffer = new CharBuffer(options.InitialCharCapacity); chainedLookup = new HashBucket(options.InitialHashCapacity, options.LinearSearchLimit); - this.hashFunction = options.HashFunction; + this.newHashCode = options.NewHashCode; this.linearSearchLimit = options.LinearSearchLimit; } public int LinearSearchLimit => this.linearSearchLimit; public int CharCapacity => charBuffer.Capacity; public int HashCapacity => chainedLookup.Capacity; - public IHashFunction HashFunction => hashFunction; /// /// Returns index position @@ -79,7 +79,7 @@ private int findByHash(int hash, ReadOnlySpan text) return charBuffer.Match(text, indices); } - private int hashSequence(ReadOnlySpan text) => hashFunction.HashSequence(text); + private int hashSequence(ReadOnlySpan text) => newHashCode().HashSequence(text); private void rebuildLookup() { diff --git a/YellowCounter.FileSystemState/PathRedux/HashedCharBufferOptions.cs b/YellowCounter.FileSystemState/PathRedux/HashedCharBufferOptions.cs index 4c4bed2..0823c7c 100644 --- a/YellowCounter.FileSystemState/PathRedux/HashedCharBufferOptions.cs +++ b/YellowCounter.FileSystemState/PathRedux/HashedCharBufferOptions.cs @@ -1,12 +1,13 @@ using System; using System.Collections.Generic; using System.Text; +using YellowCounter.FileSystemState.HashCodes; namespace YellowCounter.FileSystemState.PathRedux { public class HashedCharBufferOptions { - public IHashFunction HashFunction { get; set; } + public Func NewHashCode { get; set; } public int InitialCharCapacity { get; set; } public int InitialHashCapacity { get; set; } public int LinearSearchLimit { get; set; } diff --git a/YellowCounter.FileSystemState/PathRedux/IHashFunction.cs b/YellowCounter.FileSystemState/PathRedux/IHashFunction.cs deleted file mode 100644 index d98f8fc..0000000 --- a/YellowCounter.FileSystemState/PathRedux/IHashFunction.cs +++ /dev/null @@ -1,10 +0,0 @@ -using System; -using System.Collections.Generic; - -namespace YellowCounter.FileSystemState.PathRedux -{ - public interface IHashFunction - { - int HashSequence(ReadOnlySpan arg); - } -} \ No newline at end of file diff --git a/YellowCounter.FileSystemState/PathRedux/PathStorage.cs b/YellowCounter.FileSystemState/PathRedux/PathStorage.cs index 30b09d7..ffe6af0 100644 --- a/YellowCounter.FileSystemState/PathRedux/PathStorage.cs +++ b/YellowCounter.FileSystemState/PathRedux/PathStorage.cs @@ -3,24 +3,26 @@ using System.Diagnostics; using System.Linq; using System.Text; +using YellowCounter.FileSystemState.HashCodes; namespace YellowCounter.FileSystemState.PathRedux { public class PathStorage : IPathStorage { - private IHashFunction hashFunction; private HashedCharBuffer buf; private HashBucket buckets; private List entries; private const int Root = -1; // The root entry's ParentIdx is set to this. + private Func newHashCode; + public PathStorage(PathStorageOptions options) { - this.hashFunction = options.HashFunction; + this.newHashCode = options.NewHashCode; buf = new HashedCharBuffer(new HashedCharBufferOptions() { - HashFunction = options.HashFunction, + NewHashCode = options.NewHashCode, InitialCharCapacity = options.InitialCharCapacity, InitialHashCapacity = options.InitialHashCapacity, LinearSearchLimit = options.LinearSearchLimit @@ -35,13 +37,13 @@ public PathStorage(PathStorageOptions options) public int Store(ReadOnlySpan arg) { - var hash = hashFunction.HashSequence(arg); + var hash = newHashCode().HashSequence(arg); foreach(var idx in buckets.Retrieve(hash)) { if(match(idx, arg)) { - Debug.WriteLine($"Found match for {arg.ToString()} ({hash})= {idx}"); + //Debug.WriteLine($"Found match for {arg.ToString()} ({hash})= {idx}"); return idx; } } @@ -75,18 +77,18 @@ public int Store(ReadOnlySpan arg) if(!buckets.Store(hash, result)) { - Debug.WriteLine($"Start rebuildBuckets"); + //Debug.WriteLine($"Start rebuildBuckets"); // Rebuild buckets from List twice as big rebuildBuckets(); - Debug.WriteLine($"End rebuildBuckets"); + //Debug.WriteLine($"End rebuildBuckets"); if(!buckets.Store(hash, result)) throw new Exception($"Too many hash collisions in {nameof(PathStorage)}"); } - Debug.WriteLine($"Created {arg.ToString()} ({hash})= {result}"); + //Debug.WriteLine($"Created {arg.ToString()} ({hash})= {result}"); return result; } @@ -97,15 +99,18 @@ private void rebuildBuckets() for(int idx = 0; idx < entries.Count; idx++) { - var h = new HashCode(); + var hashCode = newHashCode(); foreach(var textRef in chain(idx).Reverse()) { var text = buf.Retrieve(textRef); - h.Add(hashFunction.HashSequence(text)); + foreach(var elem in text) + { + hashCode.Add(elem); + } } - int hash = h.ToHashCode(); + int hash = hashCode.ToHashCode(); newBuckets.Store(hash, idx); } diff --git a/YellowCounter.FileSystemState/PathRedux/PathStorageOptions.cs b/YellowCounter.FileSystemState/PathRedux/PathStorageOptions.cs index 435089e..a6a06ed 100644 --- a/YellowCounter.FileSystemState/PathRedux/PathStorageOptions.cs +++ b/YellowCounter.FileSystemState/PathRedux/PathStorageOptions.cs @@ -1,6 +1,7 @@ using System; using System.Collections.Generic; using System.Text; +using YellowCounter.FileSystemState.HashCodes; namespace YellowCounter.FileSystemState.PathRedux { @@ -8,7 +9,7 @@ public class PathStorageOptions { public int HashBucketInitialCapacity { get; set; } public int HashBucketMaxChain { get; set; } - public IHashFunction HashFunction { get; set; } + public Func NewHashCode { get; set; } public int InitialCharCapacity { get; set; } public int InitialHashCapacity { get; set; } public int LinearSearchLimit { get; set; } From 62e3f23e5b6fab77504a876e9e4866fd90b478dd Mon Sep 17 00:00:00 2001 From: Alan Singfield Date: Sat, 25 Apr 2020 09:38:39 +0100 Subject: [PATCH 25/26] Removed 3x 64-bit version flags --- YellowCounter.FileSystemState/FileState.cs | 11 +-------- .../FileSystemState.cs | 6 ++--- .../PathToFileStateHashtable.cs | 24 ++++--------------- 3 files changed, 7 insertions(+), 34 deletions(-) diff --git a/YellowCounter.FileSystemState/FileState.cs b/YellowCounter.FileSystemState/FileState.cs index 5df7944..0c624a4 100644 --- a/YellowCounter.FileSystemState/FileState.cs +++ b/YellowCounter.FileSystemState/FileState.cs @@ -8,21 +8,12 @@ namespace YellowCounter.FileSystemState [Serializable] internal class FileState { - [NonSerialized] - public long LastSeenVersion; // removal notification are implemented something similar to "mark and sweep". This value is incremented in the mark phase - - [NonSerialized] - public long CreateVersion; - [NonSerialized] - public long ChangeVersion; - + //[NonSerialized] public FileStateFlags Flags; public int DirectoryRef; public int FilenameRef; public DateTimeOffset LastWriteTimeUtc; public long Length; - - internal FileState Clone() => (FileState)this.MemberwiseClone(); } [Flags] diff --git a/YellowCounter.FileSystemState/FileSystemState.cs b/YellowCounter.FileSystemState/FileSystemState.cs index b5bfd12..d7dc5df 100644 --- a/YellowCounter.FileSystemState/FileSystemState.cs +++ b/YellowCounter.FileSystemState/FileSystemState.cs @@ -12,7 +12,6 @@ namespace YellowCounter.FileSystemState { public class FileSystemState : IAcceptFileSystemEntry { - private long _version = 0L; private PathToFileStateHashtable _state; public FileSystemState(string rootDir, string filter = "*", EnumerationOptions options = null) @@ -93,14 +92,13 @@ private void gatherChanges() public void Accept(ref FileSystemEntry fileSystemEntry) { - _state.Mark(ref fileSystemEntry, _version); + _state.Mark(ref fileSystemEntry); } private void acceptChanges() { // Clear out the files that have been removed or renamed from our state. - _state.Sweep(_version); - _version++; + _state.Sweep(); } private FileChangeList convertToFileChanges( diff --git a/YellowCounter.FileSystemState/PathToFileStateHashtable.cs b/YellowCounter.FileSystemState/PathToFileStateHashtable.cs index c2c504e..2f69514 100644 --- a/YellowCounter.FileSystemState/PathToFileStateHashtable.cs +++ b/YellowCounter.FileSystemState/PathToFileStateHashtable.cs @@ -21,19 +21,13 @@ public PathToFileStateHashtable(IPathStorage pathStorage) this.pathStorage = pathStorage; } - internal void Mark(ref FileSystemEntry input,long version) + internal void Mark(ref FileSystemEntry input) { int dirRef = pathStorage.Store(input.Directory); int filenameRef = pathStorage.Store(input.FileName); int hashCode = HashCode.Combine(dirRef.GetHashCode(), filenameRef.GetHashCode()); - //// Without allocating strings, calculate a hashcode based on the - //// directory and filename. - //int hashCode = HashCode.Combine( - // input.Directory.GetHashOfContents(), - // input.FileName.GetHashOfContents()); - if(dict.TryGetValue(hashCode, out var fileStates)) { bool found = false; @@ -47,7 +41,7 @@ internal void Mark(ref FileSystemEntry input,long version) { // Found the file; compare to our existing record so we can // detect if it has been modified. - markExisting(existing, input, version); + markExisting(existing, input); found = true; break; @@ -72,10 +66,6 @@ FileState newFileState(FileSystemEntry input) fileState.Flags = FileStateFlags.Created | FileStateFlags.Seen; - fileState.LastSeenVersion = version; - fileState.CreateVersion = version; - fileState.ChangeVersion = version; - fileState.DirectoryRef = dirRef; fileState.FilenameRef = filenameRef; @@ -86,23 +76,17 @@ FileState newFileState(FileSystemEntry input) } } - private void markExisting(FileState fs, FileSystemEntry input, long version) + private void markExisting(FileState fs, FileSystemEntry input) { // Mark that we've seen the file. fs.Flags |= FileStateFlags.Seen; - // Mark that we've seen the file. - fs.LastSeenVersion = version; - // Has it changed since we last saw it? if(fs.LastWriteTimeUtc != input.LastWriteTimeUtc || fs.Length != input.Length) { fs.Flags |= FileStateFlags.Changed; - // Mark that this version was a change - fs.ChangeVersion = version; - // Update the last write time / file length. fs.LastWriteTimeUtc = input.LastWriteTimeUtc; fs.Length = input.Length; @@ -119,7 +103,7 @@ public IEnumerable Read() } } - public void Sweep(long version) + public void Sweep() { var toRemove = new List(); From 620654156c5b10612d43e271eaaa08a21a5ed236 Mon Sep 17 00:00:00 2001 From: Alan Singfield Date: Sat, 25 Apr 2020 09:56:34 +0100 Subject: [PATCH 26/26] Improved comments slightly --- .../PathRedux/PathStorage.cs | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/YellowCounter.FileSystemState/PathRedux/PathStorage.cs b/YellowCounter.FileSystemState/PathRedux/PathStorage.cs index ffe6af0..3ba5eb2 100644 --- a/YellowCounter.FileSystemState/PathRedux/PathStorage.cs +++ b/YellowCounter.FileSystemState/PathRedux/PathStorage.cs @@ -7,6 +7,19 @@ namespace YellowCounter.FileSystemState.PathRedux { + /// + /// Storing a long list of full paths from a recursive directory search involves + /// a lot of repeats: + /// C:\abc\def + /// C:\abc\def\ghi + /// C:\abc\def\jkl + /// C:\abc\def\mno + /// + /// This class implements a Parent Pointer Tree, it splits the path by the directory + /// separator, stores the final text after the \, then a pointer to the entry for + /// the parent directory. This occurs recursively so we only store the text for each + /// folder name once. + /// public class PathStorage : IPathStorage { private HashedCharBuffer buf; @@ -43,7 +56,6 @@ public int Store(ReadOnlySpan arg) { if(match(idx, arg)) { - //Debug.WriteLine($"Found match for {arg.ToString()} ({hash})= {idx}"); return idx; } } @@ -76,20 +88,13 @@ public int Store(ReadOnlySpan arg) if(!buckets.Store(hash, result)) { - - //Debug.WriteLine($"Start rebuildBuckets"); - // Rebuild buckets from List twice as big rebuildBuckets(); - //Debug.WriteLine($"End rebuildBuckets"); - if(!buckets.Store(hash, result)) throw new Exception($"Too many hash collisions in {nameof(PathStorage)}"); } - //Debug.WriteLine($"Created {arg.ToString()} ({hash})= {result}"); - return result; }