diff --git a/.envrc b/.envrc
new file mode 100644
index 0000000000..2797f0f929
--- /dev/null
+++ b/.envrc
@@ -0,0 +1,3 @@
+source_up_if_exists
+
+use flake
diff --git a/.vimrc b/.vimrc
new file mode 100644
index 0000000000..4c8a8a8279
--- /dev/null
+++ b/.vimrc
@@ -0,0 +1,8 @@
+" example search path configuration
+set path=lib/runtime/**,lib/**
+
+" set build target
+" let g:target = "pcg"
+
+" set test target
+" let g:test_target = "utils-test"
diff --git a/lib/compiler/include/compiler/algorithm_config.variant.toml b/lib/compiler/include/compiler/algorithm_config.variant.toml
new file mode 100644
index 0000000000..4e58104875
--- /dev/null
+++ b/lib/compiler/include/compiler/algorithm_config.variant.toml
@@ -0,0 +1,18 @@
+namespace = "FlexFlow"
+name = "AlgorithmConfig"
+features = [
+  "eq",
+  "hash",
+  "fmt",
+]
+
+includes = [
+  "compiler/data_parallelism/data_parallelism_config.dtg.h",
+  "compiler/unity_algorithm/unity_search_config.dtg.h",
+]
+
+[[values]]
+type = "::FlexFlow::DataParallelismConfig"
+
+[[values]]
+type = "::FlexFlow::UnitySearchConfig"
diff --git a/lib/compiler/include/compiler/compiler.h b/lib/compiler/include/compiler/compiler.h
index 178ab19a53..8697c06beb 100644
--- a/lib/compiler/include/compiler/compiler.h
+++ b/lib/compiler/include/compiler/compiler.h
@@ -1,42 +1,22 @@
 #ifndef _FLEXFLOW_COMPILER_COMPILER_H
 #define _FLEXFLOW_COMPILER_COMPILER_H
 
-#include "pcg/cost_values.h"
-#include "pcg/machine_view.h"
-#include "pcg/parallel_computation_graph/parallel_computation_graph.h"
-#include "pcg/tensor_mapping.h"
+#include "compiler/algorithm_config.dtg.h"
+#include "compiler/cost_estimator/cost_estimator.h"
+#include "compiler/search_result.dtg.h"
+#include "pcg/machine_specification.dtg.h"
 
 namespace FlexFlow {
 
 enum class SearchAlgorithm {
   DATA_PARALLEL,
-};
-
-using SearchAlgorithmConfig = std::variant<>;
-using SearchSolution = std::variant<>;
-
-struct SearchResult {
-  ParallelComputationGraph pcg;
-  TensorMapping tensor_mapping;
-  SearchSolution solution;
-  CostValues cost_values;
+  UNITY,
 };
 
 SearchResult optimize(ComputationGraph const &,
                       MachineSpecification const &,
                       CostEstimator const &,
-                      SearchAlgorithm,
-                      optional<AlgorithmConfig> const &);
-
-// struct SearchSolution {
-//   LabelledMultiDiGraph<PCGOperatorAttrs, ParallelTensorShape> optimized_pcg;
-//   std::unordered_map<Node, MachineView> device_assignments;
-//   /* std::unordered_map<tensor_guid_t,
-//   std::unordered_set<parallel_tensor_guid_t>> tensor_mappings; */
-// };
-//
-// SearchSolution run_data_parallelize(ComputationGraph const &,
-// MachineSpecification const &);
+                      AlgorithmConfig const &);
 
 } // namespace FlexFlow
 
diff --git a/lib/compiler/include/compiler/data_parallelism/data_parallelism_config.struct.toml b/lib/compiler/include/compiler/data_parallelism/data_parallelism_config.struct.toml
new file mode 100644
index 0000000000..68512fa473
--- /dev/null
+++ b/lib/compiler/include/compiler/data_parallelism/data_parallelism_config.struct.toml
@@ -0,0 +1,14 @@
+namespace = "FlexFlow"
+name = "DataParallelismConfig"
+features = [
+  "eq",
+  "hash",
+  "fmt",
+]
+
+includes = [
+]
+
+[[fields]]
+name = "degree"
+type = "int"
diff --git a/lib/compiler/include/compiler/graph_optimize_result.struct.toml b/lib/compiler/include/compiler/graph_optimize_result.struct.toml
deleted file mode 100644
index 22f29cbd59..0000000000
--- a/lib/compiler/include/compiler/graph_optimize_result.struct.toml
+++ /dev/null
@@ -1,16 +0,0 @@
-namespace = "FlexFlow"
-name = "GraphOptimizeResult"
-features = [ ]
-
-includes = [ 
-  "compiler/machine_mapping/machine_mapping.dtg.h",
-  "pcg/parallel_computation_graph/parallel_computation_graph.h"
-]
-
-[[fields]]
-name = "pcg"
-type = "::FlexFlow::ParallelComputationGraph"
-
-[[fields]]
-name = "machine_mapping"
-type = "::FlexFlow::MachineMapping"
diff --git a/lib/compiler/include/compiler/allowed_machine_views.h b/lib/compiler/include/compiler/machine_mapping/allowed_machine_views.h
similarity index 100%
rename from lib/compiler/include/compiler/allowed_machine_views.h
rename to lib/compiler/include/compiler/machine_mapping/allowed_machine_views.h
diff --git a/lib/compiler/include/compiler/machine_mapping/apply_substitution_and_update_machine_mapping.h b/lib/compiler/include/compiler/machine_mapping/apply_substitution_and_update_machine_mapping.h
new file mode 100644
index 0000000000..b08ca57851
--- /dev/null
+++ b/lib/compiler/include/compiler/machine_mapping/apply_substitution_and_update_machine_mapping.h
@@ -0,0 +1,32 @@
+#ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_MACHINE_MAPPING_APPLY_SUBSTITUTION_AND_UPDATE_MACHINE_MAPPING_H
+#define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_MACHINE_MAPPING_APPLY_SUBSTITUTION_AND_UPDATE_MACHINE_MAPPING_H
+
+#include "compiler/search_result.dtg.h"
+#include "substitutions/pcg_pattern_match.dtg.h"
+#include "substitutions/sub_parallel_computation_graph.dtg.h"
+#include "substitutions/substitution.dtg.h"
+
+namespace FlexFlow {
+/**
+ * @brief Applies \p sub to \p mapped_pcg at the location specified by
+ * \p match, returning the resulting SearchResult (mapped pcg)
+ *
+ * @param mapped_pcg The pcg and machine mapping to transform
+ * @param sub The substitution to apply
+ * @param match The location at which to apply \p sub. This location in the
+ * pcg of \p mapped_pcg should match the PCGPattern of \p sub. Likely created
+ * by running FlexFlow::find_pattern_matches(PCGPattern const &,
+ * SubParallelComputationGraph const &).
+ * @return SearchResult A mapped pcg similar to \p mapped_pcg, but with
+ * the subgraph of the pcg specified by \p match replaced with the result of
+ * the output expression of \p sub and the machine mapping updated to account
+ * for the new output
+ */
+SearchResult apply_substitution_and_update_machine_mapping(
+    SearchResult const &mapped_pcg,
+    Substitution const &sub,
+    PCGPatternMatch const &match);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping.h b/lib/compiler/include/compiler/machine_mapping/machine_mapping.h
index 7375cde985..796225637e 100644
--- a/lib/compiler/include/compiler/machine_mapping/machine_mapping.h
+++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping.h
@@ -2,6 +2,8 @@
 #define _FLEXFLOW_COMPILER_MACHINE_MAPPING_H
 
 #include "compiler/machine_mapping/machine_mapping.dtg.h"
+#include "compiler/machine_mapping/machine_mapping_result.h"
+#include "compiler/series_parallel/pcg/pcg_binary_sp_decomposition.dtg.h"
 #include "pcg/device_id_t.dtg.h"
 #include "pcg/machine_specification.dtg.h"
 #include "pcg/operator_task_space.dtg.h"
@@ -14,6 +16,13 @@ MachineMapping combine_disjoint_mappings(MachineMapping const &,
 
 bool nodes_are_disjoint(MachineMapping const &m1, MachineMapping const &m2);
 
+parallel_layer_guid_t
+    get_layer_from_path(PCGBinarySPDecomposition const &sp_decomposition,
+                        BinaryTreePath const &path);
+
+std::optional<MachineMapping> get_machine_mapping_from_machine_mapping_result(
+    PCGBinarySPDecomposition const &, MachineMappingResult const &);
+
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_mutation_set.h b/lib/compiler/include/compiler/machine_mapping/machine_mapping_mutation_set.h
new file mode 100644
index 0000000000..43af640e02
--- /dev/null
+++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_mutation_set.h
@@ -0,0 +1,37 @@
+#ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_MACHINE_MAPPING_MACHINE_MAPPING_MUTATION_SET_H
+#define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_MACHINE_MAPPING_MACHINE_MAPPING_MUTATION_SET_H
+
+#include "compiler/machine_mapping/machine_mapping.h"
+#include "compiler/search_result.dtg.h"
+#include <optional>
+
+namespace FlexFlow {
+
+/**
+ * @brief Returns a MachineMapping for \p pcg derived from \p resources and
+ * \p device_type, or std::nullopt if no mapping is produced.
+ *
+ * NOTE(review): the mapping strategy and the conditions for std::nullopt are
+ * defined in the implementation file, not in this header; confirm there.
+ * \p pcg is taken by non-const reference; verify whether it is mutated.
+ */
+std::optional<MachineMapping>
+    get_naive_mapping(ParallelComputationGraph &pcg,
+                      MachineSpecification const &resources,
+                      DeviceType const &device_type);
+
+/**
+ * @brief Returns a mutated MachineMapping for \p mapped_pcg, or std::nullopt
+ * if no mutation is produced.
+ *
+ * NOTE(review): presumably the mutation is chosen at random (per the name);
+ * the selection logic is in the implementation file -- confirm there.
+ */
+std::optional<MachineMapping>
+    get_random_mutation(SearchResult mapped_pcg,
+                        MachineSpecification const &resource,
+                        DeviceType const &device_type);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.h b/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.h
index 68d02aaa54..168ba6c3d5 100644
--- a/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.h
+++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.h
@@ -9,6 +9,9 @@
 
 namespace FlexFlow {
 
+bool is_valid_machine_mapping_problem_tree(
+    MachineMappingProblemTree const &problem_tree);
+
 MachineMappingProblemTree
     get_machine_mapping_problem_tree(ParallelComputationGraph const &pcg,
                                      PCGBinarySPDecomposition const &sp);
diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h b/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h
index 29e9e7c90b..3d1dc91d24 100644
--- a/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h
+++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h
@@ -4,6 +4,7 @@
 #include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.dtg.h"
 #include "compiler/machine_mapping/machine_mapping_problem_tree/mm_problem_tree_parallel_split.dtg.h"
 #include "compiler/machine_mapping/machine_mapping_problem_tree/mm_problem_tree_series_split.dtg.h"
+#include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.dtg.h"
 #include "utils/full_binary_tree/binary_tree_path.dtg.h"
 #include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree_implementation.dtg.h"
 #include "utils/graph/series_parallel/sp_decomposition_tree_node_type.dtg.h"
@@ -27,6 +28,9 @@ std::optional<MachineMappingProblemTree>
     mm_problem_tree_get_subtree_at_path(MachineMappingProblemTree const &,
                                         BinaryTreePath const &);
 
+std::string as_dot(MachineMappingProblemTree const &);
+void debug_print_dot(MachineMappingProblemTree const &);
+
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.struct.toml b/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.struct.toml
index fe76683eb7..7493c68387 100644
--- a/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.struct.toml
+++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.struct.toml
@@ -11,6 +11,7 @@ includes = [
   "op-attrs/parallel_tensor_shape.dtg.h",
   "<vector>",
   "pcg/machine_view.dtg.h",
+  "pcg/operator_task_space.dtg.h",
 ]
 
 src_includes = [
@@ -34,3 +35,6 @@ type = "std::vector<::FlexFlow::ParallelTensorShape>"
 name = "output_shapes"
 type = "std::vector<::FlexFlow::ParallelTensorShape>"
 
+[[fields]]
+name = "op_task_space"
+type = "::FlexFlow::OperatorTaskSpace"
diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_result.h b/lib/compiler/include/compiler/machine_mapping/machine_mapping_result.h
index b21fea5f24..db2f4e6f0d 100644
--- a/lib/compiler/include/compiler/machine_mapping/machine_mapping_result.h
+++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_result.h
@@ -31,6 +31,8 @@ FeasibleMachineMappingResult require_feasible(MachineMappingResult const &);
     make_singleton_machine_mapping_result(float runtime,
                                           MachineView const &machine_view);
 
+[[nodiscard]] float get_runtime_cost(MachineMappingResult const &mm_result);
+
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/compiler/include/compiler/mcmc/generic_mcmc_algorithm.h b/lib/compiler/include/compiler/mcmc/generic_mcmc_algorithm.h
new file mode 100644
index 0000000000..a27ecbc8f4
--- /dev/null
+++ b/lib/compiler/include/compiler/mcmc/generic_mcmc_algorithm.h
@@ -0,0 +1,89 @@
+#ifndef _FLEXFLOW_COMPILER_MCMC_GENERIC_MCMC_ALGORITHM_H
+#define _FLEXFLOW_COMPILER_MCMC_GENERIC_MCMC_ALGORITHM_H
+
+#include "compiler/mcmc/generic_mcmc_config.dtg.h"
+#include "compiler/mcmc/generic_mcmc_state.h"
+#include "utils/nonnegative_int/nonnegative_range.h"
+#include "utils/random_utils.h"
+#include <cmath>
+#include <optional>
+
+namespace FlexFlow {
+
+/**
+ * @brief Performs one accept/reject step of the minimization search.
+ *
+ * Scores \p candidate with \p scorer and computes delta as that score minus
+ * the score of \p best_state. If delta is negative, \p candidate replaces both
+ * \p current_state and \p best_state. Otherwise \p candidate replaces only
+ * \p current_state, and only when randf() < exp(-delta / temperature).
+ *
+ * NOTE(review): delta is measured against the best score so far rather than
+ * against the score of \p current_state; confirm this acceptance rule is
+ * intended.
+ *
+ * @param best_state Best (lowest-scoring) state so far; may be overwritten
+ * @param current_state State the search is currently at; may be overwritten
+ * @param candidate Proposed next state
+ * @param scorer Callable mapping a State to a float score (lower is better)
+ * @param temperature Divisor of delta in the acceptance test
+ */
+template <typename State, typename ScoringFunc>
+void modify_state_for_minimization(
+    Generic_MCMC_state<State, float> &best_state,
+    Generic_MCMC_state<State, float> &current_state,
+    State candidate,
+    ScoringFunc scorer,
+    float temperature) {
+  float best_estimate = best_state.get_score();
+  float new_estimate = scorer(candidate);
+  float delta = new_estimate - best_estimate;
+  bool is_improvement = delta < 0;
+  // Short-circuit keeps randf() from being drawn for improvements. std::exp is
+  // used (not unqualified exp) so the float overload from <cmath> is selected.
+  if (is_improvement || (randf() < std::exp(-delta / temperature))) {
+    current_state = Generic_MCMC_state<State, float>(candidate, new_estimate);
+    if (is_improvement) {
+      best_state = current_state;
+    }
+  }
+}
+
+/**
+ * @brief Searches for a low-scoring state starting from \p starting_state.
+ *
+ * Iterates over nonnegative_range(search_config.num_iterations). On each
+ * iteration \p generator is called with the current state and the iteration
+ * index; a std::nullopt result skips the iteration, otherwise the candidate is
+ * handed to modify_state_for_minimization with search_config.temperature.
+ *
+ * @tparam GeneratingFunc Callable: State -> nonnegative_int ->
+ * std::optional<State>
+ * @tparam ScoringFunc Callable: State -> float
+ * @return The best state tracked by the search, together with its score
+ */
+template <typename State, typename GeneratingFunc, typename ScoringFunc>
+Generic_MCMC_state<State, float>
+    minimize_score(State const &starting_state,
+                   GeneratingFunc const &generator,
+                   ScoringFunc const &scorer,
+                   GenericMCMCConfig const &search_config) {
+  using MCMCState = Generic_MCMC_state<State, float>;
+  MCMCState best_state = MCMCState(starting_state, scorer(starting_state));
+  MCMCState current_state = best_state;
+  for (nonnegative_int i : nonnegative_range(search_config.num_iterations)) {
+    std::optional<State> candidate = generator(current_state.get_state(), i);
+    if (candidate.has_value()) {
+      modify_state_for_minimization(best_state,
+                                    current_state,
+                                    candidate.value(),
+                                    scorer,
+                                    search_config.temperature);
+    }
+  }
+  return best_state;
+}
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/compiler/include/compiler/mcmc/generic_mcmc_config.struct.toml b/lib/compiler/include/compiler/mcmc/generic_mcmc_config.struct.toml
new file mode 100644
index 0000000000..e11c84f0bd
--- /dev/null
+++ b/lib/compiler/include/compiler/mcmc/generic_mcmc_config.struct.toml
@@ -0,0 +1,19 @@
+namespace = "FlexFlow"
+name = "GenericMCMCConfig"
+features = [
+  "eq",
+  "hash",
+  "fmt",
+]
+
+includes = [
+  "utils/nonnegative_int/nonnegative_int.h"
+]
+
+[[fields]]
+name = "temperature"
+type = "float"
+
+[[fields]]
+name = "num_iterations"
+type = "::FlexFlow::nonnegative_int"
diff --git a/lib/compiler/include/compiler/mcmc/generic_mcmc_state.h b/lib/compiler/include/compiler/mcmc/generic_mcmc_state.h
new file mode 100644
index 0000000000..6a6aada32b
--- /dev/null
+++ b/lib/compiler/include/compiler/mcmc/generic_mcmc_state.h
@@ -0,0 +1,33 @@
+#ifndef _FLEXFLOW_COMPILER_MCMC_GENERIC_MCMC_STATE_H
+#define _FLEXFLOW_COMPILER_MCMC_GENERIC_MCMC_STATE_H
+#include "utils/nonnegative_int/nonnegative_int.h"
+
+namespace FlexFlow {
+
+/**
+ * @brief Pairs a state with a score and exposes both through const accessors.
+ */
+template <typename State, typename Score>
+struct Generic_MCMC_state {
+private:
+  State state;
+  Score score;
+
+public:
+  Generic_MCMC_state(State const &state, Score const &score)
+      : state(state), score(score) {}
+
+  /// The state held by this object.
+  auto get_state() const -> State const & {
+    return this->state;
+  }
+
+  /// The score held alongside the state.
+  auto get_score() const -> Score const & {
+    return this->score;
+  }
+};
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/compiler/include/compiler/mcmc/mcmc_over_mapped_pcg.h b/lib/compiler/include/compiler/mcmc/mcmc_over_mapped_pcg.h
new file mode 100644
index 0000000000..c2d8737184
--- /dev/null
+++ b/lib/compiler/include/compiler/mcmc/mcmc_over_mapped_pcg.h
@@ -0,0 +1,22 @@
+#ifndef _FLEXFLOW_COMPILER_MCMC_OVER_MAPPED_PCG_H
+#define _FLEXFLOW_COMPILER_MCMC_OVER_MAPPED_PCG_H
+
+#include "compiler/cost_estimator/cost_estimator.h"
+#include "compiler/mcmc/mcmc_over_mapped_pcg_config.dtg.h"
+#include "compiler/search_result.dtg.h"
+#include "pcg/computation_graph.h"
+#include "pcg/machine_specification.dtg.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h"
+#include "substitutions/sub_parallel_computation_graph.h"
+#include "substitutions/substitution.h"
+
+namespace FlexFlow {
+
+SearchResult mcmc_graph_optimize(ParallelComputationGraph &pcg,
+                                 CostEstimator const &cost_estimator,
+                                 MachineSpecification const &resources,
+                                 MCMCOverMappedPCGConfig const &search_config);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/compiler/include/compiler/mcmc/mcmc_over_mapped_pcg_config.struct.toml b/lib/compiler/include/compiler/mcmc/mcmc_over_mapped_pcg_config.struct.toml
new file mode 100644
index 0000000000..e1548a581e
--- /dev/null
+++ b/lib/compiler/include/compiler/mcmc/mcmc_over_mapped_pcg_config.struct.toml
@@ -0,0 +1,28 @@
+namespace = "FlexFlow"
+name = "MCMCOverMappedPCGConfig"
+features = [
+  "eq",
+  "hash",
+  "fmt",
+]
+
+includes = [
+  "pcg/device_type.dtg.h",
+  "utils/nonnegative_int/nonnegative_int.h"
+]
+
+[[fields]]
+name = "temperature"
+type = "float"
+
+[[fields]]
+name = "num_iterations"
+type = "::FlexFlow::nonnegative_int"
+
+[[fields]]
+name = "substitution_interval"
+type = "::FlexFlow::nonnegative_int"
+
+[[fields]]
+name = "device_type"
+type = "::FlexFlow::DeviceType"
diff --git a/lib/compiler/include/compiler/graph_optimize_result.h b/lib/compiler/include/compiler/search_result.h
similarity index 54%
rename from lib/compiler/include/compiler/graph_optimize_result.h
rename to lib/compiler/include/compiler/search_result.h
index f3843e2a93..197b36e9ea 100644
--- a/lib/compiler/include/compiler/graph_optimize_result.h
+++ b/lib/compiler/include/compiler/search_result.h
@@ -1,12 +1,12 @@
 #ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_GRAPH_OPTIMIZE_RESULT_H
 #define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_GRAPH_OPTIMIZE_RESULT_H
 
-#include "compiler/graph_optimize_result.dtg.h"
+#include "compiler/search_result.dtg.h"
 
 namespace FlexFlow {
 
-std::string format_as(GraphOptimizeResult const &);
-std::ostream &operator<<(std::ostream &, GraphOptimizeResult const &);
+std::string format_as(SearchResult const &);
+std::ostream &operator<<(std::ostream &, SearchResult const &);
 
 } // namespace FlexFlow
 
diff --git a/lib/compiler/include/compiler/search_result.struct.toml b/lib/compiler/include/compiler/search_result.struct.toml
new file mode 100644
index 0000000000..120d182c75
--- /dev/null
+++ b/lib/compiler/include/compiler/search_result.struct.toml
@@ -0,0 +1,17 @@
+namespace = "FlexFlow"
+name = "SearchResult"
+features = [
+]
+
+includes = [
+  "pcg/parallel_computation_graph/parallel_computation_graph.h",
+  "compiler/machine_mapping/machine_mapping.h",
+]
+
+[[fields]]
+name = "pcg"
+type = "::FlexFlow::ParallelComputationGraph"
+
+[[fields]]
+name = "machine_mapping"
+type = "::FlexFlow::MachineMapping"
diff --git a/lib/compiler/include/compiler/series_parallel/pcg/get_pcg_balanced_binary_sp_decomposition.h b/lib/compiler/include/compiler/series_parallel/pcg/get_pcg_balanced_binary_sp_decomposition.h
index d43edaa79d..bb7459c767 100644
--- a/lib/compiler/include/compiler/series_parallel/pcg/get_pcg_balanced_binary_sp_decomposition.h
+++ b/lib/compiler/include/compiler/series_parallel/pcg/get_pcg_balanced_binary_sp_decomposition.h
@@ -1,6 +1,8 @@
 #ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_SERIES_PARALLEL_GET_PCG_BALANCED_BINARY_SP_DECOMPOSITION_H
 #define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_SERIES_PARALLEL_GET_PCG_BALANCED_BINARY_SP_DECOMPOSITION_H
 
+#include "compiler/series_parallel/pcg/pcg_binary_sp_decomposition.dtg.h"
+
 namespace FlexFlow {
 
 std::optional<PCGBinarySPDecomposition>
diff --git a/lib/compiler/include/compiler/series_parallel/pcg/pcg_binary_sp_decomposition.h b/lib/compiler/include/compiler/series_parallel/pcg/pcg_binary_sp_decomposition.h
index 86fa1a59aa..e4fd841787 100644
--- a/lib/compiler/include/compiler/series_parallel/pcg/pcg_binary_sp_decomposition.h
+++ b/lib/compiler/include/compiler/series_parallel/pcg/pcg_binary_sp_decomposition.h
@@ -27,6 +27,10 @@ std::optional<PCGBinarySPDecomposition>
 std::unordered_multiset<parallel_layer_guid_t>
     get_parallel_layers(PCGBinarySPDecomposition const &);
 
+PCGBinarySPDecomposition
+    pcg_binary_sp_decomposition_from_binary_sp_decomposition_tree(
+        BinarySPDecompositionTree const &);
+
 SPDecompositionTreeNodeType get_node_type(PCGBinarySPDecomposition const &);
 
 std::unordered_set<BinaryTreePath>
diff --git a/lib/compiler/include/compiler/unity_algorithm.h b/lib/compiler/include/compiler/unity_algorithm.h
deleted file mode 100644
index 232f2b9563..0000000000
--- a/lib/compiler/include/compiler/unity_algorithm.h
+++ /dev/null
@@ -1,24 +0,0 @@
-#ifndef _FLEXFLOW_COMPILER_UNITY_ALGORITHM_H
-#define _FLEXFLOW_COMPILER_UNITY_ALGORITHM_H
-
-#include "compiler/cost_estimator/cost_estimator.h"
-#include "compiler/graph_optimize_result.dtg.h"
-#include "optimizer_config.dtg.h"
-#include "pcg/computation_graph.h"
-#include "pcg/machine_specification.dtg.h"
-#include "substitutions/sub_parallel_computation_graph.h"
-
-namespace FlexFlow {
-
-GraphOptimizeResult graph_optimize(
-    ParallelComputationGraph &pcg,
-    CostEstimator const &cost_estimator,
-    MachineSpecification const &resources,
-    std::function<std::unordered_set<MachineView>(
-        ParallelLayerAttrs const &, MachineSpecification const &)> const
-        &allowed_machine_views,
-    OptimizerConfig const &opt_config);
-
-} // namespace FlexFlow
-
-#endif
diff --git a/lib/compiler/include/compiler/graph_optimize_state.h b/lib/compiler/include/compiler/unity_algorithm/graph_optimize_state.h
similarity index 63%
rename from lib/compiler/include/compiler/graph_optimize_state.h
rename to lib/compiler/include/compiler/unity_algorithm/graph_optimize_state.h
index 404111ff8b..9f609f3118 100644
--- a/lib/compiler/include/compiler/graph_optimize_state.h
+++ b/lib/compiler/include/compiler/unity_algorithm/graph_optimize_state.h
@@ -1,16 +1,17 @@
-#ifndef _FLEXFLOW_COMPILER_MCMC_STATE_H
-#define _FLEXFLOW_COMPILER_MCMC_STATE_H
+#ifndef _FLEXFLOW_COMPILER_UNITY_ALGORITHM_STATE_H
+#define _FLEXFLOW_COMPILER_UNITY_ALGORITHM_STATE_H
 
-#include "compiler/graph_optimize_result.dtg.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph.h"
 
 namespace FlexFlow {
 
 struct GraphOptimizeState {
-  explicit GraphOptimizeState(GraphOptimizeResult const &graph_optimize_result,
+  GraphOptimizeState() = delete;
+  explicit GraphOptimizeState(ParallelComputationGraph const &pcg,
                               float runtime);
 
-  GraphOptimizeResult graph_optimize_result;
-  float runtime;
+  ParallelComputationGraph pcg;
+  float runtime_with_optimal_mm;
 
   bool operator==(GraphOptimizeState const &other) const;
   bool operator!=(GraphOptimizeState const &other) const;
diff --git a/lib/compiler/include/compiler/unity_algorithm/unity_algorithm.h b/lib/compiler/include/compiler/unity_algorithm/unity_algorithm.h
new file mode 100644
index 0000000000..618e764f80
--- /dev/null
+++ b/lib/compiler/include/compiler/unity_algorithm/unity_algorithm.h
@@ -0,0 +1,19 @@
+#ifndef _FLEXFLOW_COMPILER_UNITY_ALGORITHM_H
+#define _FLEXFLOW_COMPILER_UNITY_ALGORITHM_H
+
+#include "compiler/cost_estimator/cost_estimator.h"
+#include "compiler/search_result.dtg.h"
+#include "compiler/unity_algorithm/unity_search_config.dtg.h"
+#include "pcg/machine_specification.dtg.h"
+#include "substitutions/substitution.h"
+
+namespace FlexFlow {
+
+SearchResult graph_optimize(ParallelComputationGraph &pcg,
+                            CostEstimator const &cost_estimator,
+                            MachineSpecification const &resources,
+                            UnitySearchConfig const &search_config);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/compiler/include/compiler/optimizer_config.struct.toml b/lib/compiler/include/compiler/unity_algorithm/unity_search_config.struct.toml
similarity index 90%
rename from lib/compiler/include/compiler/optimizer_config.struct.toml
rename to lib/compiler/include/compiler/unity_algorithm/unity_search_config.struct.toml
index b7f4f71e9c..9ec22cf916 100644
--- a/lib/compiler/include/compiler/optimizer_config.struct.toml
+++ b/lib/compiler/include/compiler/unity_algorithm/unity_search_config.struct.toml
@@ -1,5 +1,5 @@
 namespace = "FlexFlow"
-name = "OptimizerConfig"
+name = "UnitySearchConfig"
 features = [
   "eq",
   "hash",
diff --git a/lib/compiler/src/compiler/compiler.cc b/lib/compiler/src/compiler/compiler.cc
new file mode 100644
index 0000000000..a58651f01a
--- /dev/null
+++ b/lib/compiler/src/compiler/compiler.cc
@@ -0,0 +1,40 @@
+#include "compiler/compiler.h"
+#include "compiler/unity_algorithm/unity_algorithm.h"
+#include "pcg/pcg_from_computation_graph.h"
+#include "utils/overload.h"
+#include <stdexcept>
+
+namespace FlexFlow {
+
+/**
+ * @brief Runs the search algorithm selected by the alternative held in
+ * \p search_config.
+ *
+ * For a UnitySearchConfig, \p computation_graph is converted with
+ * pcg_from_computation_graph and the result is passed to graph_optimize.
+ *
+ * @throws std::runtime_error if \p search_config holds a
+ * DataParallelismConfig (that algorithm is not implemented yet)
+ */
+SearchResult optimize(ComputationGraph const &computation_graph,
+                      MachineSpecification const &machine_specification,
+                      CostEstimator const &cost_estimator,
+                      AlgorithmConfig const &search_config) {
+  return search_config.visit<SearchResult>(overload{
+      // parameter left unnamed: this alternative's config is not read
+      [&](DataParallelismConfig const &) -> SearchResult {
+        throw std::runtime_error(
+            "Data parallel search algorithm is not implemented yet");
+      },
+      [&](UnitySearchConfig const &config) -> SearchResult {
+        // graph_optimize takes the pcg by non-const reference, so it needs a
+        // named local rather than a temporary
+        ParallelComputationGraph pcg =
+            pcg_from_computation_graph(computation_graph);
+        return graph_optimize(
+            pcg, cost_estimator, machine_specification, config);
+      },
+  });
+}
+
+} // namespace FlexFlow
diff --git a/lib/compiler/src/compiler/graph_optimize_result.cc b/lib/compiler/src/compiler/graph_optimize_result.cc
deleted file mode 100644
index f48c119603..0000000000
--- a/lib/compiler/src/compiler/graph_optimize_result.cc
+++ /dev/null
@@ -1,15 +0,0 @@
-#include "compiler/graph_optimize_result.h"
-
-namespace FlexFlow {
-
-std::string format_as(GraphOptimizeResult const &r) {
-  return fmt::format("<GraphOptimizeResult\npcg={}\nmachine_mapping={}>",
-                     as_dot(r.pcg),
-                     r.machine_mapping);
-}
-
-std::ostream &operator<<(std::ostream &s, GraphOptimizeResult const &r) {
-  return (s << fmt::to_string(r));
-}
-
-} // namespace FlexFlow
diff --git a/lib/compiler/src/compiler/graph_optimize_state.cc b/lib/compiler/src/compiler/graph_optimize_state.cc
deleted file mode 100644
index 1091b92866..0000000000
--- a/lib/compiler/src/compiler/graph_optimize_state.cc
+++ /dev/null
@@ -1,96 +0,0 @@
-#include "compiler/graph_optimize_state.h"
-#include "compiler/graph_optimize_result.h"
-#include "pcg/parallel_computation_graph/parallel_tensor_guid_t.h"
-
-namespace FlexFlow {
-
-GraphOptimizeState::GraphOptimizeState(
-    GraphOptimizeResult const &graph_optimize_result, float runtime)
-    : graph_optimize_result(graph_optimize_result), runtime(runtime) {}
-
-bool GraphOptimizeState::operator==(GraphOptimizeState const &other) const {
-  // Note(@wmdi): This is a hack to implement a partially correct homomorphism
-  // check. Switch to the homomorphism check used in substitutions right after
-  // https://github.com/flexflow/FlexFlow/pull/1471 is merged.
-  auto layers1 = topological_ordering(graph_optimize_result.pcg);
-  auto layers2 = topological_ordering(other.graph_optimize_result.pcg);
-  if (layers1.size() != layers2.size()) {
-    return false;
-  }
-  std::unordered_map<parallel_tensor_guid_t, parallel_tensor_guid_t> mapping;
-  for (size_t i = 0; i < layers1.size(); ++i) {
-    if (get_parallel_layer_attrs(graph_optimize_result.pcg, layers1[i]) !=
-        get_parallel_layer_attrs(other.graph_optimize_result.pcg, layers2[i])) {
-      return false;
-    }
-    auto inputs1 = get_incoming_tensors(graph_optimize_result.pcg, layers1[i]);
-    auto inputs2 =
-        get_incoming_tensors(other.graph_optimize_result.pcg, layers2[i]);
-    if (inputs1.size() != inputs2.size()) {
-      return false;
-    }
-    for (size_t j = 0; j < inputs1.size(); ++j) {
-      if (inputs1[j] != mapping.at(inputs2[j])) {
-        return false;
-      }
-    }
-    auto outputs1 = get_layer_outputs(graph_optimize_result.pcg, layers1[i]);
-    auto outputs2 =
-        get_layer_outputs(other.graph_optimize_result.pcg, layers2[i]);
-    if (outputs1.size() != outputs2.size()) {
-      return false;
-    }
-    for (size_t j = 0; j < outputs1.size(); ++j) {
-      mapping.emplace(outputs2[j], outputs1[j]);
-    }
-  }
-  return true;
-}
-
-bool GraphOptimizeState::operator!=(GraphOptimizeState const &other) const {
-  return !(*this == other);
-}
-
-bool GraphOptimizeState::operator<(GraphOptimizeState const &other) const {
-  return runtime < other.runtime;
-}
-
-std::string format_as(GraphOptimizeState const &st) {
-  return fmt::format("<GraphOptimizeState graph_optimize_result={} runtime={}>",
-                     st.graph_optimize_result,
-                     st.runtime);
-}
-
-std::ostream &operator<<(std::ostream &s, GraphOptimizeState const &st) {
-  return (s << fmt::to_string(st));
-}
-
-} // namespace FlexFlow
-
-namespace std {
-
-size_t hash<::FlexFlow::GraphOptimizeState>::operator()(
-    ::FlexFlow::GraphOptimizeState const &state) const {
-  // TODO(@wmdi): Eventually it might be good to use a proper graph hash like
-  // https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.graph_hashing.weisfeiler_lehman_graph_hash.html#networkx.algorithms.graph_hashing.weisfeiler_lehman_graph_hash
-  size_t seed = 0;
-  auto layers = topological_ordering(state.graph_optimize_result.pcg);
-  ::FlexFlow::hash_combine(seed, layers.size());
-  for (auto layer : layers) {
-    ::FlexFlow::hash_combine(
-        seed, get_parallel_layer_attrs(state.graph_optimize_result.pcg, layer));
-    auto inputs = get_incoming_tensors(state.graph_optimize_result.pcg, layer);
-    ::FlexFlow::hash_combine(seed, inputs.size());
-    for (auto input : inputs) {
-      for (size_t i = 0; i < layers.size(); ++i) {
-        if (get_source_layer(input) == layers[i]) {
-          ::FlexFlow::hash_combine(seed, i);
-          break;
-        }
-      }
-    }
-  }
-  return seed;
-}
-
-} // namespace std
diff --git a/lib/compiler/src/compiler/allowed_machine_views.cc b/lib/compiler/src/compiler/machine_mapping/allowed_machine_views.cc
similarity index 79%
rename from lib/compiler/src/compiler/allowed_machine_views.cc
rename to lib/compiler/src/compiler/machine_mapping/allowed_machine_views.cc
index 6f86d1d82a..b4df1451ca 100644
--- a/lib/compiler/src/compiler/allowed_machine_views.cc
+++ b/lib/compiler/src/compiler/machine_mapping/allowed_machine_views.cc
@@ -1,4 +1,4 @@
-#include "compiler/allowed_machine_views.h"
+#include "compiler/machine_mapping/allowed_machine_views.h"
 #include "pcg/machine_specification.h"
 #include "pcg/machine_view.h"
 #include "pcg/multi_dimensional_stride.dtg.h"
@@ -57,6 +57,8 @@ static std::unordered_set<MachineView>
         product(transform(tensor_dims, [](nonnegative_int num_devices) {
           return nonnegative_int{num_devices.unwrap_nonnegative() - 1};
         }));
+    min_num_devices_with_full_stride_volume =
+        std::max(min_num_devices_with_full_stride_volume, 1_n);
     return ceildiv(total_devices, min_num_devices_with_full_stride_volume);
   };
 
@@ -66,13 +68,19 @@ static std::unordered_set<MachineView>
     nonnegative_int max_stride_upper_bound =
         get_max_stride_upper_bound(tensor_dims, total_devices);
 
-    std::vector<stride_t> single_stride_range =
-        transform(nonnegative_range(1_n, max_stride_upper_bound + 1_n),
-                  [](nonnegative_int stride) { return stride_t{stride}; });
+    std::vector<std::vector<stride_t>> stride_options =
+        transform(tensor_dims, [&](nonnegative_int dim_size) {
+          if (dim_size != 1_n) {
+            return transform(
+                nonnegative_range(1_n, max_stride_upper_bound + 1_n),
+                [](nonnegative_int stride) { return stride_t{stride}; });
+          } else {
+            return std::vector<stride_t>{stride_t{1_n}};
+          }
+        });
+
     std::unordered_multiset<std::vector<stride_t>> raw_stride_vectors =
-        cartesian_product(
-            repeat_element(/*num_times=*/num_elements(tensor_dims),
-                           /*element=*/single_stride_range));
+        cartesian_product(stride_options);
     std::unordered_multiset<MultiDimensionalStride> strides =
         transform(raw_stride_vectors, [](auto const &stride_vec) {
           return MultiDimensionalStride{stride_vec};
@@ -94,10 +102,18 @@ static std::unordered_set<MachineView>
   };
 
   auto candidate_dimensions = [](OperatorTaskSpace const &task) {
-    std::unordered_set<MachineSpecificationDimension> options = {
-        MachineSpecificationDimension::INTER_NODE,
-        MachineSpecificationDimension::INTRA_NODE};
-    return get_all_permutations_with_repetition(options, num_dims(task));
+    std::vector<std::vector<MachineSpecificationDimension>> dimension_options =
+        transform(task.degrees, [](nonnegative_int dim_size) {
+          if (dim_size == 1_n) {
+            return std::vector<MachineSpecificationDimension>{
+                MachineSpecificationDimension::INTRA_NODE};
+          } else {
+            return std::vector<MachineSpecificationDimension>{
+                MachineSpecificationDimension::INTER_NODE,
+                MachineSpecificationDimension::INTRA_NODE};
+          }
+        });
+    return cartesian_product(dimension_options);
   };
 
   std::vector<nonnegative_int> tensor_dims = task.degrees;
diff --git a/lib/compiler/src/compiler/machine_mapping/apply_substitution_and_update_machine_mapping.cc b/lib/compiler/src/compiler/machine_mapping/apply_substitution_and_update_machine_mapping.cc
new file mode 100644
index 0000000000..252384985b
--- /dev/null
+++ b/lib/compiler/src/compiler/machine_mapping/apply_substitution_and_update_machine_mapping.cc
@@ -0,0 +1,198 @@
+#include "compiler/machine_mapping/apply_substitution_and_update_machine_mapping.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph_edge.h"
+#include "pcg/parallel_computation_graph/parallel_tensor_guid_t.h"
+#include "substitutions/apply_substitution/apply_substitution.h"
+#include "substitutions/apply_substitution/evaluate_substitution_output.h"
+#include "substitutions/apply_substitution/output_expr_to_result_sub_pcg_mapping.h"
+#include "substitutions/open_parallel_tensor_guid_t.h"
+#include "substitutions/pcg_pattern_match.h"
+#include "substitutions/sub_parallel_computation_graph.h"
+#include "substitutions/sub_parallel_computation_graph_data.dtg.h"
+#include "substitutions/sub_parallel_computation_graph_edge.h"
+#include "utils/containers/is_subseteq_of.h"
+#include "utils/containers/keys.h"
+#include "utils/containers/merge_maps.h"
+#include "utils/containers/restrict_keys.h"
+#include "utils/containers/set_minus.h"
+#include "utils/containers/values.h"
+#include <stdexcept>
+
+namespace FlexFlow {
+
+// Applies `sub` at `match` to `mapped_pcg.pcg` and rebuilds the machine
+// mapping for the result: layers kept from the original graph keep their
+// machine views, and every layer produced by the substitution is given the
+// machine view of one (arbitrarily chosen) matched layer. Throws
+// std::runtime_error if `match` assigns no layers.
+SearchResult apply_substitution_and_update_machine_mapping(
+    SearchResult const &mapped_pcg,
+    Substitution const &sub,
+    PCGPatternMatch const &match) {
+  SubParallelComputationGraph spcg = sub_pcg_from_full_pcg(mapped_pcg.pcg);
+
+  auto substitution_output_result =
+      evaluate_substitution_output(spcg, sub, match);
+  SubParallelComputationGraph substitution_output_graph =
+      substitution_output_result.first;
+  OutputExprToResultSubPCGMapping output_expr_to_result_sub_pcg_mapping =
+      substitution_output_result.second;
+
+  SubParallelComputationGraphData output_graph_data =
+      get_sub_pcg_data(substitution_output_graph);
+  SubParallelComputationGraphData pre_data = get_sub_pcg_data(spcg);
+
+  std::unordered_set<parallel_layer_guid_t> pre_nodes =
+      keys(pre_data.node_data);
+  std::unordered_set<parallel_layer_guid_t> matched_nodes =
+      unordered_set_of(values(match.node_assignment));
+  std::unordered_set<parallel_layer_guid_t> post_nodes_from_original_graph =
+      set_minus(pre_nodes, matched_nodes);
+
+  std::unordered_map<parallel_layer_guid_t, MachineView> machine_views =
+      mapped_pcg.machine_mapping.machine_views;
+
+  std::unordered_set<MachineView> substituted_machine_views =
+      transform(matched_nodes, [&](parallel_layer_guid_t const &node) {
+        return machine_views.at(node);
+      });
+  if (substituted_machine_views.empty()) {
+    throw std::runtime_error("Pattern match contains no matched layers");
+  }
+  MachineView fallback_view = *substituted_machine_views.begin();
+
+  std::unordered_map<parallel_layer_guid_t, ParallelLayerAttrs> post_node_data =
+      [&] {
+        std::unordered_map<parallel_layer_guid_t, ParallelLayerAttrs>
+            post_node_data_from_orig = restrict_keys(
+                pre_data.node_data, post_nodes_from_original_graph);
+
+        // Side effect: map each substitution-created layer to fallback_view.
+        for (auto const &[layer, attrs] : output_graph_data.node_data) {
+          machine_views.insert_or_assign(layer, fallback_view);
+        }
+        return merge_disjoint_maps(post_node_data_from_orig,
+                                   output_graph_data.node_data);
+      }();
+
+  std::unordered_set<SubParallelComputationGraphEdge> post_edges = [&] {
+    std::unordered_set<SubParallelComputationGraphEdge> post_edges_from_orig =
+        filter(pre_data.edges, [&](SubParallelComputationGraphEdge const &e) {
+          if (e.raw_edge.has<DataflowInputEdge>()) {
+            return true;
+          } else {
+            DataflowEdge dfe = e.raw_edge.get<DataflowEdge>();
+            parallel_layer_guid_t src = parallel_layer_guid_t{dfe.src.node};
+            parallel_layer_guid_t dst = parallel_layer_guid_t{dfe.dst.node};
+            return !(contains(matched_nodes, src) ||
+                     contains(matched_nodes, dst));
+          }
+        });
+
+    std::unordered_set<SubParallelComputationGraphEdge> post_edges_from_sub =
+        filter(output_graph_data.edges,
+               [&](SubParallelComputationGraphEdge const &e) {
+                 return !e.raw_edge.has<DataflowInputEdge>();
+               });
+
+    bidict<PatternNodeOutput, parallel_tensor_guid_t>
+        output_orig_pattern_mapping = get_output_mapping_for_pcg_pattern_match(
+            match, sub.pcg_pattern, spcg);
+    bidict<parallel_tensor_guid_t, OutputGraphExprNodeOutput>
+        output_post_outexpr_mapping = get_output_graph_expr_output_mapping(
+            output_expr_to_result_sub_pcg_mapping,
+            sub.output_graph_expr,
+            substitution_output_graph);
+
+    std::unordered_set<SubParallelComputationGraphEdge> incoming_to_sub_edges;
+    for (auto const &[pattern_input, base_graph_tensor] :
+         match.input_assignment) {
+      OutputGraphExprInput output_expr_input =
+          sub.inputs_mapping.at_l(pattern_input);
+      input_parallel_tensor_guid_t output_graph_input =
+          output_expr_to_result_sub_pcg_mapping.input_mapping.at_r(
+              output_expr_input);
+      std::unordered_set<parallel_tensor_use_t> uses = get_parallel_tensor_uses(
+          substitution_output_graph,
+          open_parallel_tensor_guid_from_input(output_graph_input));
+      for (parallel_tensor_use_t const &use : uses) {
+        SubParallelComputationGraphEdge new_edge =
+            subpcg_edge_from_tensor_and_use(base_graph_tensor, use);
+        incoming_to_sub_edges.insert(new_edge);
+      }
+    }
+
+    std::unordered_set<SubParallelComputationGraphEdge> outgoing_from_sub_edges;
+    for (ParallelComputationGraphEdge const &outgoing_edge :
+         get_subgraph_outgoing_edges(spcg, matched_nodes)) {
+      parallel_tensor_guid_t original_tensor =
+          get_parallel_tensor(outgoing_edge);
+      PatternNodeOutput pattern_tensor =
+          output_orig_pattern_mapping.at_r(original_tensor);
+      OutputGraphExprNodeOutput output_graph_tensor =
+          sub.outputs_mapping.at_l(pattern_tensor);
+      parallel_tensor_guid_t new_tensor =
+          output_post_outexpr_mapping.at_r(output_graph_tensor);
+
+      SubParallelComputationGraphEdge new_edge =
+          subpcg_edge_from_tensor_and_dst(
+              new_tensor,
+              get_dst_layer(outgoing_edge),
+              get_dst_layer_input_idx(outgoing_edge));
+      outgoing_from_sub_edges.insert(new_edge);
+    }
+
+    return set_union(std::vector{
+        post_edges_from_orig,
+        post_edges_from_sub,
+        incoming_to_sub_edges,
+        outgoing_from_sub_edges,
+    });
+  }();
+
+  std::unordered_map<open_parallel_tensor_guid_t, ParallelTensorAttrs>
+      post_value_data = [&] {
+        std::unordered_map<open_parallel_tensor_guid_t, ParallelTensorAttrs>
+            post_value_data_from_orig = filter_keys(
+                pre_data.value_data, [&](open_parallel_tensor_guid_t const &t) {
+                  return visit_open_parallel_tensor_guid(
+                      t,
+                      overload{
+                          [&](parallel_tensor_guid_t const &tensor) {
+                            return contains(post_nodes_from_original_graph,
+                                            get_source_layer(tensor));
+                          },
+                          [](input_parallel_tensor_guid_t const &) {
+                            return true;
+                          },
+                      });
+                });
+
+        return merge_disjoint_maps(post_value_data_from_orig,
+                                   output_graph_data.value_data);
+      }();
+
+  SubParallelComputationGraphData post_data = SubParallelComputationGraphData{
+      post_node_data,
+      post_edges,
+      pre_data.inputs, // graph inputs carry over unchanged
+      post_value_data,
+  };
+
+  assert(is_subseteq_of(keys(post_node_data), keys(machine_views)));
+  // Drop the machine views of layers that are no longer in the graph.
+  for (auto it = machine_views.begin(); it != machine_views.end();) {
+    if (post_node_data.find(it->first) == post_node_data.end()) {
+      it = machine_views.erase(it);
+    } else {
+      ++it;
+    }
+  }
+
+  assert(keys(post_node_data) == keys(machine_views));
+
+  return SearchResult{
+      pcg_from_sub_pcg_by_dropping_inputs(sub_pcg_from_graph_data(post_data)),
+      MachineMapping{machine_views}};
+}
+
+} // namespace FlexFlow
diff --git a/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc b/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc
index 49d528e4ab..0743301e8f 100644
--- a/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc
+++ b/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc
@@ -16,9 +16,13 @@
 #include "pcg/machine_view.h"
 #include "pcg/parallel_computation_graph/parallel_computation_graph.h"
 #include "utils/containers/contains.h"
+#include "utils/containers/contains_key.h"
 #include "utils/containers/flatmap.h"
 #include "utils/containers/generate_map.h"
 #include "utils/containers/get_all_assignments.h"
+#include "utils/containers/keys.h"
+#include "utils/containers/merge_maps.h"
+#include "utils/containers/set_minus.h"
 #include "utils/containers/unordered_set_of.h"
 #include "utils/exception.h"
 #include "utils/overload.h"
@@ -80,17 +84,23 @@ MachineMappingResult
                                     &parallel_split_transformation) {
 
   auto get_boundary_machine_view_assignments =
-      [&](std::unordered_set<BinaryTreePath> const &boundary_layers)
+      [&](std::unordered_set<BinaryTreePath> const &boundary_layers,
+          MachineMappingProblemTree const &t,
+          BinaryTreePathEntry const &prefix)
       -> std::unordered_set<ParallelLayerGuidObliviousMachineMapping> {
+    std::unordered_set<BinaryTreePath> unconstrained_boundary_layers =
+        set_minus(boundary_layers,
+                  keys(restrict_to_child(constraints, prefix).machine_views));
+
     std::unordered_map<BinaryTreePath, std::unordered_set<MachineView>>
         allowed = generate_map(
-            boundary_layers,
+            unconstrained_boundary_layers,
             [&](BinaryTreePath const &l) -> std::unordered_set<MachineView> {
+              MachineMappingProblemTree subtree_at_path =
+                  expect(mm_problem_tree_get_subtree_at_path(t, l),
+                         "Failed to get subtree at path");
               UnmappedOpCostEstimateKey leaf =
-                  mm_problem_tree_get_subtree_at_path(
-                      MachineMappingProblemTree{series_split}, l)
-                      .value()
-                      .get<UnmappedOpCostEstimateKey>();
+                  subtree_at_path.get<UnmappedOpCostEstimateKey>();
               return context.allowed_machine_views(leaf, resources);
             });
     return transform(
@@ -138,24 +148,37 @@ MachineMappingResult
 
   for (ParallelLayerGuidObliviousMachineMapping const
            &assigned_pre_machine_views :
-       get_boundary_machine_view_assignments(get_src_layers(tensor_movement))) {
+       get_boundary_machine_view_assignments(get_src_layers(tensor_movement),
+                                             series_split.get_left_child(),
+                                             BinaryTreePathEntry::LEFT_CHILD)) {
 
     MachineMappingResult pre_result =
         eval_pre_boundary_mapping(assigned_pre_machine_views);
 
+    if (is_infeasible(pre_result)) {
+      continue;
+    }
+
     for (ParallelLayerGuidObliviousMachineMapping const
              &assigned_post_machine_views :
          get_boundary_machine_view_assignments(
-             get_dst_layers(tensor_movement))) {
+             get_dst_layers(tensor_movement),
+             series_split.get_right_child(),
+             BinaryTreePathEntry::RIGHT_CHILD)) {
 
       MachineMappingResult post_result =
           eval_post_boundary_mapping(assigned_post_machine_views);
 
+      if (is_infeasible(post_result)) {
+        continue;
+      }
+
       TensorSetMovement comm_across_split =
           concretize_abstracted_tensor_set_movement(
               tensor_movement,
-              /*pre_mapping=*/assigned_pre_machine_views,
-              /*post_mapping=*/assigned_post_machine_views);
+              /*pre_mapping=*/pre_result.raw_result.value().machine_mapping,
+              /*post_mapping=*/post_result.raw_result.value().machine_mapping);
+
       float cost_across_split =
           context.cost_estimator.estimate_cost(comm_across_split);
 
diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc
index 82c8274808..07bde820e9 100644
--- a/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc
+++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc
@@ -1,7 +1,16 @@
 #include "compiler/machine_mapping/machine_mapping.h"
+#include "compiler/series_parallel/pcg/pcg_binary_sp_decomposition.h"
+#include "pcg/machine_specification.h"
+#include "pcg/machine_view.h"
+#include "pcg/operator_task_space.dtg.h"
+#include "pcg/operator_task_space.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph.h"
 #include "utils/containers/are_disjoint.h"
 #include "utils/containers/keys.h"
+#include "utils/containers/map_keys.h"
 #include "utils/containers/merge_maps.h"
+#include "utils/containers/transform.h"
+#include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/get_subtree_at_path.h"
 
 namespace FlexFlow {
 
@@ -15,4 +24,39 @@ bool nodes_are_disjoint(MachineMapping const &m1, MachineMapping const &m2) {
   return are_disjoint(keys(m1.machine_views), keys(m2.machine_views));
 }
 
+// Returns the layer stored at the leaf that `path` reaches in
+// `sp_decomposition`; throws std::runtime_error if no leaf is found there.
+parallel_layer_guid_t
+    get_layer_from_path(PCGBinarySPDecomposition const &sp_decomposition,
+                        BinaryTreePath const &path) {
+  std::optional<PCGBinarySPDecomposition> maybe_subtree = get_subtree_at_path(
+      sp_decomposition, generic_impl_for_pcg_sp_tree(), path);
+  if (maybe_subtree == std::nullopt) {
+    throw std::runtime_error(fmt::format("Invalid tree path {}", path));
+  }
+
+  PCGBinarySPDecomposition subtree = maybe_subtree.value();
+  if (subtree.is_leaf()) {
+    return subtree.require_leaf();
+  }
+  throw std::runtime_error(
+      fmt::format("Invalid tree path to a leaf: found {} instead", subtree));
+}
+
+std::optional<MachineMapping> get_machine_mapping_from_machine_mapping_result(
+    PCGBinarySPDecomposition const &sp_decomposition,
+    MachineMappingResult const &mm_result) {
+  // Re-keys the mapping of a feasible result from tree paths to the layers
+  // found at those paths, via the project's `transform` over the optional.
+  auto layer_at = [&](BinaryTreePath const &path) {
+    return get_layer_from_path(sp_decomposition, path);
+  };
+  auto to_mapping = [&](FeasibleMachineMappingResult const &feasible) {
+    return MachineMapping{
+        map_keys(feasible.machine_mapping.raw_mapping, layer_at),
+    };
+  };
+  return transform(mm_result.raw_result, to_mapping);
+}
+
 } // namespace FlexFlow
diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping_mutation_set.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping_mutation_set.cc
new file mode 100644
index 0000000000..15648eab74
--- /dev/null
+++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping_mutation_set.cc
@@ -0,0 +1,52 @@
+#include "compiler/machine_mapping/machine_mapping_mutation_set.h"
+#include "compiler/machine_mapping/allowed_machine_views.h"
+#include "pcg/machine_view.h"
+#include "pcg/operator_task_space.h"
+#include "utils/containers/vector_of.h"
+#include "utils/nonnegative_int/nonnegative_range.h"
+#include "utils/random_utils.h"
+#include "utils/vector.h"
+
+namespace FlexFlow {
+
+// Assigns every layer of `pcg` an arbitrary allowed machine view for
+// `device_type`; returns std::nullopt if some layer has no allowed view.
+std::optional<MachineMapping>
+    get_naive_mapping(ParallelComputationGraph &pcg,
+                      MachineSpecification const &resources,
+                      DeviceType const &device_type) {
+  std::unordered_map<parallel_layer_guid_t, MachineView> machine_views;
+  for (parallel_layer_guid_t const &layer : topological_ordering(pcg)) {
+    std::unordered_set<MachineView> allowed = get_allowed_machine_views(
+        resources, get_operator_task_space(pcg, layer), device_type);
+    if (allowed.empty()) {
+      return std::nullopt;
+    }
+    machine_views.insert({layer, *allowed.begin()});
+  }
+  return MachineMapping{machine_views};
+}
+
+// Picks a random layer of `mapped_pcg` and reassigns it a random allowed
+// machine view; returns std::nullopt when there is nothing to mutate.
+std::optional<MachineMapping>
+    get_random_mutation(SearchResult mapped_pcg,
+                        MachineSpecification const &resources,
+                        DeviceType const &device_type) {
+  ParallelComputationGraph const &pcg = mapped_pcg.pcg;
+  std::vector<parallel_layer_guid_t> layers = topological_ordering(pcg);
+  if (layers.empty()) {
+    return std::nullopt;
+  }
+  parallel_layer_guid_t random_layer = select_random(layers);
+  OperatorTaskSpace task = get_operator_task_space(pcg, random_layer);
+  std::vector<MachineView> candidates =
+      vector_of(get_allowed_machine_views(resources, task, device_type));
+  if (candidates.empty()) { // no allowed view to switch to
+    return std::nullopt;
+  }
+  MachineMapping machine_mapping = mapped_pcg.machine_mapping;
+  machine_mapping.machine_views.at(random_layer) = select_random(candidates);
+  return machine_mapping;
+}
+} // namespace FlexFlow
diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc
index 367af3701e..1d000ff041 100644
--- a/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc
+++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc
@@ -1,14 +1,50 @@
 #include "compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.h"
+#include "compiler/machine_mapping/abstracted_tensor_set_movement/abstracted_tensor_set_movement.h"
 #include "compiler/machine_mapping/abstracted_tensor_set_movement/get_abstracted_tensor_set_movement_across_split.h"
 #include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h"
 #include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h"
 #include "compiler/machine_mapping/transitive_reduced_pcg.h"
 #include "compiler/series_parallel/pcg/pcg_binary_sp_decomposition.h"
 #include "pcg/parallel_computation_graph/parallel_computation_graph.h"
+#include "utils/containers/all_of.h"
 #include "utils/overload.h"
 
 namespace FlexFlow {
 
+// Returns true iff, at every series split, each src path of its tensor
+// movement resolves in the left child and each dst path in the right child.
+bool is_valid_machine_mapping_problem_tree(
+    MachineMappingProblemTree const &problem_tree) {
+  auto has_all_paths = [](MachineMappingProblemTree const &t,
+                          std::unordered_set<BinaryTreePath> const &paths) {
+    return all_of(paths, [&](BinaryTreePath const &p) {
+      return mm_problem_tree_get_subtree_at_path(t, p).has_value();
+    });
+  };
+
+  return problem_tree.visit<bool>(overload{
+      [&](MMProblemTreeSeriesSplit const &series_split) {
+        MachineMappingProblemTree const &lhs = series_split.get_left_child();
+        MachineMappingProblemTree const &rhs = series_split.get_right_child();
+        AbstractedTensorSetMovement movement =
+            series_split.tensor_set_movement;
+
+        // Path checks first; the recursion only runs if they pass.
+        return has_all_paths(lhs, get_src_layers(movement)) &&
+               has_all_paths(rhs, get_dst_layers(movement)) &&
+               is_valid_machine_mapping_problem_tree(lhs) &&
+               is_valid_machine_mapping_problem_tree(rhs);
+      },
+      [&](MMProblemTreeParallelSplit const &parallel_split) {
+        return is_valid_machine_mapping_problem_tree(
+                   parallel_split.get_left_child()) &&
+               is_valid_machine_mapping_problem_tree(
+                   parallel_split.get_right_child());
+      },
+      [&](UnmappedOpCostEstimateKey const &) { return true; },
+  });
+}
+
 MachineMappingProblemTree get_machine_mapping_problem_tree(
     ParallelComputationGraph const &pcg,
     PCGBinarySPDecomposition const &sp_decomposition_tree) {
@@ -23,31 +59,43 @@ MachineMappingProblemTree get_machine_mapping_problem_tree(
         [&](PCGBinarySeriesSplit const &series) {
           AbstractedTensorSetMovement tensor_movement =
               get_abstracted_tensor_set_movement_across_split(tr_pcg, series);
-          return MachineMappingProblemTree{
+          MachineMappingProblemTree result = MachineMappingProblemTree{
               MMProblemTreeSeriesSplit{
                   /*tensor_set_movement=*/tensor_movement,
                   /*lhs=*/to_problem_tree(series.get_left_child()),
                   /*rhs=*/to_problem_tree(series.get_right_child()),
               },
           };
+          assert(is_valid_machine_mapping_problem_tree(result));
+          return result;
         },
         [&](PCGBinaryParallelSplit const &parallel) {
-          return MachineMappingProblemTree{
+          MachineMappingProblemTree result = MachineMappingProblemTree{
               MMProblemTreeParallelSplit{
                   to_problem_tree(parallel.get_left_child()),
                   to_problem_tree(parallel.get_right_child()),
               },
           };
+          assert(is_valid_machine_mapping_problem_tree(result));
+          return result;
         },
         [&](parallel_layer_guid_t const &leaf) {
-          return MachineMappingProblemTree{
+          MachineMappingProblemTree result = MachineMappingProblemTree{
               get_unmapped_op_cost_estimate_key_for_layer(pcg, leaf),
           };
+          assert(is_valid_machine_mapping_problem_tree(result));
+          return result;
         },
     });
   };
 
-  return to_problem_tree(sp_decomposition_tree);
+  MachineMappingProblemTree mm_tree = to_problem_tree(sp_decomposition_tree);
+
+  if (!is_valid_machine_mapping_problem_tree(mm_tree)) {
+    throw std::runtime_error("Invalid machine mapping problem tree generated");
+  }
+
+  return mm_tree;
 }
 
 } // namespace FlexFlow
diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.cc
index 1e39a7be19..7834938e41 100644
--- a/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.cc
+++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.cc
@@ -1,4 +1,6 @@
 #include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h"
+#include "compiler/machine_mapping/abstracted_tensor_set_movement/abstracted_tensor_set_movement.h"
+#include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/as_dot.h"
 #include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/get_all_leaf_paths.h"
 #include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/get_leaves.h"
 #include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/get_subtree_at_path.h"
@@ -88,4 +90,54 @@ std::optional<MachineMappingProblemTree>
       tree, generic_binary_sp_impl_for_mm_problem_tree(), path);
 }
 
+// Builds the dot string for `tree` by delegating to the generic as_dot with
+// label callbacks: series show src/dst path sets, parallel "P", leaves "".
+std::string as_dot(MachineMappingProblemTree const &tree) {
+  auto entry_label = [](BinaryTreePathEntry const &entry) -> std::string {
+    if (entry == BinaryTreePathEntry::LEFT_CHILD) {
+      return "l";
+    }
+    assert(entry == BinaryTreePathEntry::RIGHT_CHILD);
+    return "r";
+  };
+
+  auto path_label = [=](BinaryTreePath const &path) -> std::string {
+    return "(" + join_strings(path.entries, ", ", entry_label) + ")";
+  };
+
+  auto path_set_label =
+      [=](std::unordered_set<BinaryTreePath> const &paths) -> std::string {
+    return "(" + join_strings(paths, ", ", path_label) + ")";
+  };
+
+  std::function<std::string(MMProblemTreeSeriesSplit const &)>
+      get_series_label =
+          [=](MMProblemTreeSeriesSplit const &series) -> std::string {
+    std::string srcs =
+        path_set_label(get_src_layers(series.tensor_set_movement));
+    std::string dsts =
+        path_set_label(get_dst_layers(series.tensor_set_movement));
+    return fmt::format("srcs={} dsts={}", srcs, dsts);
+  };
+
+  std::function<std::string(MMProblemTreeParallelSplit const &)>
+      get_parallel_label =
+          [](MMProblemTreeParallelSplit const &) -> std::string {
+    return "P";
+  };
+
+  std::function<std::string(UnmappedOpCostEstimateKey const &)> get_leaf_label =
+      [](UnmappedOpCostEstimateKey const &) -> std::string { return ""; };
+
+  return as_dot(tree,
+                generic_binary_sp_impl_for_mm_problem_tree(),
+                get_series_label,
+                get_parallel_label,
+                get_leaf_label);
+}
+
+void debug_print_dot(MachineMappingProblemTree const &tree) {
+  std::cout << as_dot(tree) << '\n' << std::flush; // newline, then flush
+}
+
 } // namespace FlexFlow
diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.cc
index 990b287f8b..b6d701cb98 100644
--- a/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.cc
+++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.cc
@@ -1,4 +1,5 @@
 #include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h"
+#include "pcg/operator_task_space.h"
 #include "pcg/parallel_computation_graph/parallel_computation_graph.h"
 #include "pcg/parallel_computation_graph/parallel_tensor_guid_t.dtg.h"
 
@@ -18,6 +19,8 @@ UnmappedOpCostEstimateKey get_unmapped_op_cost_estimate_key_for_layer(
       transform(get_incoming_weights(pcg, layer), get_tensor_shape),
       /*output_shapes=*/
       transform(get_layer_outputs(pcg, layer), get_tensor_shape),
+      /*op_task_space=*/
+      get_operator_task_space(pcg, layer),
   };
 }
 
diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping_result.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping_result.cc
index 3409f7f871..031b7f7fc5 100644
--- a/lib/compiler/src/compiler/machine_mapping/machine_mapping_result.cc
+++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping_result.cc
@@ -135,4 +135,12 @@ MachineMappingResult
   };
 }
 
+float get_runtime_cost(MachineMappingResult const &mm_result) {
+  // A result without a raw_result is reported as infinitely costly.
+  if (!mm_result.raw_result.has_value()) {
+    return std::numeric_limits<float>::infinity();
+  }
+  return mm_result.raw_result.value().runtime;
+}
+
 } // namespace FlexFlow
diff --git a/lib/compiler/src/compiler/mcmc/generic_mcmc_algorithm.cc b/lib/compiler/src/compiler/mcmc/generic_mcmc_algorithm.cc
new file mode 100644
index 0000000000..1bf4f5c2b7
--- /dev/null
+++ b/lib/compiler/src/compiler/mcmc/generic_mcmc_algorithm.cc
@@ -0,0 +1 @@
+#include "compiler/mcmc/generic_mcmc_algorithm.h"
diff --git a/lib/compiler/src/compiler/mcmc/generic_mcmc_state.cc b/lib/compiler/src/compiler/mcmc/generic_mcmc_state.cc
new file mode 100644
index 0000000000..6aa4dd5eff
--- /dev/null
+++ b/lib/compiler/src/compiler/mcmc/generic_mcmc_state.cc
@@ -0,0 +1,12 @@
+#include "compiler/mcmc/generic_mcmc_state.h"
+#include "utils/archetypes/ordered_value_type.h"
+#include "utils/archetypes/value_type.h"
+
+namespace FlexFlow {
+
+// Explicitly instantiate the template for archetype and float scores
+// (NOTE(review): presumably to force compilation of the header template).
+template struct Generic_MCMC_state<value_type<0>, ordered_value_type<1>>;
+template struct Generic_MCMC_state<value_type<0>, float>;
+
+} // namespace FlexFlow
diff --git a/lib/compiler/src/compiler/mcmc/mcmc_over_mapped_pcg.cc b/lib/compiler/src/compiler/mcmc/mcmc_over_mapped_pcg.cc
new file mode 100644
index 0000000000..ab7769679e
--- /dev/null
+++ b/lib/compiler/src/compiler/mcmc/mcmc_over_mapped_pcg.cc
@@ -0,0 +1,73 @@
+#include "compiler/mcmc/mcmc_over_mapped_pcg.h"
+#include "compiler/machine_mapping/apply_substitution_and_update_machine_mapping.h"
+#include "compiler/machine_mapping/machine_mapping_mutation_set.h"
+#include "compiler/mcmc/generic_mcmc_algorithm.h"
+#include "compiler/search_result.h"
+#include "compiler/task_graph_simulator/task_simulator.h"
+#include "substitutions/pcg_pattern.h"
+#include "substitutions/pcg_pattern_match.h"
+#include "substitutions/unity_substitution_set.h"
+#include "utils/optional.h"
+
+namespace FlexFlow {
+
+// MCMC search over (PCG, machine mapping) pairs, starting from the naive
+// mapping of pcg and scored by simulated forward pass time. Throws
+// std::runtime_error if substitution_interval is 0 or no naive mapping exists.
+SearchResult mcmc_graph_optimize(ParallelComputationGraph &pcg,
+                                 CostEstimator const &cost_estimator,
+                                 MachineSpecification const &resources,
+                                 MCMCOverMappedPCGConfig const &search_config) {
+  int const substitution_interval =
+      search_config.substitution_interval.unwrap_nonnegative();
+  if (substitution_interval == 0) { // i % 0 below would be undefined behavior
+    throw std::runtime_error("substitution_interval must be positive");
+  }
+
+  std::optional<MachineMapping> naive_mapping =
+      get_naive_mapping(pcg, resources, search_config.device_type);
+  if (naive_mapping == std::nullopt) {
+    throw std::runtime_error("Failed to find any solutions");
+  }
+  SearchResult starting_state = SearchResult{pcg, naive_mapping.value()};
+
+  auto generating_func = [&](SearchResult const &mapped_pcg,
+                             nonnegative_int i) -> std::optional<SearchResult> {
+    if (i.unwrap_nonnegative() % substitution_interval != 0) {
+      // machine mapping mutations on all other iterations
+      std::optional<MachineMapping> new_machine_mapping =
+          get_random_mutation(mapped_pcg, resources, search_config.device_type);
+      if (new_machine_mapping == std::nullopt) {
+        return std::nullopt;
+      }
+      return SearchResult{mapped_pcg.pcg, new_machine_mapping.value()};
+    }
+    // substitutions every (substitution_interval) iterations
+    std::optional<Substitution> random_substitution =
+        get_random_substitution(resources);
+    if (random_substitution == std::nullopt) {
+      return std::nullopt;
+    }
+    std::optional<PCGPatternMatch> pattern_match =
+        get_random_pattern_match(random_substitution.value().pcg_pattern,
+                                 sub_pcg_from_full_pcg(mapped_pcg.pcg));
+    if (pattern_match == std::nullopt) {
+      return std::nullopt;
+    }
+    return apply_substitution_and_update_machine_mapping(
+        mapped_pcg, random_substitution.value(), pattern_match.value());
+  };
+
+  auto scoring_func = [&](SearchResult const &mapped_pcg) -> float {
+    return task_simulator_estimate_forward_pass_time(
+        mapped_pcg.pcg, cost_estimator, mapped_pcg.machine_mapping, resources);
+  };
+
+  GenericMCMCConfig config =
+      GenericMCMCConfig{/*temperature*/ search_config.temperature,
+                        /*num_iterations*/ search_config.num_iterations};
+  return minimize_score(starting_state, generating_func, scoring_func, config)
+      .get_state();
+}
+
+} // namespace FlexFlow
diff --git a/lib/compiler/src/compiler/search_result.cc b/lib/compiler/src/compiler/search_result.cc
new file mode 100644
index 0000000000..0afc10723a
--- /dev/null
+++ b/lib/compiler/src/compiler/search_result.cc
@@ -0,0 +1,15 @@
+#include "compiler/search_result.h"
+
+namespace FlexFlow {
+// Debug rendering of a SearchResult: its pcg (as dot) and machine mapping.
+std::string format_as(SearchResult const &r) {
+  auto const pcg_dot = as_dot(r.pcg);
+  return fmt::format(
+      "<SearchResult\npcg={}\nmachine_mapping={}>", pcg_dot, r.machine_mapping);
+}
+
+// Writes fmt::to_string(r) to s and returns s.
+std::ostream &operator<<(std::ostream &s, SearchResult const &r) {
+  return s << fmt::to_string(r);
+}
+} // namespace FlexFlow
diff --git a/lib/compiler/src/compiler/series_parallel/pcg/pcg_binary_sp_decomposition.cc b/lib/compiler/src/compiler/series_parallel/pcg/pcg_binary_sp_decomposition.cc
index 5eb993c6ef..7b4670c608 100644
--- a/lib/compiler/src/compiler/series_parallel/pcg/pcg_binary_sp_decomposition.cc
+++ b/lib/compiler/src/compiler/series_parallel/pcg/pcg_binary_sp_decomposition.cc
@@ -1,7 +1,10 @@
 #include "compiler/series_parallel/pcg/pcg_binary_sp_decomposition.h"
+#include "compiler/series_parallel/pcg/get_pcg_series_parallel_decomposition.h"
+#include "compiler/series_parallel/pcg/pcg_binary_parallel_split.h"
 #include "compiler/series_parallel/pcg/pcg_binary_series_split.h"
 #include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/find_paths_to_leaf.h"
 #include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/get_leaves.h"
+#include "utils/graph/series_parallel/binary_sp_decomposition_tree/left_associative_binary_sp_tree_from_nary.h"
 #include "utils/overload.h"
 
 namespace FlexFlow {
@@ -82,8 +85,63 @@ BinarySPDecompositionTree
 }
 
 std::optional<PCGBinarySPDecomposition>
-    get_pcg_balanced_binary_sp_decomposition(ParallelComputationGraph const &) {
-  NOT_IMPLEMENTED();
+    get_pcg_balanced_binary_sp_decomposition(
+        ParallelComputationGraph const &pcg) {
+  // Returns nullopt (rather than throwing) when pcg has no SP decomposition.
+  // NOTE(review): the tree built here is left-associative, not balanced.
+  std::optional<SeriesParallelDecomposition> sp_decomp =
+      get_pcg_series_parallel_decomposition(pcg);
+  if (!sp_decomp.has_value()) {
+    return std::nullopt;
+  }
+  return pcg_binary_sp_decomposition_from_binary_sp_decomposition_tree(
+      left_associative_binary_sp_tree_from_nary(sp_decomp.value()));
+}
+
+// Converts both children of a series split into PCG decompositions.
+PCGBinarySeriesSplit pcg_binary_series_split_from_binary_series_split(
+    BinarySeriesSplit const &split) {
+  return PCGBinarySeriesSplit{
+      pcg_binary_sp_decomposition_from_binary_sp_decomposition_tree(
+          split.get_left_child()),
+      pcg_binary_sp_decomposition_from_binary_sp_decomposition_tree(
+          split.get_right_child()),
+  };
+}
+
+// Converts both children of a parallel split into PCG decompositions.
+PCGBinaryParallelSplit pcg_binary_parallel_split_from_binary_parallel_split(
+    BinaryParallelSplit const &split) {
+  return PCGBinaryParallelSplit{
+      pcg_binary_sp_decomposition_from_binary_sp_decomposition_tree(
+          split.get_left_child()),
+      pcg_binary_sp_decomposition_from_binary_sp_decomposition_tree(
+          split.get_right_child()),
+  };
+}
+
+// Maps a generic binary SP tree onto its PCG counterpart: splits go through
+// the two helpers above and each leaf Node becomes a parallel_layer_guid_t.
+PCGBinarySPDecomposition
+    pcg_binary_sp_decomposition_from_binary_sp_decomposition_tree(
+        BinarySPDecompositionTree const &sp_tree) {
+  return sp_tree.visit<PCGBinarySPDecomposition>(overload{
+      [](BinarySeriesSplit const &series) -> PCGBinarySPDecomposition {
+        return PCGBinarySPDecomposition{
+            pcg_binary_series_split_from_binary_series_split(series),
+        };
+      },
+      [](BinaryParallelSplit const &parallel) -> PCGBinarySPDecomposition {
+        return PCGBinarySPDecomposition{
+            pcg_binary_parallel_split_from_binary_parallel_split(parallel),
+        };
+      },
+      [](Node const &node) -> PCGBinarySPDecomposition {
+        return PCGBinarySPDecomposition{
+            parallel_layer_guid_t{node},
+        };
+      },
+  });
 }
 
 std::unordered_multiset<parallel_layer_guid_t>
diff --git a/lib/compiler/src/compiler/unity_algorithm/graph_optimize_state.cc b/lib/compiler/src/compiler/unity_algorithm/graph_optimize_state.cc
new file mode 100644
index 0000000000..22e319321b
--- /dev/null
+++ b/lib/compiler/src/compiler/unity_algorithm/graph_optimize_state.cc
@@ -0,0 +1,61 @@
+#include "compiler/unity_algorithm/graph_optimize_state.h"
+#include "pcg/parallel_computation_graph/parallel_tensor_guid_t.h"
+
+namespace FlexFlow {
+// Stores the given pcg and runtime_with_optimal_mm.
+GraphOptimizeState::GraphOptimizeState(ParallelComputationGraph const &pcg,
+                                       float runtime_with_optimal_mm)
+    : pcg(pcg), runtime_with_optimal_mm(runtime_with_optimal_mm) {}
+
+// Equality is PCG isomorphism; ordering (operator<) uses only the runtime.
+bool GraphOptimizeState::operator==(GraphOptimizeState const &other) const {
+  return pcgs_are_isomorphic(this->pcg, other.pcg);
+}
+
+bool GraphOptimizeState::operator!=(GraphOptimizeState const &other) const {
+  return !this->operator==(other);
+}
+
+bool GraphOptimizeState::operator<(GraphOptimizeState const &other) const {
+  return other.runtime_with_optimal_mm > this->runtime_with_optimal_mm;
+}
+
+std::string format_as(GraphOptimizeState const &st) {
+  return fmt::format("<GraphOptimizeState pcg={} runtime_with_optimal_mm={}>",
+                     as_dot(st.pcg),
+                     st.runtime_with_optimal_mm);
+}
+
+std::ostream &operator<<(std::ostream &s, GraphOptimizeState const &st) {
+  return s << fmt::to_string(st);
+}
+} // namespace FlexFlow
+
+namespace std {
+// Hashes, per layer in topological order, its attrs and input source indices.
+size_t hash<::FlexFlow::GraphOptimizeState>::operator()(
+    ::FlexFlow::GraphOptimizeState const &state) const {
+  // TODO(@wmdi): Eventually it might be good to use a proper graph hash like
+  // https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.graph_hashing.weisfeiler_lehman_graph_hash.html#networkx.algorithms.graph_hashing.weisfeiler_lehman_graph_hash
+  size_t seed = 0;
+  std::vector<::FlexFlow::parallel_layer_guid_t> layers =
+      topological_ordering(state.pcg);
+  ::FlexFlow::hash_combine(seed, layers.size());
+  for (::FlexFlow::parallel_layer_guid_t const &layer : layers) {
+    ::FlexFlow::hash_combine(seed, get_parallel_layer_attrs(state.pcg, layer));
+    std::vector<::FlexFlow::parallel_tensor_guid_t> inputs =
+        get_incoming_tensors(state.pcg, layer);
+    ::FlexFlow::hash_combine(seed, inputs.size());
+    for (::FlexFlow::parallel_tensor_guid_t input : inputs) {
+      auto const source_layer = get_source_layer(input); // once per input
+      for (size_t i = 0; i < layers.size(); ++i) {
+        if (source_layer == layers.at(i)) {
+          ::FlexFlow::hash_combine(seed, i);
+          break;
+        }
+      }
+    }
+  }
+  return seed;
+}
+} // namespace std
diff --git a/lib/compiler/src/compiler/unity_algorithm/unity_algorithm.cc b/lib/compiler/src/compiler/unity_algorithm/unity_algorithm.cc
new file mode 100644
index 0000000000..caaefbfdbf
--- /dev/null
+++ b/lib/compiler/src/compiler/unity_algorithm/unity_algorithm.cc
@@ -0,0 +1,138 @@
+#include "compiler/unity_algorithm/unity_algorithm.h"
+#include "compiler/machine_mapping/allowed_machine_views.h"
+#include "compiler/machine_mapping/get_optimal_machine_mapping.h"
+#include "compiler/machine_mapping/machine_mapping.h"
+#include "compiler/machine_mapping/machine_mapping_cache.h"
+#include "compiler/machine_mapping/machine_mapping_constraints.h"
+#include "compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.h"
+#include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h"
+#include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h"
+#include "compiler/machine_mapping/machine_mapping_result.h"
+#include "compiler/series_parallel/pcg/get_pcg_balanced_binary_sp_decomposition.h"
+#include "compiler/series_parallel/pcg/get_pcg_series_parallel_decomposition.h"
+#include "compiler/unity_algorithm/graph_optimize_state.h"
+#include "pcg/machine_specification.dtg.h"
+#include "pcg/operator_task_space.h"
+#include "substitutions/apply_substitution/apply_substitution.h"
+#include "substitutions/pcg_pattern.h"
+#include "substitutions/sub_parallel_computation_graph.h"
+#include "substitutions/substitution.h"
+#include "substitutions/unity_substitution_set.h"
+#include "utils/containers/generate_map.h"
+#include "utils/deduplicated_priority_queue.h"
+#include "utils/graph/node/algorithms.h"
+#include "utils/optional.h"
+
+namespace FlexFlow {
+
+/*
+ * Returns every PCG obtained by applying one of the given substitutions at one
+ * of its pattern matches in pcg (one result per substitution/match pair).
+ */
+std::vector<ParallelComputationGraph>
+    all_pcgs_obtained_by_applying_a_substitution(
+        ParallelComputationGraph const &pcg,
+        std::vector<Substitution> const &substitutions) {
+  SubParallelComputationGraph full_subpcg = sub_pcg_from_full_pcg(pcg);
+
+  std::vector<ParallelComputationGraph> rewritten_pcgs;
+  for (Substitution const &sub : substitutions) {
+    for (PCGPatternMatch const &match :
+         find_pattern_matches(sub.pcg_pattern, full_subpcg)) {
+      rewritten_pcgs.push_back(pcg_from_sub_pcg_by_dropping_inputs(
+          apply_substitution(full_subpcg, sub, match)));
+    }
+  }
+  return rewritten_pcgs;
+}
+
+// Searches the PCGs reachable from pcg via substitutions, scoring each by its
+// runtime under the optimal machine mapping. Fails via expect() if pcg has no
+// SP decomposition; throws std::runtime_error if no machine mapping is found.
+SearchResult graph_optimize(ParallelComputationGraph &pcg,
+                            CostEstimator const &cost_estimator,
+                            MachineSpecification const &resources,
+                            UnitySearchConfig const &search_config) {
+  std::vector<Substitution> substitutions = get_substitution_set(resources);
+  MachineMappingCache cached_subgraph_costs = empty_machine_mapping_cache();
+  DeduplicatedPriorityQueue<GraphOptimizeState> candidates;
+
+  MachineMappingContext context = MachineMappingContext{
+      /*cost_estimator=*/cost_estimator,
+      /*allowed_machine_views=*/
+      [&](UnmappedOpCostEstimateKey const &key,
+          MachineSpecification const &resources)
+          -> std::unordered_set<MachineView> {
+        return get_allowed_machine_views(
+            resources, key.op_task_space, DeviceType::GPU);
+      },
+  };
+
+  using OptimizedPCG =
+      std::pair<GraphOptimizeState, std::optional<MachineMapping>>;
+  // nullopt if the PCG has no SP decomposition (so no mapping problem tree)
+  auto optimize_pcg = [&](ParallelComputationGraph const &pcg)
+      -> std::optional<OptimizedPCG> {
+    std::optional<PCGBinarySPDecomposition> maybe_sp_decomp =
+        get_pcg_balanced_binary_sp_decomposition(pcg);
+    if (maybe_sp_decomp == std::nullopt) {
+      return std::nullopt;
+    }
+    PCGBinarySPDecomposition sp_decomp = maybe_sp_decomp.value();
+    MachineMappingProblemTree problem_tree =
+        get_machine_mapping_problem_tree(pcg, sp_decomp);
+    MachineMappingConstraints constraints =
+        get_unconstrained_solution_for_layers(get_all_leaf_paths(problem_tree));
+    MachineMappingResult mm_result = get_optimal_machine_mapping(
+        cached_subgraph_costs, context, problem_tree, resources, constraints);
+
+    return OptimizedPCG{
+        GraphOptimizeState{pcg, get_runtime_cost(mm_result)},
+        get_machine_mapping_from_machine_mapping_result(sp_decomp, mm_result),
+    };
+  };
+
+  GraphOptimizeState best_state =
+      expect(optimize_pcg(pcg), "Failed to get SP decomposition of PCG").first;
+  candidates.push(best_state);
+
+  for (int iteration = 0;
+       !candidates.empty() && iteration < search_config.budget;
+       ++iteration) {
+    GraphOptimizeState current_state = candidates.top();
+    candidates.pop();
+
+    if (current_state < best_state) {
+      best_state = current_state;
+    } else if (current_state.runtime_with_optimal_mm >
+               best_state.runtime_with_optimal_mm * search_config.alpha) {
+      continue;
+    }
+
+    for (ParallelComputationGraph const &new_pcg :
+         all_pcgs_obtained_by_applying_a_substitution(current_state.pcg,
+                                                      substitutions)) {
+      std::optional<OptimizedPCG> new_pcg_optimize_result =
+          optimize_pcg(new_pcg);
+      if (new_pcg_optimize_result == std::nullopt) {
+        continue;
+      }
+      GraphOptimizeState new_state = new_pcg_optimize_result.value().first;
+      if (new_state.runtime_with_optimal_mm <= search_config.threshold &&
+          get_nodes(new_pcg.raw_graph).size() <= search_config.max_num_ops) {
+        candidates.push(new_state);
+      }
+    }
+  }
+
+  // best_state's PCG was decomposed successfully above, so value() is safe
+  std::optional<MachineMapping> best_mapping =
+      optimize_pcg(best_state.pcg).value().second;
+  if (best_mapping == std::nullopt) {
+    throw std::runtime_error("Failed to find any solutions");
+  }
+  return SearchResult{/*pcg=*/best_state.pcg,
+                      /*machine_mapping=*/best_mapping.value()};
+}
+
+} // namespace FlexFlow
diff --git a/lib/compiler/src/unity_algorithm.cc b/lib/compiler/src/unity_algorithm.cc
deleted file mode 100644
index 86a211c535..0000000000
--- a/lib/compiler/src/unity_algorithm.cc
+++ /dev/null
@@ -1,93 +0,0 @@
-#include "compiler/unity_algorithm.h"
-#include "compiler/graph_optimize_state.h"
-#include "compiler/machine_mapping/get_optimal_machine_mapping.h"
-#include "pcg/machine_specification.dtg.h"
-#include "substitutions/substitution.h"
-#include "utils/deduplicated_priority_queue.h"
-#include "utils/graph/node/algorithms.h"
-namespace FlexFlow {
-
-/*
- * Gets all substitutions applicable to a PCG
- */
-std::vector<Substitution>
-    get_all_applicable_substitutions(ParallelComputationGraph const &pcg) {
-  NOT_IMPLEMENTED();
-}
-
-/*
- * Applies a substitution to all possible positions in PCG
- */
-std::vector<ParallelComputationGraph>
-    apply_substitution(ParallelComputationGraph const &pcg,
-                       Substitution const &) {
-  NOT_IMPLEMENTED();
-}
-
-GraphOptimizeResult graph_optimize(
-    ParallelComputationGraph &pcg,
-    CostEstimator const &cost_estimator,
-    MachineSpecification const &resources,
-    std::function<std::unordered_set<MachineView>(
-        ParallelLayerAttrs const &, MachineSpecification const &)> const
-        &allowed_machine_views,
-    OptimizerConfig const &opt_config) {
-  NOT_IMPLEMENTED();
-
-  // std::vector<Substitution> substitutions =
-  //     get_all_applicable_substitutions(pcg);
-  //
-  // MachineMappingCache cached_subgraph_costs;
-  // DeduplicatedPriorityQueue<GraphOptimizeState> candidates;
-  //
-  // MachineMappingResult original_pcg_cost =
-  //     get_optimal_machine_mapping(pcg,
-  //                                 allowed_machine_views,
-  //                                 cost_estimator,
-  //                                 resources,
-  //                                 cached_subgraph_costs);
-  //
-  // GraphOptimizeState initial_state = {
-  //     GraphOptimizeResult(pcg, original_pcg_cost.machine_mapping),
-  //     original_pcg_cost.runtime};
-  //
-  // GraphOptimizeState best_state = initial_state;
-  // candidates.push(initial_state);
-  //
-  // for (int iteration = 0; !candidates.empty() && iteration <
-  // opt_config.budget;
-  //      ++iteration) {
-  //   GraphOptimizeState current_state = candidates.top();
-  //   candidates.pop();
-  //
-  //   if (current_state.runtime < best_state.runtime) {
-  //     best_state = current_state;
-  //   } else if (current_state.runtime > best_state.runtime * opt_config.alpha)
-  //   {
-  //     continue;
-  //   }
-  //
-  //   for (Substitution const &substitution : substitutions) {
-  //     for (ParallelComputationGraph const &new_pcg : apply_substitution(
-  //              current_state.graph_optimize_result.pcg, substitution)) {
-  //       MachineMappingResult new_pcg_cost =
-  //           get_optimal_machine_mapping(new_pcg,
-  //                                       allowed_machine_views,
-  //                                       cost_estimator,
-  //                                       resources,
-  //                                       cached_subgraph_costs);
-  //       GraphOptimizeState new_state{
-  //           GraphOptimizeResult(new_pcg, new_pcg_cost.machine_mapping),
-  //           new_pcg_cost.runtime};
-  //       if (new_pcg_cost.runtime <= opt_config.threshold &&
-  //           get_nodes(new_pcg.raw_graph).size() <= opt_config.max_num_ops) {
-  //         candidates.push(new_state);
-  //       }
-  //     }
-  //   }
-  // }
-
-  // return best_state.graph_optimize_result;
-}
-
-} // namespace FlexFlow
diff --git a/lib/compiler/test/src/allowed_machine_views.cc b/lib/compiler/test/src/allowed_machine_views.cc
deleted file mode 100644
index 817cc80700..0000000000
--- a/lib/compiler/test/src/allowed_machine_views.cc
+++ /dev/null
@@ -1,110 +0,0 @@
-#include "compiler/allowed_machine_views.h"
-#include "doctest/doctest.h"
-#include "utils/containers/extend.h"
-#include "utils/containers/range.h"
-#include "utils/containers/transform.h"
-#include "utils/containers/unordered_set_of.h"
-#include "utils/containers/zip.h"
-#include "utils/fmt/unordered_set.h"
-
-using namespace FlexFlow;
-
-TEST_SUITE(FF_TEST_SUITE) {
-
-  TEST_CASE("get_allowed_machine_views") {
-
-    SUBCASE("1 degree of parallelism") {
-      MachineSpecification ms = MachineSpecification{
-          /*num_nodes=*/1_n,
-          /*num_cpus_per_node=*/5_n,
-          /*num_gpus_per_node=*/5_n,
-          /*inter_node_bandwidth=*/0,
-          /*intra_node_bandwidth=*/0,
-      };
-
-      OperatorTaskSpace task = OperatorTaskSpace{{3_n}};
-
-      std::unordered_set<MachineView> correct = {
-          MachineView{
-              MachineSpaceCoordinate{
-                  /*node_idx=*/0_n, /*device_idx=*/0_n, DeviceType::GPU},
-              {MachineViewDimension{stride_t{1_n},
-                                    MachineSpecificationDimension::INTRA_NODE}},
-          },
-
-          MachineView{
-              MachineSpaceCoordinate{
-                  /*node_idx=*/0_n, /*device_idx=*/1_n, DeviceType::GPU},
-              {MachineViewDimension{stride_t{1_n},
-                                    MachineSpecificationDimension::INTRA_NODE}},
-          },
-          MachineView{
-              MachineSpaceCoordinate{
-                  /*node_idx=*/0_n, /*device_idx=*/2_n, DeviceType::GPU},
-              {MachineViewDimension{stride_t{1_n},
-                                    MachineSpecificationDimension::INTRA_NODE}},
-          },
-          MachineView{
-              MachineSpaceCoordinate{
-                  /*node_idx=*/0_n, /*device_idx=*/0_n, DeviceType::GPU},
-              {MachineViewDimension{stride_t{2_n},
-                                    MachineSpecificationDimension::INTRA_NODE}},
-          },
-      };
-
-      std::unordered_set<MachineView> result =
-          get_allowed_machine_views(ms, task, DeviceType::GPU);
-
-      CHECK(correct == result);
-    }
-
-    SUBCASE("2 degrees of parallelism") {
-
-      MachineSpecification ms = MachineSpecification{
-          /*num_nodes=*/3_n,
-          /*num_cpus_per_node=*/3_n,
-          /*num_gpus_per_node=*/3_n,
-          /*inter_node_bandwidth=*/0,
-          /*intra_node_bandwidth=*/0,
-      };
-      OperatorTaskSpace task = OperatorTaskSpace{{2_n, 3_n}};
-
-      auto make_2d_view = [&](nonnegative_int start_node_idx,
-                              nonnegative_int start_device_idx,
-                              nonnegative_int stride1,
-                              nonnegative_int stride2,
-                              MachineSpecificationDimension m1,
-                              MachineSpecificationDimension m2) {
-        return MachineView{
-            MachineSpaceCoordinate{
-                start_node_idx, start_device_idx, DeviceType::GPU},
-            {MachineViewDimension{stride_t{stride1}, m1},
-             MachineViewDimension{stride_t{stride2}, m2}},
-        };
-      };
-
-      auto intra = MachineSpecificationDimension::INTRA_NODE;
-      auto inter = MachineSpecificationDimension::INTER_NODE;
-      std::unordered_set<MachineView> correct = {
-          make_2d_view(
-              0_n, 0_n, /*stride1=*/1_n, /*stride2=*/1_n, inter, intra),
-          make_2d_view(
-              1_n, 0_n, /*stride1=*/1_n, /*stride2=*/1_n, inter, intra),
-          make_2d_view(
-              0_n, 0_n, /*stride1=*/2_n, /*stride2=*/1_n, inter, intra),
-
-          make_2d_view(
-              0_n, 0_n, /*stride1=*/1_n, /*stride2=*/1_n, intra, inter),
-          make_2d_view(
-              0_n, 1_n, /*stride1=*/1_n, /*stride2=*/1_n, intra, inter),
-          make_2d_view(
-              0_n, 0_n, /*stride1=*/2_n, /*stride2=*/1_n, intra, inter),
-      };
-
-      std::unordered_set<MachineView> result =
-          get_allowed_machine_views(ms, task, DeviceType::GPU);
-
-      CHECK(correct == result);
-    }
-  }
-}
diff --git a/lib/compiler/test/src/compiler/machine_mapping/allowed_machine_views.cc b/lib/compiler/test/src/compiler/machine_mapping/allowed_machine_views.cc
new file mode 100644
index 0000000000..f176621a18
--- /dev/null
+++ b/lib/compiler/test/src/compiler/machine_mapping/allowed_machine_views.cc
@@ -0,0 +1,156 @@
+#include "compiler/machine_mapping/allowed_machine_views.h"
+#include "doctest/doctest.h"
+#include "utils/containers/extend.h"
+#include "utils/containers/range.h"
+#include "utils/containers/transform.h"
+#include "utils/containers/unordered_set_of.h"
+#include "utils/containers/zip.h"
+#include "utils/fmt/unordered_set.h"
+
+using namespace FlexFlow;
+
+TEST_SUITE(FF_TEST_SUITE) {
+
+  TEST_CASE("get_allowed_machine_views") {
+
+    // GPU MachineView with two dimensions: (stride_1, m1) and (stride_2, m2).
+    auto make_2d_view = [](nonnegative_int start_node_idx,
+                           nonnegative_int start_device_idx,
+                           nonnegative_int stride_1,
+                           nonnegative_int stride_2,
+                           MachineSpecificationDimension m1,
+                           MachineSpecificationDimension m2) {
+      MachineSpaceCoordinate start = MachineSpaceCoordinate{
+          start_node_idx, start_device_idx, DeviceType::GPU};
+      return MachineView{start,
+                         {MachineViewDimension{stride_t{stride_1}, m1},
+                          MachineViewDimension{stride_t{stride_2}, m2}}};
+    };
+    auto const intra = MachineSpecificationDimension::INTRA_NODE;
+    auto const inter = MachineSpecificationDimension::INTER_NODE;
+
+    SUBCASE("1 degree of parallelism") {
+      // one node with 5 GPUs; a task space with a single dimension of degree 3
+      MachineSpecification ms = MachineSpecification{
+          /*num_nodes=*/1_n,
+          /*num_cpus_per_node=*/5_n,
+          /*num_gpus_per_node=*/5_n,
+          /*inter_node_bandwidth=*/0,
+          /*intra_node_bandwidth=*/0,
+      };
+
+      OperatorTaskSpace task = OperatorTaskSpace{{3_n}};
+
+      std::unordered_set<MachineView> result =
+          get_allowed_machine_views(ms, task, DeviceType::GPU);
+
+      // expected views, all intra-node on node 0: stride 1 starting at device
+      // 0, 1 or 2, and stride 2 starting at device 0
+      std::unordered_set<MachineView> correct = {
+          // stride 1
+          MachineView{
+              MachineSpaceCoordinate{
+                  /*node_idx=*/0_n, /*device_idx=*/0_n, DeviceType::GPU},
+              {MachineViewDimension{stride_t{1_n}, intra}},
+          },
+          MachineView{
+              MachineSpaceCoordinate{
+                  /*node_idx=*/0_n, /*device_idx=*/1_n, DeviceType::GPU},
+              {MachineViewDimension{stride_t{1_n}, intra}},
+          },
+          MachineView{
+              MachineSpaceCoordinate{
+                  /*node_idx=*/0_n, /*device_idx=*/2_n, DeviceType::GPU},
+              {MachineViewDimension{stride_t{1_n}, intra}},
+          },
+          // stride 2
+          MachineView{
+              MachineSpaceCoordinate{
+                  /*node_idx=*/0_n, /*device_idx=*/0_n, DeviceType::GPU},
+              {MachineViewDimension{stride_t{2_n}, intra}},
+          },
+      };
+
+      CHECK(correct == result);
+    }
+
+    SUBCASE("2 degrees of parallelism") {
+      // 3 nodes with 3 GPUs each; a task space with degrees (2, 3)
+      MachineSpecification ms = MachineSpecification{
+          /*num_nodes=*/3_n,
+          /*num_cpus_per_node=*/3_n,
+          /*num_gpus_per_node=*/3_n,
+          /*inter_node_bandwidth=*/0,
+          /*intra_node_bandwidth=*/0,
+      };
+      OperatorTaskSpace task = OperatorTaskSpace{{2_n, 3_n}};
+      std::unordered_set<MachineView> result =
+          get_allowed_machine_views(ms, task, DeviceType::GPU);
+
+      std::unordered_set<MachineView> correct = {
+          // first dimension inter-node, second dimension intra-node
+          make_2d_view(
+              0_n, 0_n, /*stride_1=*/1_n, /*stride_2=*/1_n, inter, intra),
+          make_2d_view(
+              1_n, 0_n, /*stride_1=*/1_n, /*stride_2=*/1_n, inter, intra),
+          make_2d_view(
+              0_n, 0_n, /*stride_1=*/2_n, /*stride_2=*/1_n, inter, intra),
+          // first dimension intra-node, second dimension inter-node
+          make_2d_view(
+              0_n, 0_n, /*stride_1=*/1_n, /*stride_2=*/1_n, intra, inter),
+          make_2d_view(
+              0_n, 1_n, /*stride_1=*/1_n, /*stride_2=*/1_n, intra, inter),
+          make_2d_view(
+              0_n, 0_n, /*stride_1=*/2_n, /*stride_2=*/1_n, intra, inter),
+      };
+
+      CHECK(correct == result);
+    }
+
+    SUBCASE("2D operator task space, dimensions (1,1)") {
+      // 2 nodes with a single GPU each
+      MachineSpecification full_machine_spec = MachineSpecification{
+          /*num_nodes=*/2_n,
+          /*num_cpus_per_node=*/1_n,
+          /*num_gpus_per_node=*/1_n,
+          /*inter_node_bandwidth=*/1,
+          /*intra_node_bandwidth=*/1,
+      };
+      OperatorTaskSpace task = OperatorTaskSpace{{1_n, 1_n}};
+
+      std::unordered_set<MachineView> correct = {
+          make_2d_view(
+              0_n, 0_n, /*stride_1=*/1_n, /*stride_2=*/1_n, intra, intra),
+          make_2d_view(
+              1_n, 0_n, /*stride_1=*/1_n, /*stride_2=*/1_n, intra, intra)};
+      std::unordered_set<MachineView> result =
+          get_allowed_machine_views(full_machine_spec, task, DeviceType::GPU);
+      CHECK(correct == result);
+    }
+
+    SUBCASE("2D operator task space, dimensions (1,2)") {
+      // 2 nodes with 2 GPUs each; the task space below has degrees (1, 2)
+      MachineSpecification full_machine_spec = MachineSpecification{
+          /*num_nodes=*/2_n,
+          /*num_cpus_per_node=*/2_n,
+          /*num_gpus_per_node=*/2_n,
+          /*inter_node_bandwidth=*/1,
+          /*intra_node_bandwidth=*/1,
+      };
+      OperatorTaskSpace task = OperatorTaskSpace{{1_n, 2_n}};
+
+      std::unordered_set<MachineView> result =
+          get_allowed_machine_views(full_machine_spec, task, DeviceType::GPU);
+      std::unordered_set<MachineView> correct = {
+          make_2d_view(
+              0_n, 0_n, /*stride_1=*/1_n, /*stride_2=*/1_n, intra, intra),
+          make_2d_view(
+              0_n, 0_n, /*stride_1=*/1_n, /*stride_2=*/1_n, intra, inter),
+          make_2d_view(
+              1_n, 0_n, /*stride_1=*/1_n, /*stride_2=*/1_n, intra, intra),
+          make_2d_view(
+              0_n, 1_n, /*stride_1=*/1_n, /*stride_2=*/1_n, intra, inter)};
+      CHECK(correct == result);
+    }
+  }
+}
diff --git a/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc b/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc
index e506dea1d7..a45227011c 100644
--- a/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc
+++ b/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc
@@ -109,11 +109,14 @@ TEST_SUITE(FF_TEST_SUITE) {
         DataType::FLOAT,
     };
 
+    OperatorTaskSpace fake_op_task_space = OperatorTaskSpace{{}};
+
     UnmappedOpCostEstimateKey k1 = UnmappedOpCostEstimateKey{
         /*op_attrs=*/PCGOperatorAttrs{InputAttrs{tensor_shape}},
         /*input_shapes=*/{},
         /*weight_shapes=*/{},
         /*output_shapes=*/{},
+        /*op_task_space=*/fake_op_task_space,
     };
 
     UnmappedOpCostEstimateKey k2 = UnmappedOpCostEstimateKey{
@@ -126,6 +129,7 @@ TEST_SUITE(FF_TEST_SUITE) {
         /*input_shapes=*/{},
         /*weight_shapes=*/{},
         /*output_shapes=*/{},
+        /*op_task_space=*/fake_op_task_space,
     };
 
     ParallelTensorShape par_tensor_shape = lift_to_parallel(tensor_shape);
diff --git a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc
index 048f1ddcac..9059950742 100644
--- a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc
+++ b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc
@@ -1,8 +1,15 @@
 #include "compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.h"
 #include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h"
+#include "compiler/series_parallel/pcg/get_pcg_balanced_binary_sp_decomposition.h"
 #include "op-attrs/parallel_tensor_shape.h"
+#include "pcg/computation_graph_builder.h"
+#include "pcg/operator_task_space.h"
 #include "pcg/parallel_computation_graph/parallel_computation_graph.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h"
+#include "pcg/pcg_from_computation_graph.h"
+#include "utils/containers/extend.h"
 #include "utils/containers/get_only.h"
+#include "utils/containers/vector_of.h"
 #include <doctest/doctest.h>
 
 using namespace ::FlexFlow;
@@ -90,6 +97,14 @@ TEST_SUITE(FF_TEST_SUITE) {
 
     PCGOperatorAttrs input_attrs = PCGOperatorAttrs{InputAttrs{input_shape}};
 
+    auto make_operator_task_space = [&](ParallelTensorShape const &shape) {
+      std::vector<nonnegative_int> degrees;
+      extend(degrees, vector_of(ff_ordered_shard_degrees(shape)));
+      degrees.push_back(get_sum_degree(shape));
+      degrees.push_back(get_discard_copy_degree(shape));
+      return OperatorTaskSpace{degrees};
+    };
+
     auto make_input_key =
         [&](ParallelTensorShape const &parallel_tensor_shape) {
           return UnmappedOpCostEstimateKey{
@@ -97,6 +112,7 @@ TEST_SUITE(FF_TEST_SUITE) {
               /*input_shapes=*/{},
               /*weight_shapes=*/{},
               /*output_shapes=*/{parallel_tensor_shape},
+              /*op_task_space=*/make_operator_task_space(parallel_tensor_shape),
           };
         };
 
@@ -143,11 +159,15 @@ TEST_SUITE(FF_TEST_SUITE) {
       parallel_layer_guid_t relu_layer = relu_added.parallel_layer;
       parallel_tensor_guid_t relu_output = get_only(relu_added.outputs);
 
+      OperatorTaskSpace relu_task_space =
+          get_operator_task_space(pcg, relu_layer);
+
       UnmappedOpCostEstimateKey relu_key = UnmappedOpCostEstimateKey{
           /*op_attrs=*/relu_attrs,
           /*input_shapes=*/{par_input_shape},
           /*weight_shapes=*/{},
           /*output_shapes=*/{relu_output_shape},
+          /*op_task_space=*/relu_task_space,
       };
 
       PCGBinarySPDecomposition sp_decomposition = pcg_make_series(
@@ -228,11 +248,14 @@ TEST_SUITE(FF_TEST_SUITE) {
                              {input1_tensor, input2_tensor},
                              {});
       parallel_layer_guid_t ew_op_layer = ew_op_added.parallel_layer;
+      OperatorTaskSpace ew_op_task_space =
+          get_operator_task_space(pcg, ew_op_layer);
       UnmappedOpCostEstimateKey ew_op_key = UnmappedOpCostEstimateKey{
           /*op_attrs=*/ew_op_attrs,
           /*input_shapes=*/{par_input_shape, par_input_shape},
           /*weight_shapes=*/{},
           /*output_shapes=*/{ew_op_output_shape},
+          /*op_task_space=*/ew_op_task_space,
       };
 
       PCGBinarySPDecomposition sp_decomposition =
@@ -280,4 +303,46 @@ TEST_SUITE(FF_TEST_SUITE) {
       CHECK(result == correct);
     }
   }
+
+  // Smoke test: builds a PCG from a small computation graph, takes its
+  // balanced binary SP decomposition, and constructs the problem tree from
+  // it. The resulting tree is not inspected by any assertion.
+  TEST_CASE("from pcg") {
+    ComputationGraph cg = [&] {
+      ComputationGraphBuilder b;
+      TensorShape input_tensor_shape = TensorShape{
+          TensorDims{
+              FFOrdered<nonnegative_int>{nonnegative_int{32},
+                                         nonnegative_int{64}},
+          },
+          DataType::FLOAT,
+      };
+      tensor_guid_t t = b.create_input(input_tensor_shape, CreateGrad::YES);
+      t = b.dense(t,
+                  /*outDim=*/nonnegative_int{16},
+                  /*activation=*/std::nullopt);
+      t = b.gelu(t);
+      t = b.dense(t,
+                  /*outDim=*/nonnegative_int{12},
+                  /*activation=*/std::nullopt,
+                  /*use_bias=*/false,
+                  /*data_type=*/DataType::FLOAT,
+                  /*kernel_initializer=*/std::nullopt,
+                  /*bias_initializer=*/std::nullopt);
+      t = b.relu(t);
+      t = b.dense(t,
+                  /*outDim=*/nonnegative_int{8},
+                  /*activation=*/Activation::RELU);
+      return b.computation_graph;
+    }();
+
+    ParallelComputationGraph pcg = pcg_from_computation_graph(cg);
+
+    PCGBinarySPDecomposition sp_decomp =
+        expect(get_pcg_balanced_binary_sp_decomposition(pcg),
+               "Failed to get SP decomposition of PCG");
+
+    MachineMappingProblemTree problem_tree =
+        get_machine_mapping_problem_tree(pcg, sp_decomp);
+  }
 }
diff --git a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc
index 8ae1ebe753..f049f4b288 100644
--- a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc
+++ b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc
@@ -99,6 +99,7 @@ TEST_SUITE(FF_TEST_SUITE) {
       }
     };
 
+    OperatorTaskSpace fake_op_task_space = OperatorTaskSpace{{}};
     TensorShape tensor_shape = TensorShape{
         TensorDims{
             FFOrdered<nonnegative_int>{
@@ -116,6 +117,7 @@ TEST_SUITE(FF_TEST_SUITE) {
         /*input_shapes=*/{},
         /*weight_shapes=*/{},
         /*output_shapes=*/{},
+        /*op_task_space=*/fake_op_task_space,
     };
 
     UnmappedOpCostEstimateKey k2 = UnmappedOpCostEstimateKey{
@@ -128,6 +130,7 @@ TEST_SUITE(FF_TEST_SUITE) {
         /*input_shapes=*/{},
         /*weight_shapes=*/{},
         /*output_shapes=*/{},
+        /*op_task_space=*/fake_op_task_space,
     };
 
     AbstractedTensorSetMovement movement1 = AbstractedTensorSetMovement{{
diff --git a/lib/compiler/test/src/compiler/mcmc/generic_mcmc_algorithm.cc b/lib/compiler/test/src/compiler/mcmc/generic_mcmc_algorithm.cc
new file mode 100644
index 0000000000..ba6faa93c4
--- /dev/null
+++ b/lib/compiler/test/src/compiler/mcmc/generic_mcmc_algorithm.cc
@@ -0,0 +1,40 @@
+#include "compiler/mcmc/generic_mcmc_algorithm.h"
+#include "doctest/doctest.h"
+
+using namespace FlexFlow;
+
+TEST_SUITE(FF_TEST_SUITE) {
+  // Minimizes (x - 0.5)^2 over floats, starting from 0.1, and checks that the
+  // returned state lands within 0.01 of 0.5.
+  // NOTE(review): the proposals come from randf(), so the outcome depends on
+  // that generator's sequence -- confirm it is deterministic in the test run.
+  TEST_CASE("generic_mcmc_algorithm") {
+    float starting_state = 0.1;
+    // Proposes x plus a random offset divided by (i + 1); proposals that fall
+    // outside [0, 1] are rejected by returning std::nullopt.
+    auto generating_func = [](float x,
+                              nonnegative_int i) -> std::optional<float> {
+      float new_x = x + (randf() - 0.5) / (i.unwrap_nonnegative() + 1);
+      if (new_x < 0) {
+        return std::nullopt;
+      }
+      if (new_x > 1) {
+        return std::nullopt;
+      }
+      return new_x;
+    };
+    // Score is the squared distance from 0.5.
+    auto scoring_func = [](float x) { return (x - 0.5) * (x - 0.5); };
+    GenericMCMCConfig config = GenericMCMCConfig{/*temperature=*/1.0,
+                                                 /*num_iterations=*/10_n};
+    Generic_MCMC_state<float, float> result =
+        minimize_score(starting_state, generating_func, scoring_func, config);
+    float answer = result.get_state();
+    float error = result.get_score();
+    // The state must be within 0.01 of 0.5 and its score in [0, 0.01).
+    CHECK(answer > 0.49);
+    CHECK(answer < 0.51);
+    CHECK(error >= 0);
+    CHECK(error < 0.01);
+  }
+}
diff --git a/lib/compiler/test/src/compiler/mcmc/mcmc_over_mapped_pcg.cc b/lib/compiler/test/src/compiler/mcmc/mcmc_over_mapped_pcg.cc
new file mode 100644
index 0000000000..7d74d897e4
--- /dev/null
+++ b/lib/compiler/test/src/compiler/mcmc/mcmc_over_mapped_pcg.cc
@@ -0,0 +1,86 @@
+#include "compiler/mcmc/mcmc_over_mapped_pcg.h"
+#include "../cost_estimator_for_test.h"
+#include "compiler/task_graph_simulator/task_simulator.h"
+#include "doctest/doctest.h"
+#include "op-attrs/parallel_tensor_dims.h"
+#include "op-attrs/parallel_tensor_shape.dtg.h"
+#include "op-attrs/replica_type.dtg.h"
+#include "op-attrs/shard_parallel_dim.h"
+#include "pcg/computation_graph_builder.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h"
+#include "pcg/pcg_from_computation_graph.h"
+#include "utils/integer_conversions.h"
+
+using namespace FlexFlow;
+
+TEST_SUITE(FF_TEST_SUITE) {
+  // Runs the MCMC search on a small dense/gelu/dense/relu/dense network with
+  // a constant-cost fake estimator and checks the simulated forward pass time
+  // of the returned PCG and machine mapping.
+  TEST_CASE("mcmc_graph_optimize") {
+    ComputationGraph cg = [&] {
+      ComputationGraphBuilder b;
+      TensorShape input_tensor_shape = TensorShape{
+          TensorDims{
+              FFOrdered<nonnegative_int>{32_n, 64_n},
+          },
+          DataType::FLOAT,
+      };
+      tensor_guid_t t = b.create_input(input_tensor_shape, CreateGrad::YES);
+      t = b.dense(t,
+                  /*outDim=*/16_n,
+                  /*activation=*/std::nullopt);
+      t = b.gelu(t);
+      t = b.dense(t,
+                  /*outDim=*/12_n,
+                  /*activation=*/std::nullopt,
+                  /*use_bias=*/false,
+                  /*data_type=*/DataType::FLOAT,
+                  /*kernel_initializer=*/std::nullopt,
+                  /*bias_initializer=*/std::nullopt);
+      t = b.relu(t);
+      t = b.dense(t,
+                  /*outDim=*/8_n,
+                  /*activation=*/Activation::RELU);
+      return b.computation_graph;
+    }();
+
+    ParallelComputationGraph pcg = pcg_from_computation_graph(cg);
+
+    CostEstimator cost_estimator = make_fake_cost_estimator(
+        [](OpCostEstimateKey const &k) {
+          return OpCostMetrics{
+              /*forward_runtime=*/1.0,
+              /*backward_runtime=*/2.0,
+              /*memory=*/1_n,
+          };
+        },
+        [](TensorSetMovement const &) { return 1.0; });
+
+    MachineSpecification full_machine_spec = MachineSpecification{
+        /*num_nodes=*/2_n,
+        /*num_cpus_per_node=*/1_n,
+        /*num_gpus_per_node=*/1_n,
+        /*inter_node_bandwidth=*/1,
+        /*intra_node_bandwidth=*/1,
+    };
+
+    MCMCOverMappedPCGConfig search_config =
+        MCMCOverMappedPCGConfig{/*temperature=*/1.0,
+                                /*num_iterations=*/100_n,
+                                /*substitution_interval=*/5_n,
+                                /*device_type=*/DeviceType::GPU};
+
+    SearchResult result = mcmc_graph_optimize(
+        pcg, cost_estimator, full_machine_spec, search_config);
+    float runtime = task_simulator_estimate_forward_pass_time(
+        result.pcg, cost_estimator, result.machine_mapping, full_machine_spec);
+
+    // Attach the measured value to the failure report instead of printing it
+    // on every run (this file does not include <iostream> directly).
+    INFO("estimated forward pass time: " << runtime);
+    // NOTE(review): the bound 12 is kept from the original test; how it was
+    // derived is not visible here -- confirm.
+    CHECK(runtime < 12);
+  }
+}
diff --git a/lib/compiler/test/src/graph_optimize_state.cc b/lib/compiler/test/src/compiler/unity_algorithm/graph_optimize_state.cc
similarity index 68%
rename from lib/compiler/test/src/graph_optimize_state.cc
rename to lib/compiler/test/src/compiler/unity_algorithm/graph_optimize_state.cc
index 5c00ce1558..3b146be93f 100644
--- a/lib/compiler/test/src/graph_optimize_state.cc
+++ b/lib/compiler/test/src/compiler/unity_algorithm/graph_optimize_state.cc
@@ -1,4 +1,4 @@
-#include "compiler/graph_optimize_state.h"
+#include "compiler/unity_algorithm/graph_optimize_state.h"
 #include "doctest/doctest.h"
 #include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h"
 
@@ -15,24 +15,6 @@ TEST_SUITE(FF_TEST_SUITE) {
         },
         DataType::FLOAT,
     };
-    // ParallelTensorShape input_shape =
-    //     ParallelTensorShape{ParallelTensorDims{
-    //                             FFOrdered<ShardParallelDim>{
-    //                                 ShardParallelDim{32_n, 2_n},
-    //                                 ShardParallelDim{16_n, 1_n},
-    //                             },
-    //                             ReplicaParallelDimSet{
-    //                                 SumDegree{1_n},
-    //                                 DiscardCopyDegree{1_n},
-    //                             },
-    //                         },
-    //                         DataType::FLOAT};
-
-    // `machine_mapping` is determined by the PCG and the device mapping
-    // algorithm, and `runtime` is determined by the PCG and the device mapping,
-    // so their values here do not matter.
-    std::unordered_map<parallel_layer_guid_t, MachineView> empty_machine_views;
-    MachineMapping empty_machine_mapping(empty_machine_views);
 
     InitializerAttrs zero_init = InitializerAttrs{ZeroInitializerAttrs{}};
 
@@ -70,13 +52,12 @@ TEST_SUITE(FF_TEST_SUITE) {
       ParallelComputationGraph pcg2 = create_pcg();
 
       GraphOptimizeState state1 = GraphOptimizeState{
-          GraphOptimizeResult{pcg1, empty_machine_mapping},
-          0,
+          pcg1,
+          .0,
       };
-
       GraphOptimizeState state2 = GraphOptimizeState{
-          GraphOptimizeResult{pcg2, empty_machine_mapping},
-          0,
+          pcg2,
+          .0,
       };
 
       CHECK(state1 == state2);
@@ -100,16 +81,34 @@ TEST_SUITE(FF_TEST_SUITE) {
       ParallelComputationGraph pcg_ = builder_.pcg;
 
       GraphOptimizeState state1 = GraphOptimizeState{
-          GraphOptimizeResult{pcg1, empty_machine_mapping},
-          0,
+          pcg1,
+          .0,
       };
 
       GraphOptimizeState state_ = GraphOptimizeState{
-          GraphOptimizeResult{pcg_, empty_machine_mapping},
-          0,
+          pcg_,
+          .0,
       };
 
       CHECK_FALSE(state1 == state_);
     }
   }
+
+  // Compares two states built from empty PCGs that differ only in their
+  // second constructor argument (1.0 vs 2.0).
+  TEST_CASE("GraphOptimizeState::operator<") {
+    ParallelComputationGraph pcg1 = empty_parallel_computation_graph();
+    ParallelComputationGraph pcg2 = empty_parallel_computation_graph();
+    GraphOptimizeState state1 = GraphOptimizeState{
+        pcg1,
+        1.0,
+    };
+    GraphOptimizeState state2 = GraphOptimizeState{
+        pcg2,
+        2.0,
+    };
+    CHECK(state1 < state2);
+    // A strict ordering must also be asymmetric.
+    CHECK_FALSE(state2 < state1);
+  }
 }
diff --git a/lib/compiler/test/src/compiler/unity_algorithm/unity_algorithm.cc b/lib/compiler/test/src/compiler/unity_algorithm/unity_algorithm.cc
new file mode 100644
index 0000000000..4ca23710e2
--- /dev/null
+++ b/lib/compiler/test/src/compiler/unity_algorithm/unity_algorithm.cc
@@ -0,0 +1,80 @@
+#include "compiler/unity_algorithm/unity_algorithm.h"
+#include "../cost_estimator_for_test.h"
+#include "doctest/doctest.h"
+#include "op-attrs/parallel_tensor_dims.h"
+#include "op-attrs/parallel_tensor_shape.dtg.h"
+#include "op-attrs/replica_type.dtg.h"
+#include "op-attrs/shard_parallel_dim.h"
+#include "pcg/computation_graph_builder.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h"
+#include "pcg/pcg_from_computation_graph.h"
+#include "utils/integer_conversions.h"
+
+using namespace FlexFlow;
+
+TEST_SUITE(FF_TEST_SUITE) {
+  // Smoke test: runs graph_optimize on a small dense/gelu/dense/relu/dense
+  // network with a constant-cost fake estimator. The result is not inspected
+  // yet (see the TODO at the end).
+  TEST_CASE("graph_optimize") {
+    ComputationGraph cg = [&] {
+      ComputationGraphBuilder builder;
+      TensorShape input_shape = TensorShape{
+          TensorDims{
+              FFOrdered<nonnegative_int>{32_n, 64_n},
+          },
+          DataType::FLOAT,
+      };
+      tensor_guid_t x = builder.create_input(input_shape, CreateGrad::YES);
+      x = builder.dense(x,
+                        /*outDim=*/16_n,
+                        /*activation=*/std::nullopt);
+      x = builder.gelu(x);
+      x = builder.dense(x,
+                        /*outDim=*/12_n,
+                        /*activation=*/std::nullopt,
+                        /*use_bias=*/false,
+                        /*data_type=*/DataType::FLOAT,
+                        /*kernel_initializer=*/std::nullopt,
+                        /*bias_initializer=*/std::nullopt);
+      x = builder.relu(x);
+      x = builder.dense(x,
+                        /*outDim=*/8_n,
+                        /*activation=*/Activation::RELU);
+      return builder.computation_graph;
+    }();
+
+    ParallelComputationGraph pcg = pcg_from_computation_graph(cg);
+
+    // Every operator gets the same metrics; every movement costs 1.0.
+    CostEstimator cost_estimator = make_fake_cost_estimator(
+        [](OpCostEstimateKey const &k) {
+          return OpCostMetrics{
+              /*forward_runtime=*/1.0,
+              /*backward_runtime=*/2.0,
+              /*memory=*/1_n,
+          };
+        },
+        [](TensorSetMovement const &) { return 1.0; });
+
+    MachineSpecification full_machine_spec = MachineSpecification{
+        /*num_nodes=*/2_n,
+        /*num_cpus_per_node=*/1_n,
+        /*num_gpus_per_node=*/1_n,
+        /*inter_node_bandwidth=*/1,
+        /*intra_node_bandwidth=*/1,
+    };
+
+    UnitySearchConfig search_config = UnitySearchConfig{
+        /*alpha=*/1.0,
+        /*budget=*/0,
+        /*threshold=*/1000.0,
+        /*max_num_ops=*/100,
+    };
+
+    SearchResult result =
+        graph_optimize(pcg, cost_estimator, full_machine_spec, search_config);
+
+    // TODO: check the result
+  }
+}
diff --git a/lib/compiler/test/src/unity_algorithm.cc b/lib/compiler/test/src/unity_algorithm.cc
deleted file mode 100644
index 8ff0978ea5..0000000000
--- a/lib/compiler/test/src/unity_algorithm.cc
+++ /dev/null
@@ -1,26 +0,0 @@
-#include "compiler/unity_algorithm.h"
-#include "doctest/doctest.h"
-
-TEST_SUITE(FF_TEST_SUITE) {
-  // Rapidcheck does not work for now
-  // TEST_CASE("graph_optimize") {
-  //   RC_SUBCASE([](ComputationGraph const &g,
-  //                float alpha,
-  //                int budget,
-  //                float threshold,
-  //                int max_num_ops) {
-  //     Strategy s = graph_optimize(
-  //         g,
-  //         TestCostEstimator{},
-  //         MachineSpecification{1, 1, 4, 0.1, 0.2},
-  //         [](Operator const &, MachineSpecification const &) {
-  //           return std::unordered_set<MachineView>{make_1d_machine_view(0, 1,
-  //           1)};
-  //         },
-  //         OptimizerConfig{alpha, budget, threshold, max_num_ops});
-  //     RC_ASSERT(get_nodes(s.pcg).size() > 0);
-  //     RC_ASSERT(s.machine_mapping.runtime > 0);
-  //     RC_ASSERT(keys(s.machine_mapping.machine_views) == get_nodes(s.pcg));
-  //   });
-  // }
-}
diff --git a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h
index 3542e73dea..f820c56d61 100644
--- a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h
+++ b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h
@@ -1,6 +1,7 @@
 #ifndef _FLEXFLOW_PCG_INCLUDE_PCG_PARALLEL_COMPUTATION_GRAPH_H
 #define _FLEXFLOW_PCG_INCLUDE_PCG_PARALLEL_COMPUTATION_GRAPH_H
 
+#include "pcg/computation_graph.dtg.h"
 #include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h"
 #include "pcg/parallel_computation_graph/parallel_computation_graph_edge.dtg.h"
 #include "pcg/parallel_computation_graph/parallel_layer_added_result.dtg.h"
diff --git a/lib/substitutions/include/substitutions/pcg_pattern.h b/lib/substitutions/include/substitutions/pcg_pattern.h
index f0962b15c2..5005a0b51c 100644
--- a/lib/substitutions/include/substitutions/pcg_pattern.h
+++ b/lib/substitutions/include/substitutions/pcg_pattern.h
@@ -12,6 +12,15 @@ namespace FlexFlow {
 
 std::unordered_set<PatternNode> get_nodes(PCGPattern const &);
 
+/**
+ * @brief Pick one of the locations in \p pcg that match \p pattern at random
+ *
+ * @return std::nullopt if \p pattern has no match in \p pcg
+ */
+std::optional<PCGPatternMatch>
+    get_random_pattern_match(PCGPattern const &pattern,
+                             SubParallelComputationGraph const &pcg);
+
 /**
  * @brief Find all locations in \p pcg that match \p pattern
  */
diff --git a/lib/substitutions/include/substitutions/unity_substitution_set.h b/lib/substitutions/include/substitutions/unity_substitution_set.h
index 183f76ac8a..959ba3da2c 100644
--- a/lib/substitutions/include/substitutions/unity_substitution_set.h
+++ b/lib/substitutions/include/substitutions/unity_substitution_set.h
@@ -6,6 +6,13 @@
 #include "utils/fmt/vector.h"
 
 namespace FlexFlow {
+/**
+ * @brief Pick one element of get_substitution_set(\p resources) at random
+ *
+ * @return std::nullopt if the substitution set is empty
+ */
+std::optional<Substitution>
+    get_random_substitution(MachineSpecification const &resources);
 
 std::vector<Substitution>
     get_substitution_set(MachineSpecification const &resources);
diff --git a/lib/substitutions/src/substitutions/operator_pattern/satisfies_constraint.cc b/lib/substitutions/src/substitutions/operator_pattern/satisfies_constraint.cc
index 194ae49255..f39b771364 100644
--- a/lib/substitutions/src/substitutions/operator_pattern/satisfies_constraint.cc
+++ b/lib/substitutions/src/substitutions/operator_pattern/satisfies_constraint.cc
@@ -16,6 +16,22 @@ bool operator_satisfies_constraint(
   switch (constraint.constraint_type) {
     case ConstraintType::EQUAL:
       return expr_val.value() == constraint.attribute_value;
+    case ConstraintType::DIVISIBLE_BY: {
+      // Divisibility is only checked between two nonnegative_int values.
+      if (expr_val.value().has<nonnegative_int>() &&
+          constraint.attribute_value.has<nonnegative_int>()) {
+        nonnegative_int divisor =
+            constraint.attribute_value.get<nonnegative_int>();
+        // Reject a zero divisor up front instead of evaluating `x % 0`.
+        if (divisor.unwrap_nonnegative() == 0) {
+          throw mk_runtime_error(
+              "DIVISIBLE_BY constraint requires a nonzero divisor");
+        }
+        return expr_val.value().get<nonnegative_int>() % divisor == 0;
+      }
+      throw mk_runtime_error(
+          "DIVISIBLE_BY constraint requires nonnegative_int values");
+    }
     default:
       throw mk_runtime_error(
           fmt::format("Unknown constraint type {}",
diff --git a/lib/substitutions/src/substitutions/pcg_pattern.cc b/lib/substitutions/src/substitutions/pcg_pattern.cc
index a0af875848..fbc181a0f9 100644
--- a/lib/substitutions/src/substitutions/pcg_pattern.cc
+++ b/lib/substitutions/src/substitutions/pcg_pattern.cc
@@ -11,6 +11,7 @@
 #include "utils/graph/node/algorithms.h"
 #include "utils/graph/open_dataflow_graph/algorithms/get_inputs.h"
 #include "utils/graph/open_dataflow_graph/algorithms/get_open_dataflow_graph_inputs.h"
+#include "utils/random_utils.h"
 
 namespace FlexFlow {
 
@@ -37,6 +38,18 @@ static MatchAdditionalCriterion
       }};
 }
 
+// Picks one of the matches of `pattern` in `pcg` via select_random, or
+// returns std::nullopt when find_pattern_matches reports no match.
+std::optional<PCGPatternMatch>
+    get_random_pattern_match(PCGPattern const &pattern,
+                             SubParallelComputationGraph const &pcg) {
+  std::vector<PCGPatternMatch> candidates = find_pattern_matches(pattern, pcg);
+  if (!candidates.empty()) {
+    return select_random(candidates);
+  }
+  return std::nullopt;
+}
+
 std::vector<PCGPatternMatch>
     find_pattern_matches(PCGPattern const &pattern,
                          SubParallelComputationGraph const &pcg) {
diff --git a/lib/substitutions/src/substitutions/sub_parallel_computation_graph.cc b/lib/substitutions/src/substitutions/sub_parallel_computation_graph.cc
index 83df74f21b..0c673f0a8a 100644
--- a/lib/substitutions/src/substitutions/sub_parallel_computation_graph.cc
+++ b/lib/substitutions/src/substitutions/sub_parallel_computation_graph.cc
@@ -188,34 +188,36 @@ bool sub_pcgs_are_isomorphic(SubParallelComputationGraph const &lhs,
 }
 
+// Renders spcg.raw_graph through the generic as_dot. Each layer is labelled
+// with the record from as_dot(op_attrs), plus a "Name" row when the layer has
+// a name; get_input_label supplies the formatted tensor shape.
 std::string as_dot(SubParallelComputationGraph const &spcg) {
-  NOT_IMPLEMENTED();
-  // std::function<std::string(ParallelLayerAttrs const &)> get_node_label =
-  //     [](ParallelLayerAttrs const &a) -> std::string {
-  //   RecordFormatter r = as_dot(a.op_attrs);
-  //
-  //   if (a.name.has_value()) {
-  //     RecordFormatter rr;
-  //     rr << "Name" << a.name.value();
-  //     r << rr;
-  //   }
-  //
-  //   std::ostringstream oss;
-  //   oss << r;
-  //   return oss.str();
-  // };
-  //
-  // std::function<std::string(ParallelTensorAttrs const &)> get_input_label =
-  //     [](ParallelTensorAttrs const &a) -> std::string {
-  //   RecordFormatter r;
-  //
-  //   r << fmt::to_string(a.shape);
-  //
-  //   std::ostringstream oss;
-  //   oss << r;
-  //   return oss.str();
-  // };
-  //
-  // return as_dot(spcg.raw_graph, get_node_label, get_input_label);
+  std::function<std::string(ParallelLayerAttrs const &)> get_node_label =
+      [](ParallelLayerAttrs const &a) -> std::string {
+    RecordFormatter r = as_dot(a.op_attrs);
+
+    if (a.name.has_value()) {
+      RecordFormatter rr;
+      rr << "Name" << a.name.value();
+      r << rr;
+    }
+
+    std::ostringstream oss;
+    oss << r;
+    return oss.str();
+  };
+
+  std::function<std::string(ParallelTensorAttrs const &)> get_input_label =
+      [](ParallelTensorAttrs const &a) -> std::string {
+    RecordFormatter r;
+
+    r << fmt::to_string(a.shape);
+
+    std::ostringstream oss;
+    oss << r;
+    return oss.str();
+  };
+
+  return as_dot(spcg.raw_graph, get_node_label, get_input_label);
 }
 
 void debug_print_dot(SubParallelComputationGraph const &spcg) {
diff --git a/lib/substitutions/src/substitutions/tensor_pattern/satisfies_constraint.cc b/lib/substitutions/src/substitutions/tensor_pattern/satisfies_constraint.cc
index 974bfcabc0..cc0af12c91 100644
--- a/lib/substitutions/src/substitutions/tensor_pattern/satisfies_constraint.cc
+++ b/lib/substitutions/src/substitutions/tensor_pattern/satisfies_constraint.cc
@@ -12,6 +12,22 @@ bool parallel_tensor_satisfies_constraint(
   switch (constraint.constraint_type) {
     case ConstraintType::EQUAL:
       return expr_val == constraint.attribute_value;
+    case ConstraintType::DIVISIBLE_BY: {
+      // Divisibility is only checked between two nonnegative_int values.
+      if (expr_val.has<nonnegative_int>() &&
+          constraint.attribute_value.has<nonnegative_int>()) {
+        nonnegative_int divisor =
+            constraint.attribute_value.get<nonnegative_int>();
+        // Reject a zero divisor up front instead of evaluating `x % 0`.
+        if (divisor.unwrap_nonnegative() == 0) {
+          throw mk_runtime_error(
+              "DIVISIBLE_BY constraint requires a nonzero divisor");
+        }
+        return expr_val.get<nonnegative_int>() % divisor == 0;
+      }
+      throw mk_runtime_error(
+          "DIVISIBLE_BY constraint requires nonnegative_int values");
+    }
     default:
       throw mk_runtime_error(
           fmt::format("Unknown constraint type {}",
diff --git a/lib/substitutions/src/substitutions/unity_substitution_set.cc b/lib/substitutions/src/substitutions/unity_substitution_set.cc
index 4b00cdd95f..c8d9266978 100644
--- a/lib/substitutions/src/substitutions/unity_substitution_set.cc
+++ b/lib/substitutions/src/substitutions/unity_substitution_set.cc
@@ -7,9 +7,21 @@
 #include "utils/containers/get_only.h"
 #include "utils/nonnegative_int/nonnegative_int.h"
 #include "utils/nonnegative_int/nonnegative_range.h"
+#include "utils/random_utils.h"
 
 namespace FlexFlow {
 
+// Picks one element of get_substitution_set(resources) via select_random, or
+// returns std::nullopt when that set is empty.
+std::optional<Substitution>
+    get_random_substitution(MachineSpecification const &resources) {
+  std::vector<Substitution> candidates = get_substitution_set(resources);
+  if (!candidates.empty()) {
+    return select_random(candidates);
+  }
+  return std::nullopt;
+}
+
 std::vector<Substitution>
     get_substitution_set(MachineSpecification const &resources) {
   std::vector<Substitution> substitutions;
diff --git a/lib/substitutions/src/substitutions/unlabelled/find_pattern_matches.cc b/lib/substitutions/src/substitutions/unlabelled/find_pattern_matches.cc
index 9d8e4bc259..fa0ff7794a 100644
--- a/lib/substitutions/src/substitutions/unlabelled/find_pattern_matches.cc
+++ b/lib/substitutions/src/substitutions/unlabelled/find_pattern_matches.cc
@@ -140,7 +140,6 @@ std::vector<UnlabelledDataflowGraphPatternMatch>
       }
     }
   }
-
   return matches;
 }
 
diff --git a/lib/utils/include/utils/full_binary_tree/as_dot.h b/lib/utils/include/utils/full_binary_tree/as_dot.h
new file mode 100644
index 0000000000..e104d05e06
--- /dev/null
+++ b/lib/utils/include/utils/full_binary_tree/as_dot.h
@@ -0,0 +1,102 @@
+#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_FULL_BINARY_TREE_AS_DOT_H
+#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_FULL_BINARY_TREE_AS_DOT_H
+
+#include "utils/containers/get_only.h"
+#include "utils/dot_file.h"
+#include "utils/full_binary_tree/full_binary_tree_implementation.dtg.h"
+#include "utils/full_binary_tree/full_binary_tree_visitor.dtg.h"
+#include "utils/full_binary_tree/visit.h"
+#include "utils/graph/dataflow_graph/dataflow_graph.h"
+#include "utils/graph/dataflow_graph/dataflow_graph_view.h"
+#include "utils/graph/digraph/digraph_view.h"
+#include "utils/graph/instances/adjacency_digraph.h"
+#include "utils/graph/instances/unordered_set_dataflow_graph.h"
+#include "utils/graph/instances/unordered_set_labelled_open_dataflow_graph.h"
+#include "utils/graph/labelled_dataflow_graph/algorithms/view_as_labelled_open_dataflow_graph.h"
+#include "utils/graph/labelled_dataflow_graph/labelled_dataflow_graph.h"
+#include "utils/graph/labelled_open_dataflow_graph/algorithms/as_dot.h"
+#include <functional>
+#include <sstream>
+#include <string>
+#include <variant>
+
+namespace FlexFlow {
+
+/**
+ * @brief Build a labelled dataflow graph with one node per tree node
+ *
+ * Each leaf becomes a node with no inputs, and each parent becomes a node
+ * whose inputs are the outputs of its left and right children (in that
+ * order). Every node is added with a single std::monostate-labelled output.
+ *
+ * @param get_parent_label computes the node label of a parent
+ * @param get_leaf_label computes the node label of a leaf
+ */
+template <typename Tree, typename Parent, typename Leaf, typename NodeLabel>
+LabelledDataflowGraph<NodeLabel, std::monostate> as_labelled_dataflow_graph(
+    Tree const &tree,
+    FullBinaryTreeImplementation<Tree, Parent, Leaf> const &impl,
+    std::function<NodeLabel(Parent const &)> const &get_parent_label,
+    std::function<NodeLabel(Leaf const &)> const &get_leaf_label) {
+  auto g = LabelledDataflowGraph<NodeLabel, std::monostate>::template create<
+      UnorderedSetLabelledOpenDataflowGraph<NodeLabel, std::monostate>>();
+
+  // The parent handler refers to `visitor` itself (captured by reference) in
+  // order to recurse into both children; the handlers are first invoked by
+  // the visit() call below, after `visitor` has been initialized.
+  FullBinaryTreeVisitor<DataflowOutput, Tree, Parent, Leaf> visitor =
+      FullBinaryTreeVisitor<DataflowOutput, Tree, Parent, Leaf>{
+          [&](Parent const &parent) -> DataflowOutput {
+            DataflowOutput left_child_output =
+                visit(impl.get_left_child(parent), impl, visitor);
+            DataflowOutput right_child_output =
+                visit(impl.get_right_child(parent), impl, visitor);
+            NodeLabel parent_label = get_parent_label(parent);
+            NodeAddedResult parent_added =
+                g.add_node(parent_label,
+                           {left_child_output, right_child_output},
+                           {std::monostate{}});
+            return get_only(parent_added.outputs);
+          },
+          [&](Leaf const &leaf) -> DataflowOutput {
+            NodeLabel leaf_label = get_leaf_label(leaf);
+            NodeAddedResult leaf_added =
+                g.add_node(leaf_label, {}, {std::monostate{}});
+            return get_only(leaf_added.outputs);
+          },
+      };
+
+  visit(tree, impl, visitor);
+
+  return g;
+}
+
+/**
+ * @brief Render a full binary tree in DOT format
+ *
+ * Converts \p tree with as_labelled_dataflow_graph and formats the result
+ * with the labelled-open-dataflow-graph as_dot, using each node's string
+ * label as-is and an empty string from get_input_label.
+ */
+template <typename Tree, typename Parent, typename Leaf>
+std::string
+    as_dot(Tree const &tree,
+           FullBinaryTreeImplementation<Tree, Parent, Leaf> const &impl,
+           std::function<std::string(Parent const &)> const &get_parent_label,
+           std::function<std::string(Leaf const &)> const &get_leaf_label) {
+
+  LabelledDataflowGraphView<std::string, std::monostate> g =
+      as_labelled_dataflow_graph(tree, impl, get_parent_label, get_leaf_label);
+
+  std::function<std::string(std::string const &)> get_node_label =
+      [](std::string const &s) { return s; };
+  std::function<std::string(std::monostate const &)> get_input_label =
+      [](std::monostate const &) { return ""; };
+
+  return as_dot(
+      view_as_labelled_open_dataflow_graph(g), get_node_label, get_input_label);
+}
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/utils/include/utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.h b/lib/utils/include/utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.h
index de48cd17e9..9b4ea6cd20 100644
--- a/lib/utils/include/utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.h
+++ b/lib/utils/include/utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.h
@@ -1,11 +1,13 @@
 #ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_GRAPH_SERIES_PARALLEL_BINARY_SP_DECOMPOSITION_TREE_BINARY_SP_DECOMPOSITION_TREE_H
 #define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_GRAPH_SERIES_PARALLEL_BINARY_SP_DECOMPOSITION_TREE_BINARY_SP_DECOMPOSITION_TREE_H
 
+#include "utils/full_binary_tree/binary_tree_path.dtg.h"
 #include "utils/graph/series_parallel/binary_sp_decomposition_tree/binary_parallel_split.dtg.h"
 #include "utils/graph/series_parallel/binary_sp_decomposition_tree/binary_series_split.dtg.h"
 #include "utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.dtg.h"
 #include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree_implementation.dtg.h"
 #include "utils/graph/series_parallel/sp_decomposition_tree_node_type.dtg.h"
+#include <optional>
 #include <unordered_set>
 
 namespace FlexFlow {
@@ -23,6 +25,10 @@ std::unordered_multiset<Node> get_leaves(BinarySPDecompositionTree const &);
 
 SPDecompositionTreeNodeType get_node_type(BinarySPDecompositionTree const &);
 
+std::optional<BinarySPDecompositionTree> // subtree at the given path, if any
+    binary_sp_decomposition_tree_get_subtree_at_path(
+        BinarySPDecompositionTree const &, BinaryTreePath const &);
+
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/utils/include/utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/as_dot.h b/lib/utils/include/utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/as_dot.h
new file mode 100644
index 0000000000..9c999d8f6e
--- /dev/null
+++ b/lib/utils/include/utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/as_dot.h
@@ -0,0 +1,43 @@
+#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_GRAPH_SERIES_PARALLEL_BINARY_SP_DECOMPOSITION_TREE_GENERIC_BINARY_SP_DECOMPOSITION_TREE_AS_DOT_H
+#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_GRAPH_SERIES_PARALLEL_BINARY_SP_DECOMPOSITION_TREE_GENERIC_BINARY_SP_DECOMPOSITION_TREE_AS_DOT_H
+
+#include "utils/full_binary_tree/as_dot.h"
+#include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree_implementation.dtg.h"
+#include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree_implementation.h"
+#include "utils/overload.h"
+
+namespace FlexFlow {
+
+// Produces the DOT string for a binary SP decomposition tree by delegating
+// to the full-binary-tree as_dot: series/parallel splits are labelled by
+// their respective callbacks, leaves by get_leaf_label.
+template <typename Tree, typename Series, typename Parallel, typename Leaf>
+std::string as_dot(
+    Tree const &tree,
+    GenericBinarySPDecompositionTreeImplementation<Tree,
+                                                   Series,
+                                                   Parallel,
+                                                   Leaf> const &impl,
+    std::function<std::string(Series const &)> const &get_series_label,
+    std::function<std::string(Parallel const &)> const &get_parallel_label,
+    std::function<std::string(Leaf const &)> const &get_leaf_label) {
+  using Split = std::variant<Series, Parallel>;
+
+  // View the SP tree through the generic full-binary-tree interface.
+  FullBinaryTreeImplementation<Tree, Split, Leaf> binary_impl =
+      get_full_binary_impl_from_generic_sp_impl(impl);
+
+  // Route each parent node to the callback matching the split kind it holds.
+  auto label_split = overload{
+      [&](Series const &s) -> std::string { return get_series_label(s); },
+      [&](Parallel const &p) -> std::string { return get_parallel_label(p); },
+  };
+  std::function<std::string(Split const &)> label_parent =
+      [&](Split const &split) { return std::visit(label_split, split); };
+
+  return as_dot(tree, binary_impl, label_parent, get_leaf_label);
+}
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/utils/include/utils/optional.h b/lib/utils/include/utils/optional.h
index 377561d70c..8673264d36 100644
--- a/lib/utils/include/utils/optional.h
+++ b/lib/utils/include/utils/optional.h
@@ -32,6 +32,14 @@ T const &assert_unwrap(std::optional<T> const &o) {
   return o.value();
 }
 
+// Unwraps x via unwrap(); the failure callback handed to unwrap throws the
+// runtime error built from err.
+template <typename T>
+T expect(std::optional<T> const &x, std::string const &err) {
+  auto const on_failure = [&err]() { throw mk_runtime_error(err); };
+  return unwrap(x, on_failure);
+}
+
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/utils/include/utils/random_utils.h b/lib/utils/include/utils/random_utils.h
index 99da9646a1..014c38fc51 100644
--- a/lib/utils/include/utils/random_utils.h
+++ b/lib/utils/include/utils/random_utils.h
@@ -5,7 +5,7 @@
 #include <stdexcept>
 #include <vector>
 
-float randf() {
+inline float randf() { // header-defined, hence inline; value lies in [0, 1]
   return static_cast<float>(std::rand()) / static_cast<float>(RAND_MAX);
 }
 
diff --git a/lib/utils/src/utils/full_binary_tree/as_dot.cc b/lib/utils/src/utils/full_binary_tree/as_dot.cc
new file mode 100644
index 0000000000..12a1ab5533
--- /dev/null
+++ b/lib/utils/src/utils/full_binary_tree/as_dot.cc
@@ -0,0 +1,16 @@
+#include "utils/full_binary_tree/as_dot.h"
+#include "utils/archetypes/value_type.h"
+
+namespace FlexFlow {
+
+using Tree = value_type<0>;
+using Parent = value_type<1>;
+using Leaf = value_type<2>;
+// Explicit instantiation of as_dot over the value_type<N> archetypes.
+template std::string
+    as_dot(Tree const &,
+           FullBinaryTreeImplementation<Tree, Parent, Leaf> const &,
+           std::function<std::string(Parent const &)> const &,
+           std::function<std::string(Leaf const &)> const &);
+
+} // namespace FlexFlow
diff --git a/lib/utils/src/utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.cc b/lib/utils/src/utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.cc
index 62489ff75f..3e4bc13289 100644
--- a/lib/utils/src/utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.cc
+++ b/lib/utils/src/utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.cc
@@ -1,5 +1,6 @@
 #include "utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.h"
 #include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/get_leaves.h"
+#include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/get_subtree_at_path.h"
 #include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/is_binary_sp_tree_left_associative.h"
 #include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/is_binary_sp_tree_right_associative.h"
 
@@ -82,4 +83,13 @@ SPDecompositionTreeNodeType
   });
 }
 
+// Looks up the subtree of `tree` addressed by `path` by delegating to the
+// generic get_subtree_at_path with the binary-SP-tree implementation.
+std::optional<BinarySPDecompositionTree>
+    binary_sp_decomposition_tree_get_subtree_at_path(
+        BinarySPDecompositionTree const &tree, BinaryTreePath const &path) {
+  auto const sp_tree_impl = generic_impl_for_binary_sp_tree();
+  return get_subtree_at_path(tree, sp_tree_impl, path);
+}
+
 } // namespace FlexFlow
diff --git a/lib/utils/src/utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/as_dot.cc b/lib/utils/src/utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/as_dot.cc
new file mode 100644
index 0000000000..f557515c83
--- /dev/null
+++ b/lib/utils/src/utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/as_dot.cc
@@ -0,0 +1,21 @@
+#include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/as_dot.h"
+#include "utils/archetypes/value_type.h"
+
+namespace FlexFlow {
+
+using Tree = value_type<0>;
+using Series = value_type<1>;
+using Parallel = value_type<2>;
+using Leaf = value_type<3>;
+// Explicit instantiation of as_dot over the value_type<N> archetypes.
+template std::string
+    as_dot(Tree const &,
+           GenericBinarySPDecompositionTreeImplementation<Tree,
+                                                          Series,
+                                                          Parallel,
+                                                          Leaf> const &,
+           std::function<std::string(Series const &)> const &,
+           std::function<std::string(Parallel const &)> const &,
+           std::function<std::string(Leaf const &)> const &);
+
+} // namespace FlexFlow