guppy/graph/cycles.rs
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309
// Copyright (c) The cargo-guppy Contributors
// SPDX-License-Identifier: MIT OR Apache-2.0
//! Code for handling cycles in dependency graphs.
//!
//! See [`Cycles`][] for detailed docs.
use crate::{
graph::{PackageGraph, PackageIx},
petgraph_support::scc::Sccs,
Error, PackageId,
};
/// Contains information about dependency cycles.
///
/// More accurately, information about Strongly Connected Components with 2 or more elements.
/// Constructed through `PackageGraph::cycles`.
///
/// This page includes a bunch of detailed information on cycles, but here's the TLDR:
///
/// * Yes, cycles can happen
/// * Cycles only happen with dev-dependencies
/// * These cycles have properties that make them easy to handle
/// * We handle this in APIs like [`PackageSet::packages`][`crate::graph::PackageSet::packages`]
/// * As a result, you probably don't actually need this module
///
/// The slightly more detailed summary is that any graph of "packages" is conflating
/// the "real" package with its tests, which are actually separate binaries. These
/// tests *always* depend on the "real" package, and if we bothered to encode that
/// then any package with tests would have a cyclic dependency on itself -- so we
/// don't encode that. Unfortunately dev-dependencies allow tests to *indirectly*
/// depend on the "real" package, creating a cycle you *do* see.
///
/// If you only care about "real" builds, you can simply ignore the dev-dependency
/// edges and restore a nice and simple DAG that can be topologically sorted. This is what
/// we do for you in APIs like [`PackageSet::packages`][`crate::graph::PackageSet::packages`].
///
/// If you care about tests and dev-dependencies, we recommend treating those as
/// different from the "real" ones (essentially desugaring the package into two nodes).
/// Because all dev builds are roots of the package graph (nothing depends on a test/benchmark),
/// they can always go at the start/end (depending on direction) of the topological sort.
/// This means you can just add a second loop before/after the "real" one.
///
/// For instance, here's a simple program that recursively computes some property of packages
/// (here "whether serde is a transitive dependency"):
///
/// ```
/// use guppy::{CargoMetadata, graph::DependencyDirection};
/// use std::collections::HashMap;
///
/// let metadata = CargoMetadata::parse_json(include_str!("../../../fixtures/small/metadata1.json")).unwrap();
/// let package_graph = metadata.build_graph().unwrap();
/// let workspace_members = package_graph.resolve_workspace();
/// let dependency_graph = package_graph.query_workspace().resolve();
///
/// // Whether the "real" package uses serde
/// let mut package_uses_serde = HashMap::new();
/// // Whether the "dev" package uses serde
/// let mut dev_package_uses_serde = HashMap::new();
///
/// // Iterate over packages in reverse topo order (process dependencies first)
/// for package in dependency_graph.packages(DependencyDirection::Reverse) {
/// // A package uses serde if...
/// let uses_serde = if package.name() == "serde" {
/// // It is literally serde (base case)
/// true
/// } else {
/// // It has a non-dev-dependency on a package which uses serde
/// // (dev-dependencies handled in the second loop)
/// package.direct_links().any(|link| {
/// !link.dev_only() && package_uses_serde[link.to().id()]
/// })
/// };
/// // Record this package's result
/// package_uses_serde.insert(package.id(), uses_serde);
/// }
///
/// // Now iterate over the workspace members to handle their tests (if any)
/// // Note that DependencyDirection doesn't matter here, we're literally
/// // just looping over every workspace member in arbitrary order!
/// for package in workspace_members.packages(DependencyDirection::Reverse) {
/// // Check dev-packages using the "real" package results for all links!
/// let uses_serde = package.direct_links().any(|link| {
/// package_uses_serde[link.to().id()]
/// });
/// // Record this dev-package's result
/// dev_package_uses_serde.insert(package.id(), uses_serde);
/// }
///
/// // Now we have all the values computed!
/// for (id, &uses_serde) in &package_uses_serde {
/// if uses_serde {
/// let name = package_graph.metadata(id).unwrap().name();
/// println!("{name} uses serde!");
/// }
/// }
/// for (id, &uses_serde) in &dev_package_uses_serde {
/// if uses_serde {
/// let name = package_graph.metadata(id).unwrap().name();
/// println!("{name}'s tests use serde!");
/// }
/// }
/// ```
///
///
///
///
///
/// # Why Cargo Dependency Graphs Have Cycles
///
/// Dependency graphs are generally Directed Acyclic Graphs (DAGs), where each package
/// is a node and each dependency is an edge. These graphs are acyclic (contain no cycles)
/// because anything else would be a paradox -- how do you build X if it depends on itself?
/// You don't!
///
/// So why does this API exist? It wouldn't make sense for Cargo to have cycles!
///
/// The problem is that "the Cargo dependency graph" is actually two different graphs
/// at different levels of abstraction: The Package Graph (Guppy, cargo-metadata), and
/// The Build Graph (Cargo's internals). These two graphs are different because each
/// package is actually a bunch of different
/// [build targets in a trenchcoat][`crate::graph::PackageMetadata::build_targets`] -- libs,
/// bins, tests, benches, and so on. In The Build Graph these different build targets get
/// their own nodes. In The Package Graph all those targets get merged together into one
/// big node. The Build Graph is always a proper DAG, but The Package Graph can have cycles.
///
/// Thankfully these cycles can only be created by one specific (and rare) situation:
/// dev-dependencies. **A test/bench target for a package is allowed to indirectly
/// depend on the same package's lib/bin target, and this creates apparent cycles
/// in the package graph!** That's it!
///
/// As we'll see, **simply ignoring all dev-dependency edges eliminates all cycles
/// *and* preserves the ordering constraints of the dependency graph.**
///
///
///
/// # An Example Cyclic Workspace
///
/// As a concrete example, consider [the serde workspace][serde_github], which
/// actually has this "problem": there's a "cycle" between serde and serde_derive.
/// In normal builds this cycle doesn't exist: serde_derive is actually a standalone
/// crate, while [serde (optionally) pulls in serde_derive as a dependency][serde_toml].
/// The "cycle" only appears when testing serde_derive: [serde_derive's tests quite
/// reasonably depend on serde][serde_derive_toml] to test the proc-macro's output,
/// creating a cycle!
///
/// The way to resolve this monstrosity is to realize that the tests for serde_derive
/// are actually a completely different binary from the serde_derive *library*. Let's
/// call those tests serde_derive_dev. So although the (Package) graph reported by Guppy
/// (and cargo-metadata) looks like a cycle:
///
/// ```text
/// serde <-----+
///   |         |
///   |         |
///   +--> serde_derive
/// ```
///
/// In actuality, serde_derive_dev breaks the cycle and creates a nice clean DAG
/// (in The Build Graph):
///
/// ```text
///   +--  serde_derive_dev
///   |          |
///   v          |
/// serde        |
///   |          |
///   |          v
///   +---> serde_derive
/// ```
///
/// Here's the really important thing to notice: serde_derive_dev is actually a *root*
/// in The Build Graph, and this is always true! Nothing should ever depend on the *tests*
/// or *benchmarks* for another library.
///
/// This is the key insight to ignoring dev-dependency edges. As we'll see, the roots
/// (and leaves) of a DAG are in some sense "ignorable" by the rest of the graph,
/// because they can't change the ordering constraints between other packages.
///
///
///
/// # Topological Sort Is Great (And Composable)
///
/// Now that we understand *why* cycles can happen in the package graph, let's look at
/// what those cycles mess up, and how to deal with them.
///
/// One of the big reasons everyone loves DAGs is because you can get a Topological
/// Sort of them. Topological Sort
/// (with [`DependencyDirection::Forward`][`crate::graph::DependencyDirection::Forward`])
/// is just a fancy way of saying "a list where packages always appear before their dependencies"
/// (vice-versa for [`DependencyDirection::Reverse`][`crate::graph::DependencyDirection::Reverse`]).
///
/// This is really convenient! If you need to do things in "dependency order" you can just
/// topologically sort the packages and then boring old for-loops will magically get
/// everything done before it's needed.
///
/// Unfortunately, you can't get the Topological Sort of a graph with cycles because that
/// doesn't make sense. And yet, Guppy has
/// [several APIs which do exactly that][`crate::graph::PackageSet::packages`].
/// What gives? The docs say:
///
/// > The packages within a dependency cycle will be returned in non-dev order. When the
/// > direction is forward, if package Foo has a dependency on Bar, and Bar has a cyclic
/// > dev-dependency on Foo, then Foo is returned before Bar.
///
/// We just ignore the dev-dependency edges! Problem Solved.
///
/// But isn't this throwing out important information that could change the result? Nope!
///
/// As we saw in the previous section, all dev-builds are roots in The Build Graph.
/// Ignoring all dev-dependency edges is equivalent to deleting all of those roots.
/// This may "orphan" dependencies that are only used for dev-builds, but we still
/// keep them in the graph and properly include them in the sort.
///
/// As it turns out, you can recursively compute the topological sort of a graph as follows:
///
/// 1. delete a root (or leaf)
/// 2. compute the topological sort of the new graph
/// 3. add the root (or leaf) to the start (or end) of the list
///
/// **Even though we delete all the dev-nodes from the graph when doing our sort,
/// if you want to "add them back" the only thing you need to do is handle them before
/// (or after) everything else!** Even better: all the dev-builds are roots at the same
/// time, so you can process them in any order!
///
/// Just remember that every node with dev-dependencies is really two nodes: the "normal"
/// version without dev-dependencies, and the version with them. Exactly how you want
/// to express that notion in your code is up to you. (Two different loops is the simplest.)
///
///
///
///
/// # Reasoning About Cycles: Strongly Connected Components
///
/// Ok but wait, none of that involved Strongly Connected Components! Yeah, isn't that great? 😄
///
/// Oh you still want to "know" about the cycles? Then we've gotta bust out the heavy
/// general-purpose machinery. Thankfully the problem of cycles in directed graphs is
/// an old and well-studied problem with a conceptually simple solution: hide the cycle
/// in a box and pretend that it's just one Really Big Node in the DAG.
///
/// Yes, really, that's all that Strongly Connected Components are. More precisely, SCCs
/// are defined to be maximal sets of nodes such that "every node in an SCC can reach
/// every other node in that SCC" (a property which definitely holds for cycles).
/// The reason for this more complicated definition is that you can have a bunch of
/// cycles all knotted together in a nasty ball, and trying to tease out individual
/// cycles isn't really helpful. So we just wrap the whole ball of nodes up into one
/// big "I give up" box and forget about it!
///
/// Now, what does this get us?
///
/// The graph *between* Strongly Connected Components is *always* a DAG, so you can
/// always topologically sort *that*. In really nasty cases this is just vacuously
/// true (all the nodes end up in one SCC, and so the "Graph of SCCs" is just one big
/// unsorted node). On the other hand, if the graph already *is* a DAG then each node
/// is its own SCC, and so we lose nothing. In this way SCCs give us a way to preserve
/// all the *nice* parts of our graph while also isolating the problematic parts
/// (SCCs with more than 1 node) to something self-contained that we can handle specially.
///
/// In the general case, nothing more can be done to order an SCC. By definition every
/// node depends on every other node! But as we've seen in the previous section, there
/// actually *is* a good way to order packages even with cycles, and so we maintain
/// that ordering for our SCCs: it's just the topological sort with all the
/// dev-dependencies ignored.
///
///
///
///
/// [serde_github]: https://github.com/serde-rs/serde
/// [serde_toml]: https://github.com/serde-rs/serde/blob/072145e0e913df7686f001dbf29e43a0ff7afac4/serde/Cargo.toml#L17-L18
/// [serde_derive_toml]: https://github.com/serde-rs/serde/blob/072145e0e913df7686f001dbf29e43a0ff7afac4/serde_derive/Cargo.toml#L29-L30
pub struct Cycles<'g> {
// The package graph this view was built from. Used to map `PackageId`s to graph
// indexes (`package_ix`) and graph indexes back to `PackageId`s (`dep_graph`).
package_graph: &'g PackageGraph,
// Strongly Connected Component data obtained from `package_graph.sccs()`; it is
// borrowed for the same lifetime `'g` as the graph itself.
sccs: &'g Sccs<PackageIx>,
}
impl<'g> Cycles<'g> {
    /// Creates a `Cycles` view over `package_graph`, borrowing the graph's SCC data.
    pub(super) fn new(package_graph: &'g PackageGraph) -> Self {
        let sccs = package_graph.sccs();
        Self {
            package_graph,
            sccs,
        }
    }

    /// Returns true if the packages identified by `a` and `b` belong to the same cycle.
    ///
    /// "Same cycle" here means "same Strongly Connected Component": the answer is whatever
    /// the SCC data reports for the two packages' graph indexes.
    ///
    /// # Errors
    ///
    /// Propagates the error from the `package_ix` lookup of either ID. `a` is looked up
    /// before `b`, so if both lookups would fail, the error for `a` is the one returned.
    pub fn is_cyclic(&self, a: &PackageId, b: &PackageId) -> Result<bool, Error> {
        let graph = self.package_graph;
        // Resolve both IDs to graph indexes first; SCC membership is keyed on indexes.
        let first = graph.package_ix(a)?;
        let second = graph.package_ix(b)?;
        let same_component = self.sccs.is_same_scc(first, second);
        Ok(same_component)
    }

    /// Returns every Strongly Connected Component (SCC) with 2 or more elements in this
    /// graph, each as a list of package IDs.
    ///
    /// SCCs come back in topological order: if packages in SCC B depend on packages in
    /// SCC A, then A is yielded before B.
    ///
    /// Inside a single SCC, packages are listed in non-dev order: if package Foo has a
    /// dependency on Bar, and Bar has a cyclic dev-dependency on Foo, then Foo comes
    /// before Bar.
    ///
    /// See the type-level docs for details.
    pub fn all_cycles(&self) -> impl DoubleEndedIterator<Item = Vec<&'g PackageId>> + 'g {
        // Copy the `'g` references out of `self` so the returned iterator borrows only
        // from the graph and not from this `Cycles` value.
        let graph = self.package_graph;
        let components = self.sccs;
        components.multi_sccs().map(move |members| {
            // Translate each graph index in the component back into its `PackageId`.
            members
                .iter()
                .map(move |ix| &graph.dep_graph[*ix])
                .collect::<Vec<_>>()
        })
    }
}