guppy/graph/cycles.rs
1// Copyright (c) The cargo-guppy Contributors
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
4//! Code for handling cycles in dependency graphs.
5//!
6//! See [`Cycles`][] for detailed docs.
7
8use crate::{
9 Error, PackageId,
10 graph::{PackageGraph, PackageIx},
11 petgraph_support::scc::Sccs,
12};
13
14/// Contains information about dependency cycles.
15///
16/// "Cycle" here means a non-trivial Strongly Connected Component: either a
17/// multi-node SCC (where every member is mutually reachable from every
18/// other), or a single-node SCC whose node has a self-loop edge (commonly
19/// produced by a path dev-dependency on the package's own crate).
20/// Constructed through `PackageGraph::cycles`.
21///
22/// This page includes a bunch of detailed information on cycles, but here's the TLDR:
23///
24/// * Yes, cycles can happen
25/// * Cycles only happen with dev-dependencies
26/// * These cycles have properties that make them easy to handle
27/// * We handle this in APIs like [`PackageSet::packages`][`crate::graph::PackageSet::packages`]
28/// * As a result, you probably don't actually need this module
29///
30/// The slighly more detailed summary is that any graph of "packages" is conflating
31/// the "real" package with its tests, which are actually separate binaries. These
32/// tests *always* depend on the "real" package, and if we bothered to encode that
33/// then any package with tests would have a cyclic dependency on itself -- so we
34/// don't encode that. Unfortunately dev-dependencies allow tests to *indirectly*
35/// depend on the "real" package, creating a cycle you *do* see.
36///
37/// If you only care about "real" builds, you can simply ignore the dev-dependency
38/// edges and restore a nice and simple DAG that can be topologically sorted. This is what
39/// we do for you in APIs like [`PackageSet::packages`][`crate::graph::PackageSet::packages`].
40///
41/// If you care about tests and dev-dependencies, we recommend treating those as
42/// different from the "real" ones (essentially desugarring the package into two nodes).
43/// Because all dev builds are roots of the package graph (nothing depends on a test/benchmark),
44/// they can always go at the start/end (depending on direction) of the topological sort.
45/// This means you can just do add a second loop before/after the "real" one.
46///
47/// For instance, here's a simple program that recursively computes some property of packages
48/// (here "whether serde is a transitive dependency"):
49///
50/// ```
51/// use guppy::{CargoMetadata, graph::DependencyDirection};
52/// use std::collections::HashMap;
53///
54/// let metadata = CargoMetadata::parse_json(include_str!("../../../fixtures/small/metadata1.json")).unwrap();
55/// let package_graph = metadata.build_graph().unwrap();
56/// let workspace_members = package_graph.resolve_workspace();
57/// let dependency_graph = package_graph.query_workspace().resolve();
58///
59/// // Whether the "real" package uses serde
60/// let mut package_uses_serde = HashMap::new();
61/// // Whether the "dev" package uses serde
62/// let mut dev_package_uses_serde = HashMap::new();
63///
64/// // Iterate over packages in reverse topo order (process dependencies first)
65/// for package in dependency_graph.packages(DependencyDirection::Reverse) {
66/// // A package uses serde if...
67/// let uses_serde = if package.name() == "serde" {
68/// // It is literally serde (base case)
69/// true
70/// } else {
71/// // It has a non-dev-dependency on a package which uses serde
72/// // (dev-dependencies handled in the second loop)
73/// package.direct_links().any(|link| {
74/// !link.dev_only() && package_uses_serde[link.to().id()]
75/// })
76/// };
77/// // Record this package's result
78/// package_uses_serde.insert(package.id(), uses_serde);
79/// }
80///
81/// // Now iterate over the workspace members to handle their tests (if any)
82/// // Note that DependencyDirection doesn't matter here, we're literally
83/// // just looping over every workspace member in arbitrary order!
84/// for package in workspace_members.packages(DependencyDirection::Reverse) {
85/// // Check dev-packages using the "real" package results for all links!
86/// let uses_serde = package.direct_links().any(|link| {
87/// package_uses_serde[link.to().id()]
88/// });
89/// // Record this dev-package's result
90/// dev_package_uses_serde.insert(package.id(), uses_serde);
91/// }
92///
93/// // Now we have all the values computed!
94/// for (id, &uses_serde) in &package_uses_serde {
95/// if uses_serde {
96/// let name = package_graph.metadata(id).unwrap().name();
97/// println!("{name} uses serde!");
98/// }
99/// }
100/// for (id, &uses_serde) in &dev_package_uses_serde {
101/// if uses_serde {
102/// let name = package_graph.metadata(id).unwrap().name();
103/// println!("{name}'s tests use serde!");
104/// }
105/// }
106/// ```
107///
108///
109///
110///
111///
112/// # Why Cargo Dependency Graphs Have Cycles
113///
114/// Dependency graphs are generally Directed Acyclic Graphs (DAGs), where each package
115/// is a node and each dependency is an edge. These graphs are acyclic (contain no cycles)
116/// because anything else would be a paradox -- how do you build X if it depends on itself?
117/// You don't!
118///
119/// So why does this API exist? It wouldn't make sense for Cargo to have cycles!
120///
121/// The problem is that "the Cargo dependency graph" is actually two different graphs
122/// at different levels of abstraction: The Package Graph (Guppy, cargo-metadata), and
123/// The Build Graph (Cargo's internals). These two graphs are different because each
124/// package is actually a bunch of different
125/// [build targets in a trenchcoat][`crate::graph::PackageMetadata::build_targets`] -- libs,
126/// bins, tests, benches, and so on. In The Build Graph these different build targets get
127/// their own nodes. In The Package Graph all those targets gets merged together into one
128/// big node. The Build Graph is always a proper DAG, but The Package Graph can have cycles.
129///
130/// Thankfully these cycles can only be created by one specific (and rare) situation:
131/// dev-dependencies. **A test/bench target for a package is allowed to indirectly
132/// depend on the same package's lib/bin target, and this creates apparent cycles
133/// in the package graph!** That's it!
134///
135/// Concretely, this shows up in two shapes:
136///
137/// 1. **Single-node cycles (self-loops).** A package declares a path
138/// dev-dependency on its own crate -- usually to enable specific
139/// features under `cfg(test)` -- which becomes a self-edge in the
140/// package graph. The package's lone SCC is cyclic, with itself as
141/// the only member.
142/// 2. **Multi-node cycles.** Two or more packages depend on each other,
143/// with at least one dev-dependency edge somewhere around the cycle.
144/// The serde / serde_derive example below is the canonical case.
145///
146/// As we'll see, **simply ignoring all dev-dependency edges eliminates all cycles
147/// *and* preserves the ordering constraints of the dependency graph.** Both
148/// shapes of cycle vanish under that filter, since the dev-dependency edge
149/// is always involved.
150///
151///
152///
153/// # An Example Cyclic Workspace
154///
155/// As a concrete example, consider [the serde workspace][serde_github], which
156/// actually has this "problem": there's a "cycle" between serde and serde_derive.
157/// In normal builds this cycle doesn't exist: serde_derive is actually a standalone
158/// crate, while [serde (optionally) pulls in serde_derive as a dependency][serde_toml].
159/// The "cycle" only appears when testing serde_derive: [serde_derive's tests quite
160/// reasonably depend on serde][serde_derive_toml] to test the proc-macro's output,
161/// creating a cycle!
162///
163/// The way to resolve this monstrosity is to realize that the tests for serde_derive
164/// are actually a completely different binary from the serde_derive *library*. Let's
165/// call those tests serde_derive_dev. So although the (Package) graph reported by Guppy
166/// (and cargo-metadata) looks like a cycle:
167///
168/// ```text
169/// serde <-----+
170/// | |
171/// | |
172/// +--> serde_derive
173/// ```
174///
175/// In actuality, serde_derive_dev breaks the cycle and creates a nice clean DAG
176/// (in The Build Graph):
177///
178/// ```text
179/// +-- serde_derive_dev
180/// | |
181/// v |
182/// serde |
183/// | |
184/// | v
185/// +---> serde_derive
186/// ```
187///
188/// Here's the really important thing to notice: serde_derive_dev is actually a *root*
189/// in The Build Graph, and this is always true! Nothing should ever depend on the *tests*
190/// or *benchmarks* for another library.
191///
192/// This is the key insight to ignoring dev-dependency edges. As we'll see, the roots
193/// (and leaves) of a DAG are in some sense "ignorable" by the rest of the graph,
194/// because they can't change the ordering constraints between other packages.
195///
196///
197///
198/// # Topological Sort Is Great (And Composable)
199///
200/// Now that we understand *why* cycles can happen in the package graph, let's look at
201/// what those cycles mess up, and how to deal with them.
202///
203/// One of the big reasons everyone loves DAGs is because you can get a Topological
204/// Sort of them. Topological Sort
205/// (with [`DependencyDirection::Forward`][`crate::graph::DependencyDirection::Forward`])
206/// is just a fancy way of saying "a list where packages always appear before their dependencies"
207/// (vice-versa for [`DependencyDirection::Reverse`][`crate::graph::DependencyDirection::Reverse`]).
208///
209/// This is really convenient! If you need to do things in "dependency order" you can just
210/// topologically sort the packages and then boring old for-loops will magically get
211/// everything done before it's needed.
212///
213/// Unfortunately, you can't get the Topological Sort of a graph with cycles because that
214/// doesn't make sense. And yet, Guppy has
215/// [several APIs which do exactly that][`crate::graph::PackageSet::packages`].
216/// What gives? The docs say:
217///
218/// > The packages within a dependency cycle will be returned in non-dev order. When the
219/// > direction is forward, if package Foo has a dependency on Bar, and Bar has a cyclic
220/// > dev-dependency on Foo, then Foo is returned before Bar.
221///
222/// We just ignore the dev-dependency edges! Problem Solved.
223///
224/// But isn't this throwing out important information that could change the result? Nope!
225///
226/// As we saw in the previous section, all dev-builds are roots in The Build Graph.
227/// Ignoring all dev-dependency edges is equivalent to deleting all of those roots.
228/// This may "orphan" dependencies that are only used for dev-builds, but we still
229/// keep them in the graph and properly include them in the sort.
230///
231/// As it turns out, you can recursively compute the topological sort of a graph as follows:
232///
233/// 1. delete a root (or leaf)
234/// 2. compute the topological sort of the new graph
235/// 3. append the root (or leaf) to the start (or end) of the list
236///
237/// **Even although we delete all the dev-nodes from the graph when doing our sort,
238/// if you want to "add them back" the only thing you need to do is handle them before
239/// (or after) everything else!** Even better: all the dev-builds are roots at the same
240/// time, so you can process them in any order!
241///
242/// Just remember that every node with dev-dependencies is really two nodes: the "normal"
243/// version without dev-dependencies, and the version with them. Exactly how you want
244/// to express that notion in your code is up to you. (Two different loops is the simplest.)
245///
246///
247///
248///
249/// # Reasoning About Cycles: Strongly Connected Components
250///
251/// Ok but wait, none of that involved Strongly Connected Components! Yeah, isn't that great? 😄
252///
253/// Oh you still want to "know" about the cycles? Then we've gotta bust out the heavy
254/// general-purpose machinery. Thankfully the problem of cycles in directed graphs is
255/// an old and well-studied problem with a conceptually simple solution: hide the cycle
256/// in a box and pretend that it's just one Really Big Node in the DAG.
257///
258/// Yes, really, that's all that Strongly Connected Components are. More precisely, SCCs
259/// are defined to be maximal sets of nodes such that "every node in an SCC can reach
260/// every other node in that SCC" (a property which definitely holds for cycles).
261/// The reason for this more complicated definition is that you can have a bunch of
262/// cycles all knotted together in a nasty ball, and trying to tease out individual
263/// cycles isn't really helpful. So we just wrap the whole ball of nodes up into one
264/// big "I give up" box and forget about it!
265///
266/// Now, what does this get us?
267///
268/// The graph *between* Strongly Connected Components is *always* a DAG, so you can
269/// always topologically sort *that*. In really nasty cases this is just vacuously
270/// true (all the nodes end up in one SCC, and so the "Graph of SCCs" is just one big
271/// unsorted node). On the other hand, if the graph already *is* a DAG then each node
272/// is its own SCC, and so we lose nothing. In this way SCCs give us a way to preserve
273/// all the *nice* parts of our graph while also isolating the problematic parts
274/// (the cyclic SCCs) to something self-contained that we can handle specially.
275///
276/// > [!NOTE]
277/// > A node's single-element SCC is "cyclic" iff that node has a self-loop
278/// > edge in the graph. The SCC machinery itself doesn't make this
279/// > distinction -- every node is its own SCC -- but [`Cycles::is_cyclic`]
280/// > and [`Cycles::all_cycles`] do, so callers consistently see only SCCs
281/// > that actually lie on a directed cycle.
282///
283/// In the general case, nothing more can be done to order an SCC. By definition every
284/// node depends on every other node! But as we've seen in the previous section, there
285/// actually *is* a good way to order packages even with cycles, and so we maintain
286/// that ordering for our SCCs: it's just the topological sort with all the
287/// dev-dependencies ignored.
288///
289///
290///
291///
292/// [serde_github]: https://github.com/serde-rs/serde
293/// [serde_toml]: https://github.com/serde-rs/serde/blob/072145e0e913df7686f001dbf29e43a0ff7afac4/serde/Cargo.toml#L17-L18
294/// [serde_derive_toml]: https://github.com/serde-rs/serde/blob/072145e0e913df7686f001dbf29e43a0ff7afac4/serde_derive/Cargo.toml#L29-L30
295pub struct Cycles<'g> {
296 package_graph: &'g PackageGraph,
297 sccs: &'g Sccs<PackageIx>,
298}
299
300impl<'g> Cycles<'g> {
301 pub(super) fn new(package_graph: &'g PackageGraph) -> Self {
302 Self {
303 package_graph,
304 sccs: package_graph.sccs(),
305 }
306 }
307
308 /// Returns true if `a` and `b` lie on a common directed cycle in the
309 /// package graph.
310 ///
311 /// "Lie on a common cycle" means: in the same Strongly Connected
312 /// Component *and* that SCC is non-trivial.
313 ///
314 /// * For distinct package IDs, that's the same as being in a multi-node SCC.
315 /// * For the same package ID, the node must either be in a multi-node SCC,
316 /// or have a self-loop edge in the dependency graph (e.g., from a `path`
317 /// dev-dependency on the package's own crate).
318 ///
319 /// In particular, `is_cyclic(a, a)` is *not* reflexively true: it
320 /// returns `false` for nodes that aren't on any cycle.
321 pub fn is_cyclic(&self, a: &PackageId, b: &PackageId) -> Result<bool, Error> {
322 let a_ix = self.package_graph.package_ix(a)?;
323 let b_ix = self.package_graph.package_ix(b)?;
324
325 if a_ix != b_ix {
326 // Different nodes lie on a common cycle iff they're in the
327 // same SCC -- which, for distinct nodes, can only be a multi-
328 // node SCC.
329 return Ok(self.sccs.is_same_scc(a_ix, b_ix));
330 }
331
332 // Same node: on a cycle iff its SCC is non-trivial.
333 Ok(self.sccs.in_multi_scc(a_ix) || self.package_graph.dep_graph.contains_edge(a_ix, a_ix))
334 }
335
336 /// Returns all the cyclic Strongly Connected Components (SCCs) of this
337 /// graph: every multi-node SCC, plus every single-node SCC whose node
338 /// has a self-loop edge.
339 ///
340 /// SCCs are returned in topological order: if packages in SCC B depend
341 /// on packages in SCC A, A is returned before B.
342 ///
343 /// Within an SCC, nodes are returned in non-dev order: if package Foo
344 /// has a dependency on Bar, and Bar has a cyclic dev-dependency on
345 /// Foo, then Foo is returned before Bar.
346 ///
347 /// See the type-level docs for details.
348 pub fn all_cycles(&self) -> impl DoubleEndedIterator<Item = Vec<&'g PackageId>> + 'g + use<'g> {
349 let dep_graph = &self.package_graph.dep_graph;
350 self.sccs.all_sccs().filter_map(move |scc| {
351 let is_cyclic = match scc {
352 // Multi-node SCCs are non-trivial by definition.
353 [_, _, ..] => true,
354 // Single-node SCC is cyclic iff there's a self-loop edge.
355 &[ix] => dep_graph.contains_edge(ix, ix),
356 // SCCs always have at least one element; the empty case is
357 // unreachable from `kosaraju_scc`.
358 [] => false,
359 };
360 is_cyclic.then(|| scc.iter().map(move |ix| &dep_graph[*ix]).collect())
361 })
362 }
363}