guppy/graph/
cycles.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
// Copyright (c) The cargo-guppy Contributors
// SPDX-License-Identifier: MIT OR Apache-2.0

//! Code for handling cycles in dependency graphs.
//!
//! See [`Cycles`][] for detailed docs.

use crate::{
    graph::{PackageGraph, PackageIx},
    petgraph_support::scc::Sccs,
    Error, PackageId,
};

/// Contains information about dependency cycles.
///
/// More accurately, information about Strongly Connected Components with 2 or more elements.
/// Constructed through `PackageGraph::cycles`.
///
/// This page includes a bunch of detailed information on cycles, but here's the TLDR:
///
/// * Yes, cycles can happen
/// * Cycles only happen with dev-dependencies
/// * These cycles have properties that make them easy to handle
/// * We handle this in APIs like [`PackageSet::packages`][`crate::graph::PackageSet::packages`]
/// * As a result, you probably don't actually need this module
///
/// The slightly more detailed summary is that any graph of "packages" is conflating
/// the "real" package with its tests, which are actually separate binaries. These
/// tests *always* depend on the "real" package, and if we bothered to encode that
/// then any package with tests would have a cyclic dependency on itself -- so we
/// don't encode that. Unfortunately dev-dependencies allow tests to *indirectly*
/// depend on the "real" package, creating a cycle you *do* see.
///
/// If you only care about "real" builds, you can simply ignore the dev-dependency
/// edges and restore a nice and simple DAG that can be topologically sorted. This is what
/// we do for you in APIs like [`PackageSet::packages`][`crate::graph::PackageSet::packages`].
///
/// If you care about tests and dev-dependencies, we recommend treating those as
/// different from the "real" ones (essentially desugaring the package into two nodes).
/// Because all dev builds are roots of the package graph (nothing depends on a test/benchmark),
/// they can always go at the start/end (depending on direction) of the topological sort.
/// This means you can just add a second loop before/after the "real" one.
///
/// For instance, here's a simple program that recursively computes some property of packages
/// (here "whether serde is a transitive dependency"):
///
/// ```
/// use guppy::{CargoMetadata, graph::DependencyDirection};
/// use std::collections::HashMap;
///
/// let metadata = CargoMetadata::parse_json(include_str!("../../../fixtures/small/metadata1.json")).unwrap();
/// let package_graph = metadata.build_graph().unwrap();
/// let workspace_members = package_graph.resolve_workspace();
/// let dependency_graph = package_graph.query_workspace().resolve();
///
/// // Whether the "real" package uses serde
/// let mut package_uses_serde = HashMap::new();
/// // Whether the "dev" package uses serde
/// let mut dev_package_uses_serde = HashMap::new();
///
/// // Iterate over packages in reverse topo order (process dependencies first)
/// for package in dependency_graph.packages(DependencyDirection::Reverse) {
///     // A package uses serde if...
///     let uses_serde = if package.name() == "serde" {
///         // It is literally serde (base case)
///         true
///     } else {
///         // It has a non-dev-dependency on a package which uses serde
///         // (dev-dependencies handled in the second loop)
///         package.direct_links().any(|link| {
///             !link.dev_only() && package_uses_serde[link.to().id()]
///         })
///     };
///     // Record this package's result
///     package_uses_serde.insert(package.id(), uses_serde);
/// }
///
/// // Now iterate over the workspace members to handle their tests (if any)
/// // Note that DependencyDirection doesn't matter here, we're literally
/// // just looping over every workspace member in arbitrary order!
/// for package in workspace_members.packages(DependencyDirection::Reverse) {
///     // Check dev-packages using the "real" package results for all links!
///     let uses_serde = package.direct_links().any(|link| {
///         package_uses_serde[link.to().id()]
///     });
///     // Record this dev-package's result
///     dev_package_uses_serde.insert(package.id(), uses_serde);
/// }
///
/// // Now we have all the values computed!
/// for (id, &uses_serde) in &package_uses_serde {
///     if uses_serde {
///         let name = package_graph.metadata(id).unwrap().name();
///         println!("{name} uses serde!");
///     }
/// }
/// for (id, &uses_serde) in &dev_package_uses_serde {
///     if uses_serde {
///         let name = package_graph.metadata(id).unwrap().name();
///         println!("{name}'s tests use serde!");
///     }
/// }
/// ```
///
///
///
///
///
/// # Why Cargo Dependency Graphs Have Cycles
///
/// Dependency graphs are generally Directed Acyclic Graphs (DAGs), where each package
/// is a node and each dependency is an edge. These graphs are acyclic (contain no cycles)
/// because anything else would be a paradox -- how do you build X if it depends on itself?
/// You don't!
///
/// So why does this API exist? It wouldn't make sense for Cargo to have cycles!
///
/// The problem is that "the Cargo dependency graph" is actually two different graphs
/// at different levels of abstraction: The Package Graph (Guppy, cargo-metadata), and
/// The Build Graph (Cargo's internals). These two graphs are different because each
/// package is actually a bunch of different
/// [build targets in a trenchcoat][`crate::graph::PackageMetadata::build_targets`] -- libs,
/// bins, tests, benches, and so on. In The Build Graph these different build targets get
/// their own nodes. In The Package Graph all those targets get merged together into one
/// big node. The Build Graph is always a proper DAG, but The Package Graph can have cycles.
///
/// Thankfully these cycles can only be created by one specific (and rare) situation:
/// dev-dependencies. **A test/bench target for a package is allowed to indirectly
/// depend on the same package's lib/bin target, and this creates apparent cycles
/// in the package graph!** That's it!
///
/// As we'll see, **simply ignoring all dev-dependency edges eliminates all cycles
/// *and* preserves the ordering constraints of the dependency graph.**
///
///
///
/// # An Example Cyclic Workspace
///
/// As a concrete example, consider [the serde workspace][serde_github], which
/// actually has this "problem": there's a "cycle" between serde and serde_derive.
/// In normal builds this cycle doesn't exist: serde_derive is actually a standalone
/// crate, while [serde (optionally) pulls in serde_derive as a dependency][serde_toml].
/// The "cycle" only appears when testing serde_derive: [serde_derive's tests quite
/// reasonably depend on serde][serde_derive_toml] to test the proc-macro's output,
/// creating a cycle!
///
/// The way to resolve this monstrosity is to realize that the tests for serde_derive
/// are actually a completely different binary from the serde_derive *library*. Let's
/// call those tests serde_derive_dev. So although the (Package) graph reported by Guppy
/// (and cargo-metadata) looks like a cycle:
///
/// ```text
/// serde <-----+
///   |         |
///   |         |
///   +--> serde_derive
/// ```
///
/// In actuality, serde_derive_dev breaks the cycle and creates a nice clean DAG
/// (in The Build Graph):
///
/// ```text
///   +-- serde_derive_dev
///   |          |
///   v          |
/// serde        |
///   |          |
///   |          v
///   +---> serde_derive
/// ```
///
/// Here's the really important thing to notice: serde_derive_dev is actually a *root*
/// in The Build Graph, and this is always true! Nothing should ever depend on the *tests*
/// or *benchmarks* for another library.
///
/// This is the key insight to ignoring dev-dependency edges. As we'll see, the roots
/// (and leaves) of a DAG are in some sense "ignorable" by the rest of the graph,
/// because they can't change the ordering constraints between other packages.
///
///
///
/// # Topological Sort Is Great (And Composable)
///
/// Now that we understand *why* cycles can happen in the package graph, let's look at
/// what those cycles mess up, and how to deal with them.
///
/// One of the big reasons everyone loves DAGs is because you can get a Topological
/// Sort of them. Topological Sort
/// (with [`DependencyDirection::Forward`][`crate::graph::DependencyDirection::Forward`])
/// is just a fancy way of saying "a list where packages always appear before their dependencies"
/// (vice-versa for [`DependencyDirection::Reverse`][`crate::graph::DependencyDirection::Reverse`]).
///
/// This is really convenient! If you need to do things in "dependency order" you can just
/// topologically sort the packages and then boring old for-loops will magically get
/// everything done before it's needed.
///
/// Unfortunately, you can't get the Topological Sort of a graph with cycles because that
/// doesn't make sense. And yet, Guppy has
/// [several APIs which do exactly that][`crate::graph::PackageSet::packages`].
/// What gives? The docs say:
///
/// > The packages within a dependency cycle will be returned in non-dev order. When the
/// > direction is forward, if package Foo has a dependency on Bar, and Bar has a cyclic
/// > dev-dependency on Foo, then Foo is returned before Bar.
///
/// We just ignore the dev-dependency edges! Problem Solved.
///
/// But isn't this throwing out important information that could change the result? Nope!
///
/// As we saw in the previous section, all dev-builds are roots in The Build Graph.
/// Ignoring all dev-dependency edges is equivalent to deleting all of those roots.
/// This may "orphan" dependencies that are only used for dev-builds, but we still
/// keep them in the graph and properly include them in the sort.
///
/// As it turns out, you can recursively compute the topological sort of a graph as follows:
///
/// 1. delete a root (or leaf)
/// 2. compute the topological sort of the new graph
/// 3. append the root (or leaf) to the start (or end) of the list
///
/// **Even though we delete all the dev-nodes from the graph when doing our sort,
/// if you want to "add them back" the only thing you need to do is handle them before
/// (or after) everything else!** Even better: all the dev-builds are roots at the same
/// time, so you can process them in any order!
///
/// Just remember that every node with dev-dependencies is really two nodes: the "normal"
/// version without dev-dependencies, and the version with them. Exactly how you want
/// to express that notion in your code is up to you. (Two different loops is the simplest.)
///
///
///
///
/// # Reasoning About Cycles: Strongly Connected Components
///
/// Ok but wait, none of that involved Strongly Connected Components! Yeah, isn't that great? 😄
///
/// Oh you still want to "know" about the cycles? Then we've gotta bust out the heavy
/// general-purpose machinery. Thankfully the problem of cycles in directed graphs is
/// an old and well-studied problem with a conceptually simple solution: hide the cycle
/// in a box and pretend that it's just one Really Big Node in the DAG.
///
/// Yes, really, that's all that Strongly Connected Components are. More precisely, SCCs
/// are defined to be maximal sets of nodes such that "every node in an SCC can reach
/// every other node in that SCC" (a property which definitely holds for cycles).
/// The reason for this more complicated definition is that you can have a bunch of
/// cycles all knotted together in a nasty ball, and trying to tease out individual
/// cycles isn't really helpful. So we just wrap the whole ball of nodes up into one
/// big "I give up" box and forget about it!
///
/// Now, what does this get us?
///
/// The graph *between* Strongly Connected Components is *always* a DAG, so you can
/// always topologically sort *that*. In really nasty cases this is just vacuously
/// true (all the nodes end up in one SCC, and so the "Graph of SCCs" is just one big
/// unsorted node). On the other hand, if the graph already *is* a DAG then each node
/// is its own SCC, and so we lose nothing. In this way SCCs give us a way to preserve
/// all the *nice* parts of our graph while also isolating the problematic parts
/// (SCCs with more than 1 node) to something self-contained that we can handle specially.
///
/// In the general case, nothing more can be done to order an SCC. By definition every
/// node depends on every other node! But as we've seen in the previous section, there
/// actually *is* a good way to order packages even with cycles, and so we maintain
/// that ordering for our SCCs: it's just the topological sort with all the
/// dev-dependencies ignored.
///
///
///
///
/// [serde_github]: https://github.com/serde-rs/serde
/// [serde_toml]: https://github.com/serde-rs/serde/blob/072145e0e913df7686f001dbf29e43a0ff7afac4/serde/Cargo.toml#L17-L18
/// [serde_derive_toml]: https://github.com/serde-rs/serde/blob/072145e0e913df7686f001dbf29e43a0ff7afac4/serde_derive/Cargo.toml#L29-L30
pub struct Cycles<'g> {
    // The graph this view was created from. It is used to translate `PackageId`s into graph
    // indexes (`package_ix`) and graph indexes back into `PackageId`s (`dep_graph`).
    package_graph: &'g PackageGraph,
    // Strongly connected component data obtained from `package_graph.sccs()` at construction
    // time; all cycle queries are answered from it.
    sccs: &'g Sccs<PackageIx>,
}

impl<'g> Cycles<'g> {
    /// Builds a `Cycles` view over `package_graph`, borrowing the SCC data it exposes
    /// through `PackageGraph::sccs`.
    pub(super) fn new(package_graph: &'g PackageGraph) -> Self {
        let sccs = package_graph.sccs();
        Self {
            package_graph,
            sccs,
        }
    }

    /// Reports whether the two given package IDs belong to the same cycle.
    ///
    /// Put differently: this checks whether both packages live in the same Strongly
    /// Connected Component.
    ///
    /// # Errors
    ///
    /// Propagates the error from the graph's package lookup if either `a` or `b` cannot be
    /// resolved to a node in the package graph. `a` is looked up before `b`.
    pub fn is_cyclic(&self, a: &PackageId, b: &PackageId) -> Result<bool, Error> {
        let graph = self.package_graph;
        let first_ix = graph.package_ix(a)?;
        let second_ix = graph.package_ix(b)?;
        let same_component = self.sccs.is_same_scc(first_ix, second_ix);
        Ok(same_component)
    }

    /// Iterates over every Strongly Connected Component (SCC) in this graph that contains
    /// at least 2 packages.
    ///
    /// The SCCs come out in topological order: when packages in SCC B depend on packages in
    /// SCC A, then A is yielded before B.
    ///
    /// The packages inside a single SCC are listed in non-dev order: when package Foo depends
    /// on Bar, and Bar has a cyclic dev-dependency on Foo, then Foo is listed before Bar.
    ///
    /// The type-level documentation explains this in more depth.
    pub fn all_cycles(&self) -> impl DoubleEndedIterator<Item = Vec<&'g PackageId>> + 'g {
        // Hoist the graph reference so the returned iterator only holds `'g` borrows and
        // does not borrow from `self`.
        let nodes = &self.package_graph.dep_graph;
        let components = self.sccs.multi_sccs();
        components.map(move |members| {
            members
                .iter()
                .copied()
                // Each node weight in the dependency graph is the package's ID.
                .map(move |node_ix| &nodes[node_ix])
                .collect()
        })
    }
}