diff --git a/public/blog/lets-write-a-dht-3/just_bootstrap_1k_-_histogram_-_commonality_with_perfect_set_of_20_ids.png b/public/blog/lets-write-a-dht-3/just_bootstrap_1k_-_histogram_-_commonality_with_perfect_set_of_20_ids.png
new file mode 100644
index 00000000..5edf5a24
Binary files /dev/null and b/public/blog/lets-write-a-dht-3/just_bootstrap_1k_-_histogram_-_commonality_with_perfect_set_of_20_ids.png differ
diff --git a/public/blog/lets-write-a-dht-3/just_bootstrap_1k_-_histogram_-_routing_table_size_per_node.png b/public/blog/lets-write-a-dht-3/just_bootstrap_1k_-_histogram_-_routing_table_size_per_node.png
new file mode 100644
index 00000000..d7c88ffb
Binary files /dev/null and b/public/blog/lets-write-a-dht-3/just_bootstrap_1k_-_histogram_-_routing_table_size_per_node.png differ
diff --git a/public/blog/lets-write-a-dht-3/just_bootstrap_1k_-_histogram_-_storage_usage_per_node.png b/public/blog/lets-write-a-dht-3/just_bootstrap_1k_-_histogram_-_storage_usage_per_node.png
new file mode 100644
index 00000000..546c89bd
Binary files /dev/null and b/public/blog/lets-write-a-dht-3/just_bootstrap_1k_-_histogram_-_storage_usage_per_node.png differ
diff --git a/public/blog/lets-write-a-dht-3/just_bootstrap_1k_-_routing_table_size_per_node.png b/public/blog/lets-write-a-dht-3/just_bootstrap_1k_-_routing_table_size_per_node.png
new file mode 100644
index 00000000..8089a7a8
Binary files /dev/null and b/public/blog/lets-write-a-dht-3/just_bootstrap_1k_-_routing_table_size_per_node.png differ
diff --git a/public/blog/lets-write-a-dht-3/just_bootstrap_1k_-_storage_usage_per_node.png b/public/blog/lets-write-a-dht-3/just_bootstrap_1k_-_storage_usage_per_node.png
new file mode 100644
index 00000000..d6a558e9
Binary files /dev/null and b/public/blog/lets-write-a-dht-3/just_bootstrap_1k_-_storage_usage_per_node.png differ
diff --git a/public/blog/lets-write-a-dht-3/partition_1k.gif b/public/blog/lets-write-a-dht-3/partition_1k.gif
new file mode 100644
index 00000000..b030ef7d
Binary files /dev/null and b/public/blog/lets-write-a-dht-3/partition_1k.gif differ
diff --git a/public/blog/lets-write-a-dht-3/perfect_routing_tables_1k_-_histogram_-_commonality_with_perfect_set_of_20_ids.png b/public/blog/lets-write-a-dht-3/perfect_routing_tables_1k_-_histogram_-_commonality_with_perfect_set_of_20_ids.png
new file mode 100644
index 00000000..781bd591
Binary files /dev/null and b/public/blog/lets-write-a-dht-3/perfect_routing_tables_1k_-_histogram_-_commonality_with_perfect_set_of_20_ids.png differ
diff --git a/public/blog/lets-write-a-dht-3/perfect_routing_tables_1k_-_histogram_-_routing_table_size_per_node.png b/public/blog/lets-write-a-dht-3/perfect_routing_tables_1k_-_histogram_-_routing_table_size_per_node.png
new file mode 100644
index 00000000..79531614
Binary files /dev/null and b/public/blog/lets-write-a-dht-3/perfect_routing_tables_1k_-_histogram_-_routing_table_size_per_node.png differ
diff --git a/public/blog/lets-write-a-dht-3/perfect_routing_tables_1k_-_histogram_-_storage_usage_per_node.png b/public/blog/lets-write-a-dht-3/perfect_routing_tables_1k_-_histogram_-_storage_usage_per_node.png
new file mode 100644
index 00000000..6150d760
Binary files /dev/null and b/public/blog/lets-write-a-dht-3/perfect_routing_tables_1k_-_histogram_-_storage_usage_per_node.png differ
diff --git a/public/blog/lets-write-a-dht-3/perfect_routing_tables_1k_-_routing_table_size_per_node.png b/public/blog/lets-write-a-dht-3/perfect_routing_tables_1k_-_routing_table_size_per_node.png
new file mode 100644
index 00000000..7522e02b
Binary files /dev/null and b/public/blog/lets-write-a-dht-3/perfect_routing_tables_1k_-_routing_table_size_per_node.png differ
diff --git a/public/blog/lets-write-a-dht-3/perfect_routing_tables_1k_-_storage_usage_per_node.png b/public/blog/lets-write-a-dht-3/perfect_routing_tables_1k_-_storage_usage_per_node.png
new file mode 100644
index 00000000..193a0eb3
Binary files /dev/null and b/public/blog/lets-write-a-dht-3/perfect_routing_tables_1k_-_storage_usage_per_node.png differ
diff --git a/public/blog/lets-write-a-dht-3/remove_1k.gif b/public/blog/lets-write-a-dht-3/remove_1k.gif
new file mode 100644
index 00000000..6bfdec1e
Binary files /dev/null and b/public/blog/lets-write-a-dht-3/remove_1k.gif differ
diff --git a/public/blog/lets-write-a-dht-3/self_lookup_strategy-0_-_histogram_-_commonality_with_perfect_set_of_20_ids.png b/public/blog/lets-write-a-dht-3/self_lookup_strategy-0_-_histogram_-_commonality_with_perfect_set_of_20_ids.png
new file mode 100644
index 00000000..16210bb1
Binary files /dev/null and b/public/blog/lets-write-a-dht-3/self_lookup_strategy-0_-_histogram_-_commonality_with_perfect_set_of_20_ids.png differ
diff --git a/public/blog/lets-write-a-dht-3/self_lookup_strategy-9_-_histogram_-_commonality_with_perfect_set_of_20_ids.png b/public/blog/lets-write-a-dht-3/self_lookup_strategy-9_-_histogram_-_commonality_with_perfect_set_of_20_ids.png
new file mode 100644
index 00000000..ec795c99
Binary files /dev/null and b/public/blog/lets-write-a-dht-3/self_lookup_strategy-9_-_histogram_-_commonality_with_perfect_set_of_20_ids.png differ
diff --git a/public/blog/lets-write-a-dht-3/self_lookup_strategy-9_-_routing_table_size_per_node.png b/public/blog/lets-write-a-dht-3/self_lookup_strategy-9_-_routing_table_size_per_node.png
new file mode 100644
index 00000000..64f2065c
Binary files /dev/null and b/public/blog/lets-write-a-dht-3/self_lookup_strategy-9_-_routing_table_size_per_node.png differ
diff --git a/src/app/blog/lets-write-a-dht-1/page.mdx b/src/app/blog/lets-write-a-dht-1/page.mdx
new file mode 100644
index 00000000..fb4fc3b7
--- /dev/null
+++ b/src/app/blog/lets-write-a-dht-1/page.mdx
@@ -0,0 +1,235 @@
+import { BlogPostLayout } from '@/components/BlogPostLayout'
+import {ThemeImage} from '@/components/ThemeImage'
+
+export const post = {
+ draft: false,
+ author: 'Rüdiger Klaehn',
+ date: '2025-09-18',
+ title: 'A DHT for iroh',
+ description:
+ "Let's write a DHT for iroh - protocol",
+}
+
+export const metadata = {
+ title: post.title,
+ description: post.description,
+ openGraph: {
+ title: post.title,
+ description: post.description,
+ images: [{
+ url: `/api/og?title=Blog&subtitle=${post.title}`,
+ width: 1200,
+ height: 630,
+ alt: post.title,
+ type: 'image/png',
+ }],
+ type: 'article'
+ }
+}
+
+export default (props) =>
+
+Iroh blobs is a very efficient way to distribute data from one peer to another, and as of iroh-blobs 0.9x it even has some capabilities to get data from multiple nodes. But there is one crucial component missing for it to become a global, permissionless content distribution system: a content discovery service that tells you which nodes have which content.
+
+This is a very hard problem. Existing systems such as bittorrent solve it reasonably well, although they are not perfect.
+
+The standard solution for content discovery in systems such as bittorrent and IPFS is a Distributed Hash Table (DHT). This series of blog posts and the associated repository are an experiment: is it possible to write a high performance distributed hash table using iroh connections?
+
+The code is not yet production ready, but it is an interesting use case for many advanced techniques involving iroh connections, such as connection pools and 0rtt connections. It is also a nice way to show off irpc, both for *local* rpc to control a DHT node and for the DHT protocol itself.
+
+# What is a Distributed Hash Table
+
+Let's see what wikipedia says:
+
+"A distributed hash table (DHT) is a distributed system that provides a lookup service similar to a hash table. Key–value pairs are stored in a DHT, and any participating node can efficiently retrieve the value associated with a given key. The main advantage of a DHT is that nodes can be added or removed with minimum work around re-distributing keys."
+
+So a distributed hash table, seen as a black box, is just like a hash table, but spread over possibly millions of machines that are connected via possibly slow and fallible network connections. The algorithm needs to deal gracefully with nodes being slow or having high latency, nodes coming and going, and ideally even with some nodes that intentionally misbehave.
+
+## Keys
+
+Just like a normal hash table, a distributed hash table maps some key type to some value type. Keys in local hash tables can be of arbitrary size. The key that is actually used for lookup is a (e.g. 64 bit) hash of the key, and the hash table has additional logic to deal with rare but inevitable hash collisions. For distributed hash tables, typically you restrict the key to a fixed size and let the application deal with the mapping from the actual key to the hash table keyspace. E.g. the bittorrent mainline DHT uses a 20 byte keyspace, which is the size of a SHA1 hash. The main purpose of the mainline DHT is to find content providers for data based on a SHA1 hash of the data. But even with mainline there are cases where the actual key you want to look up is larger than the keyspace, e.g. [bep_0044] where you want to look up some information for an ED25519 public key. In that case mainline does exactly what you would do in a local hash table - it hashes the public key using SHA1 and then uses the hash as the lookup key.
+
+For iroh we are mainly interested in looking up content based on its BLAKE3 hash. Another use case for the DHT is to look up information for an iroh node id, which is an ED25519 public key. So it makes sense for a clean room implementation to choose a 32 byte keyspace. An arbitrary size key can be mapped to this keyspace using a cryptographic hash function with an astronomically low probability of collisions.
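+
+As a small illustration (not code from the actual implementation), mapping an arbitrarily sized application key into the 32 byte keyspace could look like this, using the `Id` type introduced below and the `blake3` crate:
+
+```rust
+/// Hypothetical helper: derive a DHT key from an arbitrarily sized application key.
+fn id_for_key(app_key: &[u8]) -> Id {
+    // BLAKE3 gives us 32 bytes with a negligible collision probability.
+    Id::from(*blake3::hash(app_key).as_bytes())
+}
+```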
+
+
+It is important to keep in mind that despite the main purpose of the DHT being BLAKE3 lookups, the key is just an arbitrary blob. You can put whatever you want into these 32 bytes: BLAKE3 hashes, SHA2 hashes, ED25519 public keys, whatever fits.
+
+Also, a DHT with a smaller keyspace such as mainline with its 20 bytes would still be completely viable provided that you had a way to store arbitrary values for a key. The reason to use 32 bytes is just convenience, since this is a completely separate implementation anyway.
+
+
+## Values
+
+In principle, values in a DHT can be of arbitrary size. But there are various reasons for wanting to keep the values small.
+
+First of all, we want requests to store and look up data to be small for efficiency. Ideally a storage or lookup request should fit into a single network packet even with the inevitable framing overhead. In QUIC, the minimum MTU (maximum transmission unit) is specified as 1200 bytes. So if a request consisting of key, value and some overhead fits into 1200 bytes, a request or response will be sent as a single non-fragmented UDP packet.
+
+Second, we rely on arbitrary nodes on the network to store data for us without being in any way compensated for it. So the values need to be small so a small DHT node can store many of them. If the value size was unlimited, people could and would abuse the DHT for storing actual data, which would put a lot of load on DHT nodes and would make it extremely unlikely that people would run DHT nodes without being compensated for it.
+
+For that reason, mature systems such as the mainline DHT restrict value size to 1000 bytes, and we are going to limit value size to 1024 bytes or 1KiB.
+
+You could write a DHT to store arbitrary values, but in almost all cases the value should have some relationship with the key. E.g. for mainline, the value in most cases is a set of socket addresses where you can download the data for the SHA1 hash of the key. So in principle you could validate the value by checking whether you can actually download the data from the socket addresses it contains. In some mainline extensions, like bep_0044, the key is the SHA1 hash of an ED25519 public key, and the value contains the actual public key, a signature computed from the corresponding private key, and some user data. Again, it is possible to validate the value based on the key - if the SHA1 hash of the public key contained in the value does not match the lookup key, the value is invalid for the key.
+
+## Storage
+
+Even disregarding all the distributed systems complexity, at the end of the day the data needs to live somewhere. Each node will store a fraction of the total data. So one component of a DHT node is just a remotely accessible key value store, where the key is a fixed size blob. There can be multiple values for a key. E.g. there can be multiple nodes that store the same data. For that reason the storage is a multimap. The storage layer needs to have some mechanism to limit the number of values for a key, typically by time based expiry and/or a maximum number of values per key.
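+
+As a sketch of what such limiting could look like (the constants and the `stored_at` timestamps are illustrative, not the actual implementation):
+
+```rust
+const MAX_VALUES_PER_KEY: usize = 32;
+const MAX_AGE_SECS: u64 = 3600;
+
+/// Hypothetical per-key limiting: drop expired values, then cap the number of values.
+fn insert_limited(values: &mut Vec<(u64, Value)>, now: u64, value: Value) {
+    // time based expiry
+    values.retain(|(stored_at, _)| now.saturating_sub(*stored_at) < MAX_AGE_SECS);
+    values.push((now, value));
+    // cap the number of values per key, dropping the oldest first
+    if values.len() > MAX_VALUES_PER_KEY {
+        values.sort_by_key(|(stored_at, _)| *stored_at);
+        let excess = values.len() - MAX_VALUES_PER_KEY;
+        values.drain(..excess);
+    }
+}
+```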
+
+## Routing
+
+As mentioned above, in a DHT not every node has all the data. So we need some mechanism to find which node has the data. Basically we need a way for a node to say "I don't have the data, but you might try my neighbours X and Y, they are more likely to have it". Independent of the exact routing algorithm, it is important to understand that routing is only concerned with keys, not with values. You first find the nodes that are most likely to have a value for a key, then ask these nodes for the actual value. So the two parts, routing and value storage/retrieval should be pretty separate. Adding new value types should be possible without having to touch the routing algorithm, and modifying the routing algorithm should be possible without having to think about values at all.
+
+## Kademlia
+
+The most popular routing algorithm for DHTs is [Kademlia]. The core idea of Kademlia is to define a [metric] that gives a scalar distance between any two keys (points in the metric space) that fulfills the metric axioms. DHT nodes have a node id that gets mapped to the metric space, and you store the data on the `k` nodes that are closest to the key.
+
+The metric chosen by Kademlia is the XOR metric: the distance of two keys `a` and `b` is simply the bitwise xor of the keys. This is absurdly cheap to compute and fulfills all the metric axioms. It also helps with sparse routing tables, as we will learn later.
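+
+As a sketch, here is the xor metric on our 32 byte keyspace and how it can be used to decide which of two node ids is closer to a key (not necessarily how the actual implementation represents distances):
+
+```rust
+/// The xor distance between two keys, interpreted as a big-endian 256 bit number.
+fn distance(a: &[u8; 32], b: &[u8; 32]) -> [u8; 32] {
+    std::array::from_fn(|i| a[i] ^ b[i])
+}
+
+/// True if node `a` is closer to `key` than node `b`. Comparing the byte arrays
+/// lexicographically is the same as comparing the distances as big-endian numbers.
+fn closer(key: &[u8; 32], a: &[u8; 32], b: &[u8; 32]) -> bool {
+    distance(key, a) < distance(key, b)
+}
+```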
+
+If a node had perfect knowledge of all other nodes in the network, it could give you a perfect answer to the question "where should I store the data for key `key`". Just sort the set of all keys that correspond to node ids by distance to the key and return the `k` smallest values. For small to medium DHTs this is a viable strategy, since modern computers can easily store millions of 32 byte keys without breaking a sweat. But for either extremely large DHTs or nodes with low memory requirements, it is desirable to store just a subset of all keys.
+
+
+It would be easily possible for a modern machine to keep the entire set of known node ids in memory even for very large DHTs. But you have to remember that routing is just a part of the work of the DHT, and also the DHT process should be cheap enough in terms of memory and storage that you can run it in the background without slowing the entire system down.
+
+
+A fixed size random sampling of the set of all known node ids would be viable and would work, but there are some downsides. For a completely random sampling, you do not have detailed knowledge of your immediate neighbours, so it would take many hops to find a better node if you are already close to the goal. If the sampling was only neighbours in terms of the distance metric, it would take you many hops to find a node that is far away. It turns out that the best distribution is a power law distribution where you know a lot of immediate neighbours but also some nodes that are far away.
+
+Imagine you wanted to find an arbitrary person, and your entire friend group was geographically close. It would take a lot of hops to find somebody far away. Now imagine your entire friend group was randomly spread around the world. It would take a lot of hops to find a neighbour that is not in your friend group. The ideal friend distribution is to know a lot of people in your village, but also some people in the next city, in neighbouring countries, and a few on different continents.
+
+Kademlia uses a routing table that remembers nodes based on proximity. It defines proximity buckets based on the number of leading zeros of the xor distance to your own node id. For a k-bit keyspace there are k proximity buckets, and for each of these so-called k-buckets you have a maximum number of nodes you will remember. This gives a fixed upper limit on the size of the routing table while automatically keeping a power law distribution. You will know almost all nodes in your immediate proximity, but also some nodes that are on the other side of the world in terms of the xor metric space. The way the bucketing is done is quite cheap, but other schemes would work just as well as long as you approximate a power law distribution. E.g. you could also have 32 buckets based on the number of leading zero *bytes* in the distance, and increase the bucket size.
+
+
+For our key size of 32 bytes or 256 bits, and a max bucket size of 20 keys, the maximum routing table size is 20 * 256 = 5120 nodes or 20 * 256 * 32 = 163840 bytes.
+
+
+## Lookup algorithm
+
+If each node had perfect knowledge of all other nodes on the network, lookup and storage would be just a two step process:
+
+1. Ask any node for the `k` closest nodes to key `x`.
+2. Store/look up the data on these `k` nodes.
+
+But since no node has perfect knowledge of the network, the lookup process needs to be iterative. We first use our local knowledge to find good candidates to ask, then ask them for the `k` closest nodes to our target `x` that they know. We then ask some of the resulting nodes the same question again, until we can no longer make an improvement. Basically we do a greedy downhill search until we arrive at a minimum, which due to the power law distribution of the nodes is hopefully the global minimum. There are some intricacies to this iterative algorithm. E.g. we always ask multiple nodes at each stage, while keeping the parallelism limited to avoid having too many concurrent connections. Fortunately these complexities don't leak into the protocol. All we need is a way to ask a node for the `k` closest nodes to some key `x` according to its local knowledge.
+
+Due to the limited size of the routing table and the cheapness of computing the distance, this query is extremely cheap to answer for a node. Both the query and the response are small enough to fit into an unfragmented UDP packet.
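+
+To make the shape of the iteration concrete, here is a heavily simplified sketch. It is sequential rather than parallel, `find_node(addr, key)` stands in for the FindNode RPC defined below, `distance` is the xor metric from the previous section, and the `as_bytes()` accessor on `Id` is assumed:
+
+```rust
+use std::collections::HashSet;
+
+// Heavily simplified sequential sketch: the real lookup queries several
+// nodes in parallel, with bounded concurrency and timeouts.
+async fn iterative_find_node(bootstrap: Vec<NodeAddr>, target: Id, k: usize) -> Vec<NodeAddr> {
+    let mut best = bootstrap; // current best candidates
+    let mut queried = HashSet::new();
+    loop {
+        // pick a candidate we have not asked yet; after the first round
+        // `best` is sorted by distance, so this is the closest unqueried node
+        let Some(next) = best.iter().find(|n| !queried.contains(&n.node_id)).cloned() else {
+            break; // nobody left to ask, we can not improve any further
+        };
+        queried.insert(next.node_id);
+        if let Ok(found) = find_node(&next, &target).await {
+            best.extend(found);
+            best.sort_by_key(|n| distance(n.node_id.as_bytes(), target.as_bytes()));
+            best.dedup_by_key(|n| n.node_id);
+            best.truncate(k);
+        }
+    }
+    best
+}
+```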
+
+## Routing table updates
+
+A key property of a DHT compared to more rigid algorithms is that nodes should be able to come and go at any time. So you update the routing table whenever you interact with another DHT node. In addition, you actively query the network with random keys to learn about new node ids. We will perform some experiments with various updating schemes later.
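+
+A sketch of such a background refresh could look like this; `DhtClient` and its `find_node` method are illustrative names, not the real API:
+
+```rust
+async fn refresh_loop(dht: DhtClient) {
+    loop {
+        // looking up a random key makes us talk to nodes spread over the whole
+        // keyspace, and every successful interaction updates the routing table
+        let random_key = Id::from(rand::random::<[u8; 32]>());
+        let _ = dht.find_node(random_key).await;
+        tokio::time::sleep(std::time::Duration::from_secs(60)).await;
+    }
+}
+```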
+
+# RPC protocol
+
+Now that we have a very rough idea of what a distributed hash table is meant to do, let's start defining the protocol that nodes will use to talk to each other. We are going to use [irpc] to define the protocol. This has the advantage that we can simulate a DHT consisting of thousands of nodes in memory for tests, and then use the same code with iroh connections as the underlying transport in production.
+
+First of all, we need a way to store and retrieve values. This is basically just a key value store API for a multimap. This protocol in isolation is sufficient to implement a tracker, a node that has full knowledge of what is where.
+
+
+Every type we use in the RPC protocol must be serializable so we can serialize it using [postcard]. Postcard is a non self-describing format, so we need to make sure to keep the order of the enum cases if we want the protocol to be long term stable. All rpc requests, responses and the overall rpc enum have the `#[derive(Debug, Serialize, Deserialize)]` annotation, but we will omit this from the examples below for brevity.
+
+
+## Values
+
+An id is just a 32 byte blob, with conversions from [iroh::NodeId](https://docs.rs/iroh/latest/iroh/type.NodeId.html) and [blake3::Hash](https://docs.rs/blake3/latest/blake3/struct.Hash.html).
+```rust
+pub struct Id([u8; 32]);
+```
+
+A value can be a bunch of different things, all related to the key. Either a provider for a BLAKE3 hash, a message signed by an Ed25519 key, or a tiny blob of BLAKE3 hashed data. Each of these variants corresponds to a mainline feature.
+```rust
+pub enum Value {
+ Blake3Provider(Blake3Provider),
+ ED25519SignedMessage(ED25519SignedMessage),
+ Blake3Immutable(Blake3Immutable),
+}
+```
+
+We want the ability to query only values of a certain kind, so we need a corresponding Kind enum:
+```rust
+pub enum Kind {
+ Blake3Provider,
+ ED25519SignedMessage,
+ Blake3Immutable,
+}
+```
+
+## KV store protocol
+
+For the kv store part of the DHT rpc protocol, we want to keep things extremely minimalistic. So we just need a way to set and get values.
+
+```rust
+pub struct Set {
+ pub key: Id,
+ pub value: Value,
+}
+
+pub struct GetAll {
+ pub key: Id,
+ pub kind: Kind,
+ ...
+}
+```
+
+Set allows us to ask the node to store a value. It might refuse to do so, but we can ask. GetAll allows us to get all values of a certain kind. That's it. So here is the storage and retrieval part of our protocol:
+
+```rust
+ #[rpc_requests(message = RpcMessage)]
+ pub enum RpcProto {
+ /// Set a key to a value.
+ #[rpc(tx = oneshot::Sender<SetResponse>)]
+ Set(Set),
+ /// Get all values of a certain kind for a key, as a stream of values.
+ #[rpc(tx = mpsc::Sender<Value>)]
+ GetAll(GetAll),
+ ... // routing part TBD
+ }
+```
+
+So let's take a look at the rpc annotations. `Set` is a normal RPC call with a single answer message of type `SetResponse`, which just contains some info about whether the set was successful and, if not, why. `GetAll` might return many values, so we use a response stream of `Value`s. The `#[rpc_requests(message = RpcMessage)]` annotation turns this enum into an irpc rpc protocol and defines a corresponding message enum. For details, see this [blog post about irpc].
+
+SetResponse is still missing. It's just an enum describing whether the set succeeded, and if not, why it failed. You might wonder why we don't use a `Result<(), SetError>`: you could do that, but you have to be aware that serializing detailed errors is sometimes a big pain, and also the exact details of the failure like the stack trace are nobody's business. We just give you a very rough idea why the request failed so you can decide whether to try again or not. E.g. `ErrFull` means you might try again a bit later, while `ErrInvalid` means that there is something wrong with the value, e.g. a signature error, and there is no point in trying again.
+
+```rust
+pub enum SetResponse {
+ Ok,
+ /// The key was too far away from the node id in terms of the DHT metric.
+ ErrDistance,
+ /// The value is too old.
+ ErrExpired,
+ /// The node does not have capacity to store the value.
+ ErrFull,
+ /// The value is invalid, e.g. the signature does not match the public key.
+ ErrInvalid,
+}
+```
+
+## Routing protocol
+
+The routing part of the protocol is a bit more interesting. We want the ability to query a node for its local knowledge of nodes for a key, so the message needs to contain the key we are looking for. But there is one more thing: if the requester is itself a DHT node, the callee might want to add this node id to its routing table. If the requester is a short lived client, we don't want its node id to be added to the routing table since asking it anything would be pointless. It is up to the callee to validate that the id is a valid, responsive DHT node and then update its routing table; all we do in the request is provide this information.
+
+Note that for a mem transport there is no way for the callee to check the requester node id. For the rpc protocol, the mem transport is only used in simulations where we trust the caller. When the request comes in via an iroh connection, we can do a quick check that the requester node id is the remote node id of the connection.
+
+```rust
+pub struct FindNode {
+ pub id: Id,
+ pub requester: Option<NodeId>,
+}
+```
+
+Now let's add this message to the rpc protocol:
+
+```rust
+#[rpc_requests(message = RpcMessage)]
+pub enum RpcProto {
+ ...
+ /// A request to query the routing table for the most natural locations for a key.
+ #[rpc(tx = oneshot::Sender<Vec<NodeAddr>>)]
+ FindNode(FindNode),
+}
+```
+
+The answer is just a sequence of iroh [NodeAddr]s, containing most importantly the node id of the nodes, but also information like the current home relay and possibly even socket addrs where you can attempt to dial the node. Of course we could rely on [node discovery] here, but DHTs will perform a lot of connections to a very large number of nodes, and the callee already has some information about how to dial the node, so we might as well include it to reduce the load on the node discovery system.
+
+
+An implementation detail: the routing table does not have to contain discovery information. It is purely about node ids. When we answer a FindNode request we enrich the node ids with the discovery information that is stored in the iroh endpoint.
+
+
+You might ask why this is not a streaming response but just a single Vec. The number of nodes a node will return in response to a FindNode query is very small (at most ~20), and is immediately available after querying the routing table. So there is no point in streaming this - 20 NodeAddrs with some discovery info will fit into 1 or 2 MTUs, so it is more efficient to send them all at once.
+
+And that's it. That is the entire rpc protocol. Many DHT implementations also add a `Ping` call, but since querying the routing table is so cheap, if you want to know if a node is alive you might as well ask it for the closest nodes to some random key and get some extra information for free.
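+
+For example, a liveness check could be a sketch as simple as this, assuming a thin client wrapper with a `find_node(key, requester)` method matching the RPC above (we will build such a wrapper in the next post):
+
+```rust
+/// Hypothetical helper: a node is considered alive if it successfully answers
+/// a FindNode request for a random key.
+async fn is_alive(client: &RpcClient, our_id: Option<NodeId>) -> bool {
+    let random_key = Id::from(rand::random::<[u8; 32]>());
+    client.find_node(random_key, our_id).await.is_ok()
+}
+```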
diff --git a/src/app/blog/lets-write-a-dht-2/page.mdx b/src/app/blog/lets-write-a-dht-2/page.mdx
new file mode 100644
index 00000000..db8b56fd
--- /dev/null
+++ b/src/app/blog/lets-write-a-dht-2/page.mdx
@@ -0,0 +1,587 @@
+import { BlogPostLayout } from '@/components/BlogPostLayout'
+import {ThemeImage} from '@/components/ThemeImage'
+
+export const post = {
+ draft: false,
+ author: 'Rüdiger Klaehn',
+ date: '2025-09-19',
+ title: 'A DHT for iroh',
+ description:
+ "Let's write a DHT for iroh - implementation",
+}
+
+export const metadata = {
+ title: post.title,
+ description: post.description,
+ openGraph: {
+ title: post.title,
+ description: post.description,
+ images: [{
+ url: `/api/og?title=Blog&subtitle=${post.title}`,
+ width: 1200,
+ height: 630,
+ alt: post.title,
+ type: 'image/png',
+ }],
+ type: 'article'
+ }
+}
+
+export default (props) =>
+
+## RPC client
+
+Using an irpc client directly is not exactly horrible, but we nevertheless want to add some sugar to make it easier to use. So we write a wrapper around the irpc client that makes using it more convenient. Set and FindNode are just async fns; GetAll returns a stream of responses.
+
+```rust
+impl RpcClient {
+ ...
+
+ pub async fn set(&self, key: Id, value: Value) -> irpc::Result<SetResponse> {
+ self.0.rpc(Set { key, value }).await
+ }
+
+ pub async fn get_all(
+ &self,
+ key: Id,
+ kind: Kind,
+ ) -> irpc::Result<mpsc::Receiver<Value>> {
+ self.0
+ .server_streaming(GetAll { key, kind }, 32)
+ .await
+ }
+
+ pub async fn find_node(
+ &self,
+ id: Id,
+ requester: Option<NodeId>,
+ ) -> irpc::Result<Vec<NodeAddr>> {
+ self.0.rpc(FindNode { id, requester }).await
+ }
+}
+```
+
+This client can now be used either with a remote node that is connected via a memory transport, or with a node that is connected via an iroh connection.
+
+## Storage implementation
+
+The first thing we have to implement for this protocol is the storage part. For this experiment we will use a very simple in-memory storage. This might even be a good idea for production! We have a limited value size, and DHTs are not persistent storage anyway. DHT records need to be continuously republished, so if a DHT node goes down, it will simply be repopulated with values shortly after coming back online.
+
+The only notable thing we do here is to store values of different kinds separately for simpler retrieval, and to use an [IndexSet] to keep the values sorted by insertion time.
+
+```rust
+struct MemStorage {
+ /// The DHT data storage, mapping keys to values.
+ /// Separated by kind to allow for efficient retrieval.
+ data: BTreeMap<Id, BTreeMap<Kind, IndexSet<Value>>>,
+}
+
+impl MemStorage {
+ fn new() -> Self {
+ Self {
+ data: BTreeMap::new(),
+ }
+ }
+
+ /// Set a value for a key.
+ fn set(&mut self, key: Id, value: Value) {
+ let kind = value.kind();
+ self.data
+ .entry(key)
+ .or_default()
+ .entry(kind)
+ .or_default()
+ .insert(value);
+ }
+
+ /// Get all values of a certain kind for a key.
+ fn get_all(&self, key: &Id, kind: &Kind) -> Option<&IndexSet<Value>> {
+ self.data.get(key).and_then(|kinds| kinds.get(kind))
+ }
+}
+```
+
+## Routing implementation
+
+Now it looks like we have run out of simple things to do and need to actually implement the routing part. The routing API does not care how the routing table is organized internally - it could just as well be the full set of nodes. But we want to implement the Kademlia algorithm to get that nice power law distribution.
+
+So let's define the routing table. First of all we need some simple integer arithmetic like xor and leading_zeros for 256 bit numbers. There are various crates that provide this, but since we don't need anything fancy like multiplication or division, we just quickly implemented it inline.
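+
+For reference, a minimal version of the two helpers we actually need could look like this (a sketch of the idea, not necessarily the exact code):
+
+```rust
+/// Bitwise xor of two 256 bit numbers represented as big-endian byte arrays.
+fn xor(a: &[u8; 32], b: &[u8; 32]) -> [u8; 32] {
+    std::array::from_fn(|i| a[i] ^ b[i])
+}
+
+/// Number of leading zero bits of a 256 bit number.
+fn leading_zeros(x: &[u8; 32]) -> usize {
+    let mut zeros = 0;
+    for byte in x {
+        if *byte == 0 {
+            zeros += 8;
+        } else {
+            zeros += byte.leading_zeros() as usize;
+            break;
+        }
+    }
+    zeros
+}
+```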
+
+The routing table itself is just a 2d array of node ids. Each row (k-bucket) has a small fixed upper size, so we are going to use the [ArrayVec] crate to avoid allocations. For each node id we keep just a tiny bit of extra information: a timestamp of when we last saw evidence that the node actually exists and responds, used to decide which nodes to check for liveness.
+
+A KBucket is tiny, so doing full scans for addition and removal is totally acceptable. We don't want any clever algorithms here.
+
+```rust
+struct NodeInfo {
+ pub id: NodeId,
+ pub last_seen: u64,
+}
+
+struct KBucket {
+ nodes: ArrayVec<NodeInfo, K>,
+}
+```
+
+The routing table data is now just one bucket per bit, so 256 buckets in our case where we have decided to bucket by leading zero *bits*:
+
+```rust
+struct Buckets([KBucket; 256]);
+```
+
+The only additional information we need for the routing table is our own node id. Data in the routing table is organized in terms of closeness to the local node id, so we frequently need to access the local node id when inserting data.
+
+```rust
+struct RoutingTable {
+ buckets: Box<Buckets>,
+ local_id: NodeId,
+}
+```
+
+
+Rust is very nice in that it allows you to write data structures with a lot of memory locality. That is one of the reasons for its good performance. But this sometimes comes with problems. E.g. our Buckets struct has a size of ~163840 bytes, so if you try to allocate it on the stack even temporarily you will get an instant stack overflow on systems with a small default stack size.
+
+Hence the `Box`, and you will sometimes have to jump through some hoops initializing a Buckets struct.
+
+The problem would go away if we were to use `Vec` instead of `ArrayVec`, but that would mean that the routing table data is spread all over the heap depending on heap fragmentation at the time of allocation.
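+
+One way to do the heap initialization without ever materializing the full array on the stack is to go through a `Vec` (a sketch; it assumes `KBucket` implements `Clone` and `Default`, and it returns a boxed array rather than the `Buckets` newtype):
+
+```rust
+fn new_buckets() -> Box<[KBucket; 256]> {
+    // build the 256 buckets on the heap...
+    let boxed_slice: Box<[KBucket]> = vec![KBucket::default(); 256].into_boxed_slice();
+    // ...then convert the boxed slice into a boxed array in place.
+    match boxed_slice.try_into() {
+        Ok(buckets) => buckets,
+        Err(_) => unreachable!("we allocated exactly 256 buckets"),
+    }
+}
+```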
+
+
+Now assuming that the system has some way to find valid DHT nodes, all we need is a way to insert nodes into the routing table, and to query the routing table for the `k` closest nodes to some key `x` to implement the FindNode rpc call.
+
+## Insertion
+
+Insertion means first computing which bucket the node should go into, and then inserting at that index. Computing the bucket index means computing the xor distance to our own node id, counting the leading zeros, and flipping the result around, since we want bucket 0 to contain the closest nodes and bucket 255 the furthest away nodes, as per Kademlia convention.
+
+```rust
+fn bucket_index(&self, target: &[u8; 32]) -> usize {
+ let distance = xor(&self.local_id.as_bytes(), target);
+ let zeros = leading_zeros(&distance);
+ if zeros >= BUCKET_COUNT {
+ 0 // Same node case
+ } else {
+ BUCKET_COUNT - 1 - zeros
+ }
+}
+
+fn add_node(&mut self, node: NodeInfo) -> bool {
+ if node.id == self.local_id {
+ return false;
+ }
+
+ let bucket_idx = self.bucket_index(node.id.as_bytes());
+ self.buckets[bucket_idx].add_node(node)
+}
+```
+
+Insertion into a bucket where the node already exists just means updating the timestamp. Otherwise we append the node; if there is no room, we either make room by evicting the oldest existing node, or ping the oldest node and fail the insertion if it responds. For now we just fail, favoring stability. Nodes will be pinged at regular intervals anyway, and nodes that are unresponsive will be purged.
+
+```rust
+impl KBucket {
+ fn add_node(&mut self, node: NodeInfo) -> bool {
+ // Check if node already exists and update it
+ for existing in &mut self.nodes {
+ if existing.id == node.id {
+ existing.last_seen = node.last_seen;
+ return true; // Updated existing node
+ }
+ }
+
+ // Add new node if space available
+ if self.nodes.len() < K {
+ self.nodes.push(node);
+ return true;
+ }
+
+ false // Bucket full
+ }
+}
+```
+
+As you can see this is a very simple implementation. Within the bucket we don't care about order.
+
+## Querying
+
+Since the xor metric is so simple, and the routing table is of limited size, it is not worth doing anything fancy when querying for a key. Conceptually we just create an array of nodes and distances, sort it by distance, take the `k` smallest, and that's it.
+
+Still, since this operation is performed very frequently, we did a few simple optimizations.
+
+```rust
+impl RoutingTable {
+ fn find_closest_nodes(&self, target: &Id, k: usize) -> Vec<NodeId> {
+ let mut candidates = Vec::with_capacity(self.nodes().count());
+ candidates.extend(
+ self.nodes()
+ .map(|node| Distance::between(target, node.id.as_bytes())),
+ );
+ if k < candidates.len() {
+ candidates.select_nth_unstable(k - 1);
+ candidates.truncate(k);
+ }
+ candidates.sort_unstable();
+
+ candidates
+ .into_iter()
+ .map(|dist| {
+ NodeId::from_bytes(&dist.inverse(target))
+ .expect("inverse called with different target than between")
+ })
+ .collect()
+ }
+}
+```
+
+We first create an array of candidates that contains all node ids in the routing table. This will almost always be larger than `k`.
+
+We could just sort it, but we are only interested in the order of the `k` smallest values, not the overall order. So we can save some comparisons by using [select_nth_unstable](https://doc.rust-lang.org/std/primitive.slice.html#method.select_nth_unstable) to sort such that the `k`th element is in the right place, then truncate and sort just the remaining `<= k` elements. We can always use an unstable sort since the xor distance is an injective function: no two nodes can have the same distance to the target id.
+
+As a last trick, instead of storing (id, distance) tuples we just store the distance itself while sorting, and recompute the node id itself by xor-ing again with the target id. This reduces the size of the temporary array by half.
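+
+For reference, the `Distance` helper used above could look roughly like this (a sketch; it assumes the `xor` helper from earlier and an `as_bytes()` accessor on `Id`):
+
+```rust
+/// The xor distance to some (implicit) target, ordered as a big-endian 256 bit number.
+#[derive(PartialEq, Eq, PartialOrd, Ord)]
+struct Distance([u8; 32]);
+
+impl Distance {
+    fn between(target: &Id, id: &[u8; 32]) -> Self {
+        Distance(xor(target.as_bytes(), id))
+    }
+
+    /// xor is its own inverse: (target ^ id) ^ target == id,
+    /// so we can recover the id from the distance and the same target.
+    fn inverse(&self, target: &Id) -> [u8; 32] {
+        xor(target.as_bytes(), &self.0)
+    }
+}
+```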
+
+
+We are treating find_closest_nodes as essentially free. As a justification for this, we wrote a microbenchmark that runs find_closest_nodes with `k=20` against a *full* routing table, something you will rarely see in the real world. It takes `94.690 µs` on average on my machine. So it might not be completely free, but compared to the networking overhead it is probably nothing to worry about!
+
+
+## Wiring it up
+
+The handler for our rpc protocol is a typical rust actor. The actor has the mem storage as well as the routing table as state, and processes messages one by one. If the storage was persistent, you might want to perform the actual storage and retrieval as well as the
+sending of the response stream in a background task, but for now it is all sequential.
+
+There are some background tasks to update the routing table to add new nodes and forget unreachable nodes, but these are omitted for now.
+
+```rust
+struct Node {
+ routing_table: RoutingTable,
+ storage: MemStorage,
+}
+
+struct Actor {
+ node: Node,
+ /// receiver for rpc messages from the network
+ rpc_rx: tokio::sync::mpsc::Receiver<RpcMessage>,
+ ... more plumbing for background tasks
+}
+
+impl Actor {
+ async fn run(mut self) {
+ loop {
+ tokio::select! {
+ msg = self.rpc_rx.recv() => {
+ if let Some(msg) = msg {
+ self.handle_rpc(msg).await;
+ } else {
+ break;
+ }
+ }
+ ... other background tasks and stuff
+ }
+ }
+ }
+
+ async fn handle_rpc(&mut self, message: RpcMessage) {
+ match message {
+ RpcMessage::Set(msg) => {
+ // msg validation omitted
+ self.node.storage.set(msg.key, msg.value.clone());
+ msg.tx.send(SetResponse::Ok).await.ok();
+ }
+ RpcMessage::GetAll(msg) => {
+ let Some(values) = self.node.storage.get_all(&msg.key, &msg.kind) else {
+ return;
+ };
+ // sampling values and randomizing omitted
+ for value in values {
+ if msg.tx.send(value.clone()).await.is_err() {
+ break;
+ }
+ }
+ }
+ RpcMessage::FindNode(msg) => {
+ // call local find_node and just return the results
+ let ids = self
+ .node
+ .routing_table
+ .find_closest_nodes(&msg.id, self.state.config.k)
+ .into_iter()
+ .map(|id| self.state.pool.node_addr(id))
+ .collect();
+ msg.tx.send(ids).await.ok();
+ }
+ }
+ }
+}
+```
+
+Set is trivial. It just sets the value and returns Ok to the requester. There is some logic to validate the value based on the key, but this has been omitted here.
+
+GetAll is a bit more complex. It queries the storage for values, then does some limiting and randomizing (omitted here), and then streams out the responses.
+
+FindNode queries the routing table and gets back a sequence of node ids. It then augments this information with dialing information from the connection pool (a wrapper around an iroh endpoint) and sends out the response all at once.
+
+What we have now is an actor that stores values and maintains a routing table. All rpc operations are fully local; there is no way for a remote node to trigger something expensive.
+
+The next step is to implement the iterative lookup algorithm. Once we have that, storage and retrieval are just calls to the `k` closest nodes to a key that are the result of the iterative lookup algorithm.
+
+Both storage and retrieval involve *a lot* of network operations. To hide all these details from the user, we will need a message based protocol that the DHT client uses to communicate with the DHT actor. This will also be an irpc protocol, but it will be used either in memory or to control a DHT node running in a different process on a local machine, so it does not have to concern itself as much with having small messages and with adversarial scenarios.
+
+We also don't have to care about stability, since this will be used only between the same version of the binary.
+
+As mentioned, the main complexity of a DHT is the routing. Which values we store almost doesn't matter, as long as they can be validated somehow and are small enough to fit. So for testing, we are going to implement just storage of immutable small blobs.
+
+We need the ability to store and retrieve such blobs, and for the user facing API we don't care about nodes. All these details are for the DHT to sort out internally. So let's design the API.
+
+The API protocol will also contain internal messages that the DHT needs for periodic tasks. We can just hide them from the public API wrapper if we don't want our users to mess with internals.
+
+```rust
+#[rpc_requests(message = ApiMessage)]
+#[derive(Debug, Serialize, Deserialize)]
+pub enum ApiProto {
+ #[rpc(wrap, tx = mpsc::Sender<NodeId>)]
+ NetworkPut { id: Id, value: Value },
+ #[rpc(wrap, tx = mpsc::Sender<(NodeId, Value)>)]
+ NetworkGet { id: Id, kind: Kind },
+ ... plumbing rpc calls
+}
+```
+
+We need the ability to store and retrieve values.
+
+Storing values is a two step process: first use the iterative algorithm to find the `k` closest nodes, then, in parallel, try to store the value on all of them. To give the user some feedback about where the data is stored, we return a stream of node ids where the data was successfully stored.
+
+Retrieval is almost identical. We first find the `k` closest nodes, then, in parallel, ask all of them for the value. Again we return a stream of (NodeId, Value) so we can get answers to the user as soon as they become available.
+
+In the case of immutable values, the first validated value is all it takes; as soon as we have one, we can abort the operation. For other values we might want to wait for all results and then choose the most recent one, or use them all, e.g. to retrieve content over iroh-blobs from multiple sources.
+
+Here is the ApiClient for the get_immutable and put_immutable rpc calls:
+
+```rust
+async fn put_immutable(
+ &self,
+ value: &[u8],
+) -> irpc::Result<(blake3::Hash, Vec<NodeId>)> {
+ let hash = blake3::hash(value);
+ let id = Id::from(*hash.as_bytes());
+ let mut rx = self
+ .0
+ .server_streaming(
+ NetworkPut {
+ id,
+ value: Value::Blake3Immutable(Blake3Immutable {
+ timestamp: now(),
+ data: value.to_vec(),
+ }),
+ },
+ 32,
+ )
+ .await?;
+ let mut res = Vec::new();
+ loop {
+ match rx.recv().await {
+ Ok(Some(id)) => res.push(id),
+ Ok(None) => break,
+ Err(_) => {}
+ }
+ }
+ Ok((hash, res))
+}
+
+async fn get_immutable(&self, hash: blake3::Hash) -> irpc::Result