Improve how node roles are assigned in Garage

- change the terminology: the network configuration becomes the role
  table, the configuration of a nodes becomes a node's role
- the modification of the role table takes place in two steps: first,
  changes are staged in a CRDT data structure. Then, once the user is
  happy with the changes, they can commit them all at once (or revert
  them).
- update documentation
- fix tests
- implement smarter partition assignation algorithm

This patch breaks the format of the network configuration: when
migrating, the cluster will be in a state where no roles are assigned.
All roles must be re-assigned and commited at once. This migration
should not pose an issue.
This commit is contained in:
Alex Auvolat 2021-11-09 12:24:04 +01:00
parent 53888995bd
commit c94406f428
No known key found for this signature in database
GPG key ID: EDABF9711E244EB1
42 changed files with 1430 additions and 557 deletions

15
Cargo.lock generated
View file

@ -379,7 +379,7 @@ dependencies = [
[[package]]
name = "garage"
version = "0.4.0"
version = "0.5.0"
dependencies = [
"async-trait",
"bytes 1.1.0",
@ -408,7 +408,7 @@ dependencies = [
[[package]]
name = "garage_api"
version = "0.4.0"
version = "0.5.0"
dependencies = [
"base64",
"bytes 1.1.0",
@ -440,7 +440,7 @@ dependencies = [
[[package]]
name = "garage_model"
version = "0.4.0"
version = "0.5.0"
dependencies = [
"arc-swap",
"async-trait",
@ -462,7 +462,7 @@ dependencies = [
[[package]]
name = "garage_rpc"
version = "0.4.0"
version = "0.5.0"
dependencies = [
"arc-swap",
"async-trait",
@ -479,6 +479,7 @@ dependencies = [
"rand",
"rmp-serde 0.15.5",
"serde",
"serde_bytes",
"serde_json",
"tokio",
"tokio-stream",
@ -486,7 +487,7 @@ dependencies = [
[[package]]
name = "garage_table"
version = "0.4.0"
version = "0.5.0"
dependencies = [
"async-trait",
"bytes 1.1.0",
@ -506,7 +507,7 @@ dependencies = [
[[package]]
name = "garage_util"
version = "0.4.0"
version = "0.5.0"
dependencies = [
"blake2",
"chrono",
@ -530,7 +531,7 @@ dependencies = [
[[package]]
name = "garage_web"
version = "0.4.0"
version = "0.5.0"
dependencies = [
"err-derive 0.3.0",
"futures",

View file

@ -40,13 +40,13 @@ in
{
cargo2nixVersion = "0.9.0";
workspace = {
garage_util = rustPackages.unknown.garage_util."0.4.0";
garage_rpc = rustPackages.unknown.garage_rpc."0.4.0";
garage_table = rustPackages.unknown.garage_table."0.4.0";
garage_model = rustPackages.unknown.garage_model."0.4.0";
garage_api = rustPackages.unknown.garage_api."0.4.0";
garage_web = rustPackages.unknown.garage_web."0.4.0";
garage = rustPackages.unknown.garage."0.4.0";
garage_util = rustPackages.unknown.garage_util."0.5.0";
garage_rpc = rustPackages.unknown.garage_rpc."0.5.0";
garage_table = rustPackages.unknown.garage_table."0.5.0";
garage_model = rustPackages.unknown.garage_model."0.5.0";
garage_api = rustPackages.unknown.garage_api."0.5.0";
garage_web = rustPackages.unknown.garage_web."0.5.0";
garage = rustPackages.unknown.garage."0.5.0";
};
"registry+https://github.com/rust-lang/crates.io-index".aho-corasick."0.7.18" = overridableMkRustCrate (profileName: rec {
name = "aho-corasick";
@ -246,7 +246,7 @@ in
registry = "registry+https://github.com/rust-lang/crates.io-index";
src = fetchCratesIo { inherit name version; sha256 = "95059428f66df56b63431fdb4e1947ed2190586af5c5a8a8b71122bdf5a7f469"; };
dependencies = {
${ if hostPlatform.config == "aarch64-apple-darwin" || hostPlatform.parsed.cpu.name == "aarch64" && hostPlatform.parsed.kernel.name == "linux" then "libc" else null } = rustPackages."registry+https://github.com/rust-lang/crates.io-index".libc."0.2.103" { inherit profileName; };
${ if hostPlatform.parsed.cpu.name == "aarch64" && hostPlatform.parsed.kernel.name == "linux" || hostPlatform.config == "aarch64-apple-darwin" then "libc" else null } = rustPackages."registry+https://github.com/rust-lang/crates.io-index".libc."0.2.103" { inherit profileName; };
};
});
@ -606,9 +606,9 @@ in
};
});
"unknown".garage."0.4.0" = overridableMkRustCrate (profileName: rec {
"unknown".garage."0.5.0" = overridableMkRustCrate (profileName: rec {
name = "garage";
version = "0.4.0";
version = "0.5.0";
registry = "unknown";
src = fetchCrateLocal (workspaceSrc + "/src/garage");
dependencies = {
@ -616,12 +616,12 @@ in
bytes = rustPackages."registry+https://github.com/rust-lang/crates.io-index".bytes."1.1.0" { inherit profileName; };
futures = rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.17" { inherit profileName; };
futures_util = rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures-util."0.3.17" { inherit profileName; };
garage_api = rustPackages."unknown".garage_api."0.4.0" { inherit profileName; };
garage_model = rustPackages."unknown".garage_model."0.4.0" { inherit profileName; };
garage_rpc = rustPackages."unknown".garage_rpc."0.4.0" { inherit profileName; };
garage_table = rustPackages."unknown".garage_table."0.4.0" { inherit profileName; };
garage_util = rustPackages."unknown".garage_util."0.4.0" { inherit profileName; };
garage_web = rustPackages."unknown".garage_web."0.4.0" { inherit profileName; };
garage_api = rustPackages."unknown".garage_api."0.5.0" { inherit profileName; };
garage_model = rustPackages."unknown".garage_model."0.5.0" { inherit profileName; };
garage_rpc = rustPackages."unknown".garage_rpc."0.5.0" { inherit profileName; };
garage_table = rustPackages."unknown".garage_table."0.5.0" { inherit profileName; };
garage_util = rustPackages."unknown".garage_util."0.5.0" { inherit profileName; };
garage_web = rustPackages."unknown".garage_web."0.5.0" { inherit profileName; };
git_version = rustPackages."registry+https://github.com/rust-lang/crates.io-index".git-version."0.3.5" { inherit profileName; };
hex = rustPackages."registry+https://github.com/rust-lang/crates.io-index".hex."0.4.3" { inherit profileName; };
sodiumoxide = rustPackages."registry+https://github.com/rust-lang/crates.io-index".kuska-sodiumoxide."0.2.5-0" { inherit profileName; };
@ -638,9 +638,9 @@ in
};
});
"unknown".garage_api."0.4.0" = overridableMkRustCrate (profileName: rec {
"unknown".garage_api."0.5.0" = overridableMkRustCrate (profileName: rec {
name = "garage_api";
version = "0.4.0";
version = "0.5.0";
registry = "unknown";
src = fetchCrateLocal (workspaceSrc + "/src/api");
dependencies = {
@ -651,9 +651,9 @@ in
err_derive = buildRustPackages."registry+https://github.com/rust-lang/crates.io-index".err-derive."0.3.0" { profileName = "__noProfile"; };
futures = rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.17" { inherit profileName; };
futures_util = rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures-util."0.3.17" { inherit profileName; };
garage_model = rustPackages."unknown".garage_model."0.4.0" { inherit profileName; };
garage_table = rustPackages."unknown".garage_table."0.4.0" { inherit profileName; };
garage_util = rustPackages."unknown".garage_util."0.4.0" { inherit profileName; };
garage_model = rustPackages."unknown".garage_model."0.5.0" { inherit profileName; };
garage_table = rustPackages."unknown".garage_table."0.5.0" { inherit profileName; };
garage_util = rustPackages."unknown".garage_util."0.5.0" { inherit profileName; };
hex = rustPackages."registry+https://github.com/rust-lang/crates.io-index".hex."0.4.3" { inherit profileName; };
hmac = rustPackages."registry+https://github.com/rust-lang/crates.io-index".hmac."0.10.1" { inherit profileName; };
http = rustPackages."registry+https://github.com/rust-lang/crates.io-index".http."0.2.5" { inherit profileName; };
@ -673,9 +673,9 @@ in
};
});
"unknown".garage_model."0.4.0" = overridableMkRustCrate (profileName: rec {
"unknown".garage_model."0.5.0" = overridableMkRustCrate (profileName: rec {
name = "garage_model";
version = "0.4.0";
version = "0.5.0";
registry = "unknown";
src = fetchCrateLocal (workspaceSrc + "/src/model");
dependencies = {
@ -683,9 +683,9 @@ in
async_trait = buildRustPackages."registry+https://github.com/rust-lang/crates.io-index".async-trait."0.1.51" { profileName = "__noProfile"; };
futures = rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.17" { inherit profileName; };
futures_util = rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures-util."0.3.17" { inherit profileName; };
garage_rpc = rustPackages."unknown".garage_rpc."0.4.0" { inherit profileName; };
garage_table = rustPackages."unknown".garage_table."0.4.0" { inherit profileName; };
garage_util = rustPackages."unknown".garage_util."0.4.0" { inherit profileName; };
garage_rpc = rustPackages."unknown".garage_rpc."0.5.0" { inherit profileName; };
garage_table = rustPackages."unknown".garage_table."0.5.0" { inherit profileName; };
garage_util = rustPackages."unknown".garage_util."0.5.0" { inherit profileName; };
hex = rustPackages."registry+https://github.com/rust-lang/crates.io-index".hex."0.4.3" { inherit profileName; };
log = rustPackages."registry+https://github.com/rust-lang/crates.io-index".log."0.4.14" { inherit profileName; };
netapp = rustPackages."registry+https://github.com/rust-lang/crates.io-index".netapp."0.3.0" { inherit profileName; };
@ -698,9 +698,9 @@ in
};
});
"unknown".garage_rpc."0.4.0" = overridableMkRustCrate (profileName: rec {
"unknown".garage_rpc."0.5.0" = overridableMkRustCrate (profileName: rec {
name = "garage_rpc";
version = "0.4.0";
version = "0.5.0";
registry = "unknown";
src = fetchCrateLocal (workspaceSrc + "/src/rpc");
dependencies = {
@ -709,7 +709,7 @@ in
bytes = rustPackages."registry+https://github.com/rust-lang/crates.io-index".bytes."1.1.0" { inherit profileName; };
futures = rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.17" { inherit profileName; };
futures_util = rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures-util."0.3.17" { inherit profileName; };
garage_util = rustPackages."unknown".garage_util."0.4.0" { inherit profileName; };
garage_util = rustPackages."unknown".garage_util."0.5.0" { inherit profileName; };
gethostname = rustPackages."registry+https://github.com/rust-lang/crates.io-index".gethostname."0.2.1" { inherit profileName; };
hex = rustPackages."registry+https://github.com/rust-lang/crates.io-index".hex."0.4.3" { inherit profileName; };
hyper = rustPackages."registry+https://github.com/rust-lang/crates.io-index".hyper."0.14.13" { inherit profileName; };
@ -719,15 +719,16 @@ in
rand = rustPackages."registry+https://github.com/rust-lang/crates.io-index".rand."0.8.4" { inherit profileName; };
rmp_serde = rustPackages."registry+https://github.com/rust-lang/crates.io-index".rmp-serde."0.15.5" { inherit profileName; };
serde = rustPackages."registry+https://github.com/rust-lang/crates.io-index".serde."1.0.130" { inherit profileName; };
serde_bytes = rustPackages."registry+https://github.com/rust-lang/crates.io-index".serde_bytes."0.11.5" { inherit profileName; };
serde_json = rustPackages."registry+https://github.com/rust-lang/crates.io-index".serde_json."1.0.68" { inherit profileName; };
tokio = rustPackages."registry+https://github.com/rust-lang/crates.io-index".tokio."1.12.0" { inherit profileName; };
tokio_stream = rustPackages."registry+https://github.com/rust-lang/crates.io-index".tokio-stream."0.1.7" { inherit profileName; };
};
});
"unknown".garage_table."0.4.0" = overridableMkRustCrate (profileName: rec {
"unknown".garage_table."0.5.0" = overridableMkRustCrate (profileName: rec {
name = "garage_table";
version = "0.4.0";
version = "0.5.0";
registry = "unknown";
src = fetchCrateLocal (workspaceSrc + "/src/table");
dependencies = {
@ -735,8 +736,8 @@ in
bytes = rustPackages."registry+https://github.com/rust-lang/crates.io-index".bytes."1.1.0" { inherit profileName; };
futures = rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.17" { inherit profileName; };
futures_util = rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures-util."0.3.17" { inherit profileName; };
garage_rpc = rustPackages."unknown".garage_rpc."0.4.0" { inherit profileName; };
garage_util = rustPackages."unknown".garage_util."0.4.0" { inherit profileName; };
garage_rpc = rustPackages."unknown".garage_rpc."0.5.0" { inherit profileName; };
garage_util = rustPackages."unknown".garage_util."0.5.0" { inherit profileName; };
hexdump = rustPackages."registry+https://github.com/rust-lang/crates.io-index".hexdump."0.1.1" { inherit profileName; };
log = rustPackages."registry+https://github.com/rust-lang/crates.io-index".log."0.4.14" { inherit profileName; };
rand = rustPackages."registry+https://github.com/rust-lang/crates.io-index".rand."0.8.4" { inherit profileName; };
@ -748,9 +749,9 @@ in
};
});
"unknown".garage_util."0.4.0" = overridableMkRustCrate (profileName: rec {
"unknown".garage_util."0.5.0" = overridableMkRustCrate (profileName: rec {
name = "garage_util";
version = "0.4.0";
version = "0.5.0";
registry = "unknown";
src = fetchCrateLocal (workspaceSrc + "/src/util");
dependencies = {
@ -775,18 +776,18 @@ in
};
});
"unknown".garage_web."0.4.0" = overridableMkRustCrate (profileName: rec {
"unknown".garage_web."0.5.0" = overridableMkRustCrate (profileName: rec {
name = "garage_web";
version = "0.4.0";
version = "0.5.0";
registry = "unknown";
src = fetchCrateLocal (workspaceSrc + "/src/web");
dependencies = {
err_derive = buildRustPackages."registry+https://github.com/rust-lang/crates.io-index".err-derive."0.3.0" { profileName = "__noProfile"; };
futures = rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.17" { inherit profileName; };
garage_api = rustPackages."unknown".garage_api."0.4.0" { inherit profileName; };
garage_model = rustPackages."unknown".garage_model."0.4.0" { inherit profileName; };
garage_table = rustPackages."unknown".garage_table."0.4.0" { inherit profileName; };
garage_util = rustPackages."unknown".garage_util."0.4.0" { inherit profileName; };
garage_api = rustPackages."unknown".garage_api."0.5.0" { inherit profileName; };
garage_model = rustPackages."unknown".garage_model."0.5.0" { inherit profileName; };
garage_table = rustPackages."unknown".garage_table."0.5.0" { inherit profileName; };
garage_util = rustPackages."unknown".garage_util."0.5.0" { inherit profileName; };
http = rustPackages."registry+https://github.com/rust-lang/crates.io-index".http."0.2.5" { inherit profileName; };
hyper = rustPackages."registry+https://github.com/rust-lang/crates.io-index".hyper."0.14.13" { inherit profileName; };
log = rustPackages."registry+https://github.com/rust-lang/crates.io-index".log."0.4.14" { inherit profileName; };

View file

@ -3,10 +3,18 @@ Garage [![Build Status](https://drone.deuxfleurs.fr/api/badges/Deuxfleurs/garage
<p align="center" style="text-align:center;">
<a href="https://garagehq.deuxfleurs.fr">
<img alt="Garage logo" src="doc/logo/garage.png" height="200" />
<img alt="Garage logo" src="https://garagehq.deuxfleurs.fr/img/logo.svg" height="200" />
</a>
</p>
<p align="center" style="text-align:center;">
[ <strong><a href="https://garagehq.deuxfleurs.fr/">Website and documentation</a></strong>
| <a href="https://garagehq.deuxfleurs.fr/_releases.html">Binary releases</a>
| <a href="https://git.deuxfleurs.fr/Deuxfleurs/garage">Git repository</a>
| <a href="https://matrix.to/#/%23garage:deuxfleurs.fr">Matrix channel</a>
]
</p>
Garage is a lightweight S3-compatible distributed object store, with the following goals:
- As self-contained as possible
@ -22,5 +30,3 @@ Non-goals include:
- Erasure coding (our replication model is simply to copy the data as is on several nodes, in different datacenters if possible)
Our main use case is to provide a distributed storage layer for small-scale self hosted services such as [Deuxfleurs](https://deuxfleurs.fr).
**[Go to the documentation](https://garagehq.deuxfleurs.fr)**

View file

@ -5,12 +5,12 @@
- [Quick start](./quick_start/index.md)
- [Cookbook](./cookbook/index.md)
- [Multi-node deployment](./cookbook/real_world.md)
- [Building from source](./cookbook/from_source.md)
- [Integration with systemd](./cookbook/systemd.md)
- [Gateways](./cookbook/gateways.md)
- [Exposing buckets as websites](./cookbook/exposing_websites.md)
- [Configuring a reverse proxy](./cookbook/reverse_proxy.md)
- [Production Deployment](./cookbook/real_world.md)
- [Recovering from failures](./cookbook/recovering.md)
- [Integrations](./connect/index.md)
@ -25,6 +25,7 @@
- [Reference Manual](./reference_manual/index.md)
- [Garage configuration file](./reference_manual/configuration.md)
- [Cluster layout management](./reference_manual/layout.md)
- [Garage CLI](./reference_manual/cli.md)
- [S3 compatibility status](./reference_manual/s3_compatibility.md)

View file

@ -21,7 +21,9 @@ Currently it will not work with minio client. Follow issue [#64](https://git.deu
The instructions are similar to a regular node, the only option that is different is while configuring the node, you must set the `--gateway` parameter:
```bash
garage node configure --gateway --tag gw1 xxxx
garage layout assign --gateway --tag gw1 <node_id>
garage layout show # review the changes you are making
garage layout apply # once satisfied, apply the changes
```
Then use `http://localhost:3900` when a S3 endpoint is required:
@ -29,3 +31,9 @@ Then use `http://localhost:3900` when a S3 endpoint is required:
```bash
aws --endpoint-url http://127.0.0.1:3900 s3 ls
```
If a newly added gateway node seems to not be working, do a full table resync to ensure that bucket and key list are correctly propagated:
```bash
garage repair -a --yes tables
```

View file

@ -41,15 +41,15 @@ For our example, we will suppose the following infrastructure with IPv6 connecti
## Get a Docker image
Our docker image is currently named `lxpz/garage_amd64` and is stored on the [Docker Hub](https://hub.docker.com/r/lxpz/garage_amd64/tags?page=1&ordering=last_updated).
Our docker image is currently named `dxflrs/amd64_garage` and is stored on the [Docker Hub](https://hub.docker.com/r/dxflrs/amd64_garage/tags?page=1&ordering=last_updated).
We encourage you to use a fixed tag (eg. `v0.4.0`) and not the `latest` tag.
For this example, we will use the latest published version at the time of the writing which is `v0.4.0` but it's up to you
to check [the most recent versions on the Docker Hub](https://hub.docker.com/r/lxpz/garage_amd64/tags?page=1&ordering=last_updated).
to check [the most recent versions on the Docker Hub](https://hub.docker.com/r/dxflrs/amd64_garage/tags?page=1&ordering=last_updated).
For example:
```
sudo docker pull lxpz/garage_amd64:v0.4.0
sudo docker pull dxflrs/amd64_garage:v0.4.0
```
## Deploying and configuring Garage
@ -144,7 +144,7 @@ At this point, nodes are not yet talking to one another.
Your output should therefore look like follows:
```
Mercury$ garage node-id
Mercury$ garage status
==== HEALTHY NODES ====
ID Hostname Address Tag Zone Capacity
563e1ac825ee3323… Mercury [fc00:1::1]:3901 NO ROLE ASSIGNED
@ -157,14 +157,14 @@ When your Garage nodes first start, they will generate a local node identifier
(based on a public/private key pair).
To obtain the node identifier of a node, once it is generated,
run `garage node-id`.
run `garage node id`.
This will print keys as follows:
```bash
Mercury$ garage node-id
Mercury$ garage node id
563e1ac825ee3323aa441e72c26d1030d6d4414aeb3dd25287c531e7fc2bc95d@[fc00:1::1]:3901
Venus$ garage node-id
Venus$ garage node id
86f0f26ae4afbd59aaf9cfb059eefac844951efd5b8caeec0d53f4ed6c85f332@[fc00:1::2]:3901
etc.
@ -191,20 +191,22 @@ ID Hostname Address Tag Zone Capa
212f7572f0c89da9… Mars [fc00:F::1]:3901 NO ROLE ASSIGNED
```
## Giving roles to nodes
## Creating a cluster layout
We will now inform Garage of the disk space available on each node of the cluster
as well as the zone (e.g. datacenter) in which each machine is located.
This information is called the **cluster layout** and consists
of a role that is assigned to each active cluster node.
For our example, we will suppose we have the following infrastructure
(Capacity, Identifier and Zone are specific values to Garage described in the following):
| Location | Name | Disk Space | `Capacity` | `Identifier` | `Zone` |
|----------|---------|------------|------------|--------------|--------------|
| Paris | Mercury | 1 To | `2` | `563e` | `par1` |
| Paris | Venus | 2 To | `4` | `86f0` | `par1` |
| London | Earth | 2 To | `4` | `6814` | `lon1` |
| Brussels | Mars | 1.5 To | `3` | `212f` | `bru1` |
| Paris | Mercury | 1 To | `10` | `563e` | `par1` |
| Paris | Venus | 2 To | `20` | `86f0` | `par1` |
| London | Earth | 2 To | `20` | `6814` | `lon1` |
| Brussels | Mars | 1.5 To | `15` | `212f` | `bru1` |
#### Node identifiers
@ -239,13 +241,9 @@ in order to provide high availability despite failure of a zone.
Garage reasons on an abstract metric about disk storage that is named the *capacity* of a node.
The capacity configured in Garage must be proportional to the disk space dedicated to the node.
Due to the way the Garage allocation algorithm works, capacity values must
be **integers**, and must be **as small as possible**, for instance with
1 representing the size of your smallest server.
Here we chose that 1 unit of capacity = 0.5 To, so that we can express servers of size
1 To and 2 To, as wel as the intermediate size 1.5 To, with the integer values 2, 4 and
3 respectively (see table above).
Capacity values must be **integers** but can be given any signification.
Here we chose that 1 unit of capacity = 100 GB.
Note that the amount of data stored by Garage on each server may not be strictly proportional to
its capacity value, as Garage will priorize having 3 copies of data in different zones,
@ -257,13 +255,29 @@ have 66% chance of being stored by Venus and 33% chance of being stored by Mercu
Given the information above, we will configure our cluster as follow:
```bash
garage layout assign -z par1 -c 10 -t mercury 563e
garage layout assign -z par1 -c 20 -t venus 86f0
garage layout assign -z lon1 -c 20 -t earth 6814
garage layout assign -z bru1 -c 15 -t mars 212f
```
garage node configure -z par1 -c 2 -t mercury 563e
garage node configure -z par1 -c 4 -t venus 86f0
garage node configure -z lon1 -c 4 -t earth 6814
garage node configure -z bru1 -c 3 -t mars 212f
At this point, the changes in the cluster layout have not yet been applied.
To show the new layout that will be applied, call:
```bash
garage layout show
```
Once you are satisfied with your new layout, apply it with:
```bash
garage layout apply
```
**WARNING:** if you want to use the layout modification commands in a script,
make sure to read [this page](/reference_manual/layout.html) first.
## Using your Garage cluster

View file

@ -28,8 +28,10 @@ and you should instead use one of the methods detailed in the next sections.
Removing a node is done with the following command:
```
garage node remove --yes <node_id>
```bash
garage layout remove <node_id>
garage layout show # review the changes you are making
garage layout apply # once satisfied, apply the changes
```
(you can get the `node_id` of the failed node by running `garage status`)
@ -50,7 +52,7 @@ We just need to tell Garage to get back all the data blocks and store them on th
First, set up a new HDD to store Garage's data directory on the failed node, and restart Garage using
the existing configuration. Then, run:
```
```bash
garage repair -a --yes blocks
```
@ -58,7 +60,7 @@ This will re-synchronize blocks of data that are missing to the new HDD, reading
You can check on the advancement of this process by doing the following command:
```
```bash
garage stats -a
```
@ -94,9 +96,11 @@ The ID of the lost node should be shown in `garage status` in the section for di
Then, replace the broken node by the new one, using:
```
garage node configure --replace <old_node_id> \
-c <capacity> -z <zone> -t <node_tag> <new_node_id>
```bash
garage layout assign <new_node_id> --replace <old_node_id> \
-c <capacity> -z <zone> -t <node_tag>
garage layout show # review the changes you are making
garage layout apply # once satisfied, apply the changes
```
Garage will then start synchronizing all required data on the new node.

View file

@ -18,10 +18,18 @@ This very website is hosted using Garage. In other words: the doc is the PoC!
# The Garage Geo-Distributed Data Store
Garage is a lightweight geo-distributed data store.
It comes from the observation that despite numerous object stores
many people have broken data management policies (backup/replication on a single site or none at all).
To promote better data management policies, we focused on the following **desirable properties**:
Garage is a lightweight geo-distributed data store that implements the
[Amazon S3](https://docs.aws.amazon.com/AmazonS3/latest/API/Welcome.html)
object storage protocole. It enables applications to store large blobs such
as pictures, video, images, documents, etc., in a redundant multi-node
setting. S3 is versatile enough to also be used to publish a static
website.
Garage comes from the observation that despite the numerous existing
implementation of object stores, many people have broken data management
policies (backup/replication on a single site or none at all). To promote
better data management policies, we focused on the following **desirable
properties**:
- **Self-contained & lightweight**: works everywhere and integrates well in existing environments to target [hyperconverged infrastructures](https://en.wikipedia.org/wiki/Hyper-converged_infrastructure).
- **Highly resilient**: highly resilient to network failures, network latency, disk failures, sysadmin failures.
@ -32,26 +40,19 @@ We also noted that the pursuit of some other goals are detrimental to our initia
The following has been identified as **non-goals** (if these points matter to you, you should not use Garage):
- **Extreme performances**: high performances constrain a lot the design and the infrastructure; we seek performances through minimalism only.
- **Feature extensiveness**: complete implementation of the S3 API or any other API to make garage a drop-in replacement is not targeted as it could lead to decisions impacting our desirable properties.
- **Feature extensiveness**: complete implementation of the S3 API or any other API to make Garage a drop-in replacement is not targeted as it could lead to decisions impacting our desirable properties.
- **Storage optimizations**: erasure coding or any other coding technique both increase the difficulty of placing data and synchronizing; we limit ourselves to duplication.
- **POSIX/Filesystem compatibility**: we do not aim at being POSIX compatible or to emulate any kind of filesystem. Indeed, in a distributed environment, such synchronizations are translated in network messages that impose severe constraints on the deployment.
## Supported and planned protocols
Garage speaks (or will speak) the following protocols:
- [S3](https://docs.aws.amazon.com/AmazonS3/latest/API/Welcome.html) - *SUPPORTED* - Enable applications to store large blobs such as pictures, video, images, documents, etc. S3 is versatile enough to also be used to publish a static website.
- [IMAP](https://github.com/go-pluto/pluto) - *PLANNED* - email storage is quite complex to get good performances.
To keep performances optimal, most IMAP servers only support on-disk storage.
We plan to add logic to Garage to make it a viable solution for email storage.
- *More to come*
## Use Cases
**[Deuxfleurs](https://deuxfleurs.fr):** Garage is used by Deuxfleurs which is a non-profit hosting organization.
Especially, it is used to host their main website, this documentation and some of its members' blogs.
Additionally, Garage is used as a [backend for Nextcloud](https://docs.nextcloud.com/server/20/admin_manual/configuration_files/primary_storage.html).
Deuxfleurs also plans to use Garage as their [Matrix's media backend](https://github.com/matrix-org/synapse-s3-storage-provider) and as the backend of [OCIS](https://github.com/owncloud/ocis).
**[Deuxfleurs](https://deuxfleurs.fr):** Garage is used by Deuxfleurs which
is a non-profit hosting organization. Especially, it is used to host their
main website, this documentation and some of its members' blogs.
Deuxfleurs also uses Garage as their [Matrix's media
backend](https://github.com/matrix-org/synapse-s3-storage-provider).
Deuxfleurs also uses it in its continuous integration platform to store
Drone's job logs and a Nix binary cache.
*Are you using Garage? [Open a pull request](https://git.deuxfleurs.fr/Deuxfleurs/garage/) to add your organization here!*

View file

@ -6,22 +6,23 @@ and how to interact with it.
Our goal is to introduce you to Garage's workflows.
Following this guide is recommended before moving on to
[configuring a real-world deployment](../cookbook/real_world.md).
[configuring a multi-node cluster](../cookbook/real_world.md).
Note that this kind of deployment should not be used in production, as it provides
no redundancy for your data!
Note that this kind of deployment should not be used in production,
as it provides no redundancy for your data!
## Get a binary
Download the latest Garage binary from the release pages on our repository:
<https://git.deuxfleurs.fr/Deuxfleurs/garage/releases>
<https://garagehq.deuxfleurs.fr/_releases.html>
Place this binary somewhere in your `$PATH` so that you can invoke the `garage`
command directly (for instance you can copy the binary in `/usr/local/bin`
or in `~/.local/bin`).
If a binary of the last version is not available for your architecture,
or if you want a build customized for your system,
you can [build Garage from source](../cookbook/from_source.md).
@ -109,9 +110,9 @@ ID Hostname Address Tag Zone Capacit
563e1ac825ee3323… linuxbox 127.0.0.1:3901 NO ROLE ASSIGNED
```
## Configuring your Garage node
## Creating a cluster layout
Configuring the nodes in a Garage deployment means informing Garage
Creating a cluster layout for a Garage deployment means informing Garage
of the disk space available on each node of the cluster
as well as the zone (e.g. datacenter) each machine is located in.
@ -119,14 +120,18 @@ For our test deployment, we are using only one node. The way in which we configu
it does not matter, you can simply write:
```bash
garage node configure -z dc1 -c 1 <node_id>
garage layout assign -z dc1 -c 1 <node_id>
```
where `<node_id>` corresponds to the identifier of the node shown by `garage status` (first column).
You can enter simply a prefix of that identifier.
For instance here you could write just `garage node configure -z dc1 -c 1 563e`.
For instance here you could write just `garage layout assign -z dc1 -c 1 563e`.
The layout then has to be applied to the cluster, using:
```bash
garage layout apply
```
## Creating buckets and keys
@ -197,7 +202,7 @@ Now that we have a bucket and a key, we need to give permissions to the key on t
```
garage bucket allow \
--read \
--write
--write \
nextcloud-bucket \
--key nextcloud-app-key
```
@ -270,5 +275,5 @@ The following tools can also be used to send and recieve files from/to Garage:
- [Cyberduck](https://cyberduck.io/)
- [`s3cmd`](https://s3tools.org/s3cmd)
Refer to the ["configuring clients"](../cookbook/clients.md) page to learn how to configure
these clients to interact with a Garage server.
Refer to the ["Integrations" section](../connect/index.md) to learn how to
configure application and command line utilities to integrate with Garage.

View file

@ -133,9 +133,9 @@ These peer identifiers have the following syntax:
In the case where `rpc_public_addr` is correctly specified in the
configuration file, the full identifier of a node including IP and port can
be obtained by running `garage node-id` and then included directly in the
be obtained by running `garage node id` and then included directly in the
`bootstrap_peers` list of other nodes. Otherwise, only the node's public
key will be returned by `garage node-id` and you will have to add the IP
key will be returned by `garage node id` and you will have to add the IP
yourself.
#### `consul_host` and `consul_service_name`

View file

@ -0,0 +1,74 @@
# Creating and updating a cluster layout
The cluster layout in Garage is a table that assigns to each node a role in
the cluster. The role of a node in Garage can either be a storage node with
a certain capacity, or a gateway node that does not store data and is only
used as an API entry point for faster cluster access.
An introduction to building cluster layouts can be found in the [production deployment](/cookbook/real_world.md) page.
## How cluster layouts work in Garage
In Garage, a cluster layout is composed of the following components:
- a table of roles assigned to nodes
- a version number
Garage nodes will always use the cluster layout with the highest version number.
Garage nodes also maintain and synchronize between them a set of proposed role
changes that haven't yet been applied. These changes will be applied (or
canceled) in the next version of the layout
The following commands insert modifications to the set of proposed role changes
for the next layout version (but they do not create the new layout immediately):
```bash
garage layout assign [...]
garage layout remove [...]
```
The following command can be used to inspect the layout that is currently set in the cluster
and the changes proposed for the next layout version, if any:
```bash
garage layout show
```
The following commands create a new layout with the specified version number,
that either takes into account the proposed changes or cancels them:
```bash
garage layout apply --version <new_version_number>
garage layout revert --version <new_version_number>
```
The version number of the new layout to create must be 1 + the version number
of the previous layout that existed in the cluster. The `apply` and `revert`
commands will fail otherwise.
## Warnings about Garage cluster layout management
**Warning: never make several calls to `garage layout apply` or `garage layout
revert` with the same value of the `--version` flag. Doing so can lead to the
creation of several different layouts with the same version number, in which
case your Garage cluster will become inconsistent until fixed.** If a call to
`garage layout apply` or `garage layout revert` has failed and `garage layout
show` indicates that a new layout with the given version number has not been
set in the cluster, then it is fine to call the command again with the same
version number.
If you are using the `garage` CLI by typing individual commands in your
shell, you shouldn't have much issues as long as you run commands one after
the other and take care of checking the output of `garage layout show`
before applying any changes.
If you are using the `garage` CLI to script layout changes, follow the following recommendations:
- Make all of your `garage` CLI calls to the same RPC host. Do not use the
`garage` CLI to connect to individual nodes to send them each a piece of the
layout changes you are making, as the changes propagate asynchronously
between nodes and might not all be taken into account at the time when the
new layout is applied.
- **Only call `garage layout apply` once**, and call it **strictly after** all
of the `layout assign` and `layout remove` commands have returned.

View file

@ -1,5 +1,7 @@
# Load Balancing Data (planned for version 0.2)
**This is being yet improved in release 0.5. The working document has not been updated yet, it still only applies to Garage 0.2 through 0.4.**
I have conducted a quick study of different methods to load-balance data over different Garage nodes using consistent hashing.
## Requirements

View file

@ -69,7 +69,7 @@ done
sleep 3
# Establish connections between nodes
for count in $(seq 1 3); do
NODE=$(garage -c /tmp/config.$count.toml node-id -q)
NODE=$(garage -c /tmp/config.$count.toml node id -q)
for count2 in $(seq 1 3); do
garage -c /tmp/config.$count2.toml node connect $NODE
done

View file

@ -25,6 +25,7 @@ garage -c /tmp/config.1.toml status \
| grep 'NO ROLE' \
| grep -Po '^[0-9a-f]+' \
| while read id; do
garage -c /tmp/config.1.toml node configure -z dc1 -c 1 $id
garage -c /tmp/config.1.toml layout assign $id -z dc1 -c 1
done
garage -c /tmp/config.1.toml layout apply --version 1

View file

@ -116,11 +116,11 @@ if [ -z "$SKIP_AWS" ]; then
echo "🧪 Website Testing"
echo "<h1>hello world</h1>" > /tmp/garage-index.html
aws s3 cp /tmp/garage-index.html s3://eprouvette/index.html
[ `curl -s -o /dev/null -w "%{http_code}" --header "Host: eprouvette.garage.tld" http://127.0.0.1:3923/ ` == 404 ]
[ `curl -s -o /dev/null -w "%{http_code}" --header "Host: eprouvette.garage.tld" http://127.0.0.1:3921/ ` == 404 ]
garage -c /tmp/config.1.toml bucket website --allow eprouvette
[ `curl -s -o /dev/null -w "%{http_code}" --header "Host: eprouvette.garage.tld" http://127.0.0.1:3923/ ` == 200 ]
[ `curl -s -o /dev/null -w "%{http_code}" --header "Host: eprouvette.garage.tld" http://127.0.0.1:3921/ ` == 200 ]
garage -c /tmp/config.1.toml bucket website --deny eprouvette
[ `curl -s -o /dev/null -w "%{http_code}" --header "Host: eprouvette.garage.tld" http://127.0.0.1:3923/ ` == 404 ]
[ `curl -s -o /dev/null -w "%{http_code}" --header "Host: eprouvette.garage.tld" http://127.0.0.1:3921/ ` == 404 ]
aws s3 rm s3://eprouvette/index.html
rm /tmp/garage-index.html
fi

View file

@ -1,11 +1,12 @@
[package]
name = "garage_api"
version = "0.4.0"
version = "0.5.0"
authors = ["Alex Auvolat <alex@adnab.me>"]
edition = "2018"
license = "AGPL-3.0"
description = "S3 API server crate for the Garage object store"
repository = "https://git.deuxfleurs.fr/Deuxfleurs/garage"
readme = "../../README.md"
[lib]
path = "lib.rs"
@ -13,9 +14,9 @@ path = "lib.rs"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
garage_model = { version = "0.4.0", path = "../model" }
garage_table = { version = "0.4.0", path = "../table" }
garage_util = { version = "0.4.0", path = "../util" }
garage_model = { version = "0.5.0", path = "../model" }
garage_table = { version = "0.5.0", path = "../table" }
garage_util = { version = "0.5.0", path = "../util" }
base64 = "0.13"
bytes = "1.0"

View file

@ -1,11 +1,12 @@
[package]
name = "garage"
version = "0.4.0"
version = "0.5.0"
authors = ["Alex Auvolat <alex@adnab.me>"]
edition = "2018"
license = "AGPL-3.0"
description = "Garage, an S3-compatible distributed object store for self-hosted deployments"
repository = "https://git.deuxfleurs.fr/Deuxfleurs/garage"
readme = "../../README.md"
[[bin]]
name = "garage"
@ -14,12 +15,12 @@ path = "main.rs"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
garage_api = { version = "0.4.0", path = "../api" }
garage_model = { version = "0.4.0", path = "../model" }
garage_rpc = { version = "0.4.0", path = "../rpc" }
garage_table = { version = "0.4.0", path = "../table" }
garage_util = { version = "0.4.0", path = "../util" }
garage_web = { version = "0.4.0", path = "../web" }
garage_api = { version = "0.5.0", path = "../api" }
garage_model = { version = "0.5.0", path = "../model" }
garage_rpc = { version = "0.5.0", path = "../rpc" }
garage_table = { version = "0.5.0", path = "../table" }
garage_util = { version = "0.5.0", path = "../util" }
garage_web = { version = "0.5.0", path = "../web" }
bytes = "1.0"
git-version = "0.3.4"

View file

@ -339,7 +339,7 @@ impl AdminRpcHandler {
let mut failures = vec![];
let ring = self.garage.system.ring.borrow().clone();
for node in ring.config.members.keys() {
for node in ring.layout.node_ids().iter() {
let node = (*node).into();
let resp = self
.endpoint
@ -383,7 +383,7 @@ impl AdminRpcHandler {
let mut ret = String::new();
let ring = self.garage.system.ring.borrow().clone();
for node in ring.config.members.keys() {
for node in ring.layout.node_ids().iter() {
let mut opt = opt.clone();
opt.all_nodes = false;

View file

@ -2,7 +2,7 @@ use std::collections::HashSet;
use garage_util::error::*;
use garage_rpc::ring::*;
use garage_rpc::layout::*;
use garage_rpc::system::*;
use garage_rpc::*;
@ -20,11 +20,8 @@ pub async fn cli_command_dispatch(
Command::Node(NodeOperation::Connect(connect_opt)) => {
cmd_connect(system_rpc_endpoint, rpc_host, connect_opt).await
}
Command::Node(NodeOperation::Configure(configure_opt)) => {
cmd_configure(system_rpc_endpoint, rpc_host, configure_opt).await
}
Command::Node(NodeOperation::Remove(remove_opt)) => {
cmd_remove(system_rpc_endpoint, rpc_host, remove_opt).await
Command::Layout(layout_opt) => {
cli_layout_command_dispatch(layout_opt, system_rpc_endpoint, rpc_host).await
}
Command::Bucket(bo) => {
cmd_admin(admin_rpc_endpoint, rpc_host, AdminRpc::BucketOperation(bo)).await
@ -48,56 +45,60 @@ pub async fn cmd_status(rpc_cli: &Endpoint<SystemRpc, ()>, rpc_host: NodeID) ->
SystemRpc::ReturnKnownNodes(nodes) => nodes,
resp => return Err(Error::Message(format!("Invalid RPC response: {:?}", resp))),
};
let config = match rpc_cli
.call(&rpc_host, &SystemRpc::PullConfig, PRIO_NORMAL)
.await??
{
SystemRpc::AdvertiseConfig(cfg) => cfg,
resp => return Err(Error::Message(format!("Invalid RPC response: {:?}", resp))),
};
let layout = fetch_layout(rpc_cli, rpc_host).await?;
println!("==== HEALTHY NODES ====");
let mut healthy_nodes = vec!["ID\tHostname\tAddress\tTag\tZone\tCapacity".to_string()];
let mut healthy_nodes = vec!["ID\tHostname\tAddress\tTags\tZone\tCapacity".to_string()];
for adv in status.iter().filter(|adv| adv.is_up) {
if let Some(cfg) = config.members.get(&adv.id) {
healthy_nodes.push(format!(
"{id:?}\t{host}\t{addr}\t[{tag}]\t{zone}\t{capacity}",
id = adv.id,
host = adv.status.hostname,
addr = adv.addr,
tag = cfg.tag,
zone = cfg.zone,
capacity = cfg.capacity_string(),
));
} else {
healthy_nodes.push(format!(
"{id:?}\t{h}\t{addr}\tNO ROLE ASSIGNED",
id = adv.id,
h = adv.status.hostname,
addr = adv.addr,
));
match layout.roles.get(&adv.id) {
Some(NodeRoleV(Some(cfg))) => {
healthy_nodes.push(format!(
"{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\t{capacity}",
id = adv.id,
host = adv.status.hostname,
addr = adv.addr,
tags = cfg.tags.join(","),
zone = cfg.zone,
capacity = cfg.capacity_string(),
));
}
_ => {
let new_role = match layout.staging.get(&adv.id) {
Some(NodeRoleV(Some(_))) => "(pending)",
_ => "NO ROLE ASSIGNED",
};
healthy_nodes.push(format!(
"{id:?}\t{h}\t{addr}\t{new_role}",
id = adv.id,
h = adv.status.hostname,
addr = adv.addr,
new_role = new_role,
));
}
}
}
format_table(healthy_nodes);
let status_keys = status.iter().map(|adv| adv.id).collect::<HashSet<_>>();
let failure_case_1 = status.iter().any(|adv| !adv.is_up);
let failure_case_2 = config
.members
let failure_case_2 = layout
.roles
.items()
.iter()
.any(|(id, _)| !status_keys.contains(id));
.filter(|(_, _, v)| v.0.is_some())
.any(|(id, _, _)| !status_keys.contains(id));
if failure_case_1 || failure_case_2 {
println!("\n==== FAILED NODES ====");
let mut failed_nodes =
vec!["ID\tHostname\tAddress\tTag\tZone\tCapacity\tLast seen".to_string()];
vec!["ID\tHostname\tAddress\tTags\tZone\tCapacity\tLast seen".to_string()];
for adv in status.iter().filter(|adv| !adv.is_up) {
if let Some(cfg) = config.members.get(&adv.id) {
if let Some(NodeRoleV(Some(cfg))) = layout.roles.get(&adv.id) {
failed_nodes.push(format!(
"{id:?}\t{host}\t{addr}\t[{tag}]\t{zone}\t{capacity}\t{last_seen}",
"{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\t{capacity}\t{last_seen}",
id = adv.id,
host = adv.status.hostname,
addr = adv.addr,
tag = cfg.tag,
tags = cfg.tags.join(","),
zone = cfg.zone,
capacity = cfg.capacity_string(),
last_seen = adv
@ -107,20 +108,28 @@ pub async fn cmd_status(rpc_cli: &Endpoint<SystemRpc, ()>, rpc_host: NodeID) ->
));
}
}
for (id, cfg) in config.members.iter() {
if !status_keys.contains(id) {
failed_nodes.push(format!(
"{id:?}\t??\t??\t[{tag}]\t{zone}\t{capacity}\tnever seen",
id = id,
tag = cfg.tag,
zone = cfg.zone,
capacity = cfg.capacity_string(),
));
for (id, _, role_v) in layout.roles.items().iter() {
if let NodeRoleV(Some(cfg)) = role_v {
if !status_keys.contains(id) {
failed_nodes.push(format!(
"{id:?}\t??\t??\t[{tags}]\t{zone}\t{capacity}\tnever seen",
id = id,
tags = cfg.tags.join(","),
zone = cfg.zone,
capacity = cfg.capacity_string(),
));
}
}
}
format_table(failed_nodes);
}
if print_staging_role_changes(&layout) {
println!();
println!("Please use `garage layout show` to check the proposed new layout and apply it.");
println!();
}
Ok(())
}
@ -141,115 +150,6 @@ pub async fn cmd_connect(
}
}
pub async fn cmd_configure(
rpc_cli: &Endpoint<SystemRpc, ()>,
rpc_host: NodeID,
args: ConfigureNodeOpt,
) -> Result<(), Error> {
let status = match rpc_cli
.call(&rpc_host, &SystemRpc::GetKnownNodes, PRIO_NORMAL)
.await??
{
SystemRpc::ReturnKnownNodes(nodes) => nodes,
resp => return Err(Error::Message(format!("Invalid RPC response: {:?}", resp))),
};
let added_node = find_matching_node(status.iter().map(|adv| adv.id), &args.node_id)?;
let mut config = match rpc_cli
.call(&rpc_host, &SystemRpc::PullConfig, PRIO_NORMAL)
.await??
{
SystemRpc::AdvertiseConfig(cfg) => cfg,
resp => return Err(Error::Message(format!("Invalid RPC response: {:?}", resp))),
};
for replaced in args.replace.iter() {
let replaced_node = find_matching_node(config.members.keys().cloned(), replaced)?;
if config.members.remove(&replaced_node).is_none() {
return Err(Error::Message(format!(
"Cannot replace node {:?} as it is not in current configuration",
replaced_node
)));
}
}
if args.capacity.is_some() && args.gateway {
return Err(Error::Message(
"-c and -g are mutually exclusive, please configure node either with c>0 to act as a storage node or with -g to act as a gateway node".into()));
}
if args.capacity == Some(0) {
return Err(Error::Message("Invalid capacity value: 0".into()));
}
let new_entry = match config.members.get(&added_node) {
None => {
let capacity = match args.capacity {
Some(c) => Some(c),
None if args.gateway => None,
_ => return Err(Error::Message(
"Please specify a capacity with the -c flag, or set node explicitly as gateway with -g".into())),
};
NetworkConfigEntry {
zone: args.zone.ok_or("Please specifiy a zone with the -z flag")?,
capacity,
tag: args.tag.unwrap_or_default(),
}
}
Some(old) => {
let capacity = match args.capacity {
Some(c) => Some(c),
None if args.gateway => None,
_ => old.capacity,
};
NetworkConfigEntry {
zone: args.zone.unwrap_or_else(|| old.zone.to_string()),
capacity,
tag: args.tag.unwrap_or_else(|| old.tag.to_string()),
}
}
};
config.members.insert(added_node, new_entry);
config.version += 1;
rpc_cli
.call(&rpc_host, &SystemRpc::AdvertiseConfig(config), PRIO_NORMAL)
.await??;
Ok(())
}
pub async fn cmd_remove(
rpc_cli: &Endpoint<SystemRpc, ()>,
rpc_host: NodeID,
args: RemoveNodeOpt,
) -> Result<(), Error> {
let mut config = match rpc_cli
.call(&rpc_host, &SystemRpc::PullConfig, PRIO_NORMAL)
.await??
{
SystemRpc::AdvertiseConfig(cfg) => cfg,
resp => return Err(Error::Message(format!("Invalid RPC response: {:?}", resp))),
};
let deleted_node = find_matching_node(config.members.keys().cloned(), &args.node_id)?;
if !args.yes {
return Err(Error::Message(format!(
"Add the flag --yes to really remove {:?} from the cluster",
deleted_node
)));
}
config.members.remove(&deleted_node);
config.version += 1;
rpc_cli
.call(&rpc_host, &SystemRpc::AdvertiseConfig(config), PRIO_NORMAL)
.await??;
Ok(())
}
pub async fn cmd_admin(
rpc_cli: &Endpoint<AdminRpc, ()>,
rpc_host: NodeID,
@ -283,5 +183,3 @@ pub async fn cmd_admin(
}
Ok(())
}
// --- Utility functions ----

340
src/garage/cli/layout.rs Normal file
View file

@ -0,0 +1,340 @@
use garage_util::crdt::Crdt;
use garage_util::data::*;
use garage_util::error::*;
use garage_rpc::layout::*;
use garage_rpc::system::*;
use garage_rpc::*;
use crate::cli::*;
pub async fn cli_layout_command_dispatch(
cmd: LayoutOperation,
system_rpc_endpoint: &Endpoint<SystemRpc, ()>,
rpc_host: NodeID,
) -> Result<(), Error> {
match cmd {
LayoutOperation::Assign(configure_opt) => {
cmd_assign_role(system_rpc_endpoint, rpc_host, configure_opt).await
}
LayoutOperation::Remove(remove_opt) => {
cmd_remove_role(system_rpc_endpoint, rpc_host, remove_opt).await
}
LayoutOperation::Show => cmd_show_layout(system_rpc_endpoint, rpc_host).await,
LayoutOperation::Apply(apply_opt) => {
cmd_apply_layout(system_rpc_endpoint, rpc_host, apply_opt).await
}
LayoutOperation::Revert(revert_opt) => {
cmd_revert_layout(system_rpc_endpoint, rpc_host, revert_opt).await
}
}
}
pub async fn cmd_assign_role(
rpc_cli: &Endpoint<SystemRpc, ()>,
rpc_host: NodeID,
args: AssignRoleOpt,
) -> Result<(), Error> {
let status = match rpc_cli
.call(&rpc_host, &SystemRpc::GetKnownNodes, PRIO_NORMAL)
.await??
{
SystemRpc::ReturnKnownNodes(nodes) => nodes,
resp => return Err(Error::Message(format!("Invalid RPC response: {:?}", resp))),
};
let added_node = find_matching_node(status.iter().map(|adv| adv.id), &args.node_id)?;
let mut layout = fetch_layout(rpc_cli, rpc_host).await?;
let mut roles = layout.roles.clone();
roles.merge(&layout.staging);
for replaced in args.replace.iter() {
let replaced_node = find_matching_node(layout.node_ids().iter().cloned(), replaced)?;
match roles.get(&replaced_node) {
Some(NodeRoleV(Some(_))) => {
layout
.staging
.merge(&roles.update_mutator(replaced_node, NodeRoleV(None)));
}
_ => {
return Err(Error::Message(format!(
"Cannot replace node {:?} as it is not currently in planned layout",
replaced_node
)));
}
}
}
if args.capacity.is_some() && args.gateway {
return Err(Error::Message(
"-c and -g are mutually exclusive, please configure node either with c>0 to act as a storage node or with -g to act as a gateway node".into()));
}
if args.capacity == Some(0) {
return Err(Error::Message("Invalid capacity value: 0".into()));
}
let new_entry = match roles.get(&added_node) {
Some(NodeRoleV(Some(old))) => {
let capacity = match args.capacity {
Some(c) => Some(c),
None if args.gateway => None,
None => old.capacity,
};
let tags = if args.tags.is_empty() {
old.tags.clone()
} else {
args.tags
};
NodeRole {
zone: args.zone.unwrap_or_else(|| old.zone.to_string()),
capacity,
tags,
}
}
_ => {
let capacity = match args.capacity {
Some(c) => Some(c),
None if args.gateway => None,
None => return Err(Error::Message(
"Please specify a capacity with the -c flag, or set node explicitly as gateway with -g".into())),
};
NodeRole {
zone: args.zone.ok_or("Please specifiy a zone with the -z flag")?,
capacity,
tags: args.tags,
}
}
};
layout
.staging
.merge(&roles.update_mutator(added_node, NodeRoleV(Some(new_entry))));
send_layout(rpc_cli, rpc_host, layout).await?;
println!("Role change is staged but not yet commited.");
println!("Use `garage layout show` to view staged role changes,");
println!("and `garage layout apply` to enact staged changes.");
Ok(())
}
pub async fn cmd_remove_role(
rpc_cli: &Endpoint<SystemRpc, ()>,
rpc_host: NodeID,
args: RemoveRoleOpt,
) -> Result<(), Error> {
let mut layout = fetch_layout(rpc_cli, rpc_host).await?;
let mut roles = layout.roles.clone();
roles.merge(&layout.staging);
let deleted_node =
find_matching_node(roles.items().iter().map(|(id, _, _)| *id), &args.node_id)?;
layout
.staging
.merge(&roles.update_mutator(deleted_node, NodeRoleV(None)));
send_layout(rpc_cli, rpc_host, layout).await?;
println!("Role removal is staged but not yet commited.");
println!("Use `garage layout show` to view staged role changes,");
println!("and `garage layout apply` to enact staged changes.");
Ok(())
}
pub async fn cmd_show_layout(
rpc_cli: &Endpoint<SystemRpc, ()>,
rpc_host: NodeID,
) -> Result<(), Error> {
let mut layout = fetch_layout(rpc_cli, rpc_host).await?;
println!("==== CURRENT CLUSTER LAYOUT ====");
if !print_cluster_layout(&layout) {
println!("No nodes currently have a role in the cluster.");
println!("See `garage status` to view available nodes.");
}
println!();
println!("Current cluster layout version: {}", layout.version);
if print_staging_role_changes(&layout) {
layout.roles.merge(&layout.staging);
println!();
println!("==== NEW CLUSTER LAYOUT AFTER APPLYING CHANGES ====");
if !print_cluster_layout(&layout) {
println!("No nodes have a role in the new layout.");
}
println!();
// this will print the stats of what partitions
// will move around when we apply
if layout.calculate_partition_assignation() {
println!("To enact the staged role changes, type:");
println!();
println!(" garage layout apply --version {}", layout.version + 1);
println!();
println!(
"You can also revert all proposed changes with: garage layout revert --version {}",
layout.version + 1
);
} else {
println!("Not enough nodes have an assigned role to maintain enough copies of data.");
println!("This new layout cannot yet be applied.");
}
}
Ok(())
}
pub async fn cmd_apply_layout(
rpc_cli: &Endpoint<SystemRpc, ()>,
rpc_host: NodeID,
apply_opt: ApplyLayoutOpt,
) -> Result<(), Error> {
let mut layout = fetch_layout(rpc_cli, rpc_host).await?;
match apply_opt.version {
None => {
println!("Please pass the --version flag to ensure that you are writing the correct version of the cluster layout.");
println!("To know the correct value of the --version flag, invoke `garage layout show` and review the proposed changes.");
return Err(Error::Message("--version flag is missing".into()));
}
Some(v) => {
if v != layout.version + 1 {
return Err(Error::Message("Invalid value of --version flag".into()));
}
}
}
layout.roles.merge(&layout.staging);
if !layout.calculate_partition_assignation() {
return Err(Error::Message("Could not calculate new assignation of partitions to nodes. This can happen if there are less nodes than the desired number of copies of your data (see the replication_mode configuration parameter).".into()));
}
layout.staging.clear();
layout.staging_hash = blake2sum(&rmp_to_vec_all_named(&layout.staging).unwrap()[..]);
layout.version += 1;
send_layout(rpc_cli, rpc_host, layout).await?;
println!("New cluster layout with updated role assignation has been applied in cluster.");
println!("Data will now be moved around between nodes accordingly.");
Ok(())
}
pub async fn cmd_revert_layout(
rpc_cli: &Endpoint<SystemRpc, ()>,
rpc_host: NodeID,
revert_opt: RevertLayoutOpt,
) -> Result<(), Error> {
let mut layout = fetch_layout(rpc_cli, rpc_host).await?;
match revert_opt.version {
None => {
println!("Please pass the --version flag to ensure that you are writing the correct version of the cluster layout.");
println!("To know the correct value of the --version flag, invoke `garage layout show` and review the proposed changes.");
return Err(Error::Message("--version flag is missing".into()));
}
Some(v) => {
if v != layout.version + 1 {
return Err(Error::Message("Invalid value of --version flag".into()));
}
}
}
layout.staging.clear();
layout.staging_hash = blake2sum(&rmp_to_vec_all_named(&layout.staging).unwrap()[..]);
layout.version += 1;
send_layout(rpc_cli, rpc_host, layout).await?;
println!("All proposed role changes in cluster layout have been canceled.");
Ok(())
}
// --- utility ---
pub async fn fetch_layout(
rpc_cli: &Endpoint<SystemRpc, ()>,
rpc_host: NodeID,
) -> Result<ClusterLayout, Error> {
match rpc_cli
.call(&rpc_host, &SystemRpc::PullClusterLayout, PRIO_NORMAL)
.await??
{
SystemRpc::AdvertiseClusterLayout(t) => Ok(t),
resp => Err(Error::Message(format!("Invalid RPC response: {:?}", resp))),
}
}
pub async fn send_layout(
rpc_cli: &Endpoint<SystemRpc, ()>,
rpc_host: NodeID,
layout: ClusterLayout,
) -> Result<(), Error> {
rpc_cli
.call(
&rpc_host,
&SystemRpc::AdvertiseClusterLayout(layout),
PRIO_NORMAL,
)
.await??;
Ok(())
}
pub fn print_cluster_layout(layout: &ClusterLayout) -> bool {
let mut table = vec!["ID\tTags\tZone\tCapacity".to_string()];
for (id, _, role) in layout.roles.items().iter() {
let role = match &role.0 {
Some(r) => r,
_ => continue,
};
let tags = role.tags.join(",");
table.push(format!(
"{:?}\t{}\t{}\t{}",
id,
tags,
role.zone,
role.capacity_string()
));
}
if table.len() == 1 {
false
} else {
format_table(table);
true
}
}
pub fn print_staging_role_changes(layout: &ClusterLayout) -> bool {
if !layout.staging.items().is_empty() {
println!();
println!("==== STAGED ROLE CHANGES ====");
let mut table = vec!["ID\tTags\tZone\tCapacity".to_string()];
for (id, _, role) in layout.staging.items().iter() {
if let Some(role) = &role.0 {
let tags = role.tags.join(",");
table.push(format!(
"{:?}\t{}\t{}\t{}",
id,
tags,
role.zone,
role.capacity_string()
));
} else {
table.push(format!("{:?}\tREMOVED", id));
}
}
format_table(table);
true
} else {
false
}
}

View file

@ -1,9 +1,11 @@
pub(crate) mod cmd;
pub(crate) mod init;
pub(crate) mod layout;
pub(crate) mod structs;
pub(crate) mod util;
pub(crate) use cmd::*;
pub(crate) use init::*;
pub(crate) use layout::*;
pub(crate) use structs::*;
pub(crate) use util::*;

View file

@ -8,23 +8,23 @@ pub enum Command {
#[structopt(name = "server")]
Server,
/// Print identifier (public key) of this Garage node
#[structopt(name = "node-id")]
NodeId(NodeIdOpt),
/// Get network status
#[structopt(name = "status")]
Status,
/// Garage node operations
/// Operations on individual Garage nodes
#[structopt(name = "node")]
Node(NodeOperation),
/// Bucket operations
/// Operations on the assignation of node roles in the cluster layout
#[structopt(name = "layout")]
Layout(LayoutOperation),
/// Operations on buckets
#[structopt(name = "bucket")]
Bucket(BucketOperation),
/// Key operations
/// Operations on S3 access keys
#[structopt(name = "key")]
Key(KeyOperation),
@ -39,17 +39,13 @@ pub enum Command {
#[derive(StructOpt, Debug)]
pub enum NodeOperation {
/// Print identifier (public key) of this Garage node
#[structopt(name = "id")]
NodeId(NodeIdOpt),
/// Connect to Garage node that is currently isolated from the system
#[structopt(name = "connect")]
Connect(ConnectNodeOpt),
/// Configure Garage node
#[structopt(name = "configure")]
Configure(ConfigureNodeOpt),
/// Remove Garage node from cluster
#[structopt(name = "remove")]
Remove(RemoveNodeOpt),
}
#[derive(StructOpt, Debug)]
@ -67,8 +63,31 @@ pub struct ConnectNodeOpt {
}
#[derive(StructOpt, Debug)]
pub struct ConfigureNodeOpt {
/// Node to configure (prefix of hexadecimal node id)
pub enum LayoutOperation {
/// Assign role to Garage node
#[structopt(name = "assign")]
Assign(AssignRoleOpt),
/// Remove role from Garage cluster node
#[structopt(name = "remove")]
Remove(RemoveRoleOpt),
/// Show roles currently assigned to nodes and changes staged for commit
#[structopt(name = "show")]
Show,
/// Apply staged changes to cluster layout
#[structopt(name = "apply")]
Apply(ApplyLayoutOpt),
/// Revert staged changes to cluster layout
#[structopt(name = "revert")]
Revert(RevertLayoutOpt),
}
#[derive(StructOpt, Debug)]
pub struct AssignRoleOpt {
/// Node to which to assign role (prefix of hexadecimal node id)
pub(crate) node_id: String,
/// Location (zone or datacenter) of the node
@ -83,9 +102,9 @@ pub struct ConfigureNodeOpt {
#[structopt(short = "g", long = "gateway")]
pub(crate) gateway: bool,
/// Optional node tag
/// Optional tags to add to node
#[structopt(short = "t", long = "tag")]
pub(crate) tag: Option<String>,
pub(crate) tags: Vec<String>,
/// Replaced node(s): list of node IDs that will be removed from the current cluster
#[structopt(long = "replace")]
@ -93,13 +112,24 @@ pub struct ConfigureNodeOpt {
}
#[derive(StructOpt, Debug)]
pub struct RemoveNodeOpt {
/// Node to configure (prefix of hexadecimal node id)
pub struct RemoveRoleOpt {
/// Node whose role to remove (prefix of hexadecimal node id)
pub(crate) node_id: String,
}
/// If this flag is not given, the node won't be removed
#[structopt(long = "yes")]
pub(crate) yes: bool,
#[derive(StructOpt, Debug)]
pub struct ApplyLayoutOpt {
/// Version number of new configuration: this command will fail if
/// it is not exactly 1 + the previous configuration's version
#[structopt(long = "version")]
pub(crate) version: Option<u64>,
}
#[derive(StructOpt, Debug)]
pub struct RevertLayoutOpt {
/// Version number of old configuration to which to revert
#[structopt(long = "version")]
pub(crate) version: Option<u64>,
}
#[derive(Serialize, Deserialize, StructOpt, Debug)]

View file

@ -70,7 +70,9 @@ async fn main() {
server::run_server(opt.config_file).await
}
Command::NodeId(node_id_opt) => node_id_command(opt.config_file, node_id_opt.quiet),
Command::Node(NodeOperation::NodeId(node_id_opt)) => {
node_id_command(opt.config_file, node_id_opt.quiet)
}
_ => cli_command(opt).await,
};

View file

@ -1,11 +1,12 @@
[package]
name = "garage_model"
version = "0.4.0"
version = "0.5.0"
authors = ["Alex Auvolat <alex@adnab.me>"]
edition = "2018"
license = "AGPL-3.0"
description = "Core data model for the Garage object store"
repository = "https://git.deuxfleurs.fr/Deuxfleurs/garage"
readme = "../../README.md"
[lib]
path = "lib.rs"
@ -13,9 +14,9 @@ path = "lib.rs"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
garage_rpc = { version = "0.4.0", path = "../rpc" }
garage_table = { version = "0.4.0", path = "../table" }
garage_util = { version = "0.4.0", path = "../util" }
garage_rpc = { version = "0.5.0", path = "../rpc" }
garage_table = { version = "0.5.0", path = "../table" }
garage_util = { version = "0.5.0", path = "../util" }
async-trait = "0.1.7"
arc-swap = "1.0"

View file

@ -1,11 +1,12 @@
[package]
name = "garage_rpc"
version = "0.4.0"
version = "0.5.0"
authors = ["Alex Auvolat <alex@adnab.me>"]
edition = "2018"
license = "AGPL-3.0"
description = "Cluster membership management and RPC protocol for the Garage object store"
repository = "https://git.deuxfleurs.fr/Deuxfleurs/garage"
readme = "../../README.md"
[lib]
path = "lib.rs"
@ -13,7 +14,7 @@ path = "lib.rs"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
garage_util = { version = "0.4.0", path = "../util" }
garage_util = { version = "0.5.0", path = "../util" }
arc-swap = "1.0"
bytes = "1.0"
@ -26,6 +27,7 @@ sodiumoxide = { version = "0.2.5-0", package = "kuska-sodiumoxide" }
async-trait = "0.1.7"
rmp-serde = "0.15"
serde = { version = "1.0", default-features = false, features = ["derive", "rc"] }
serde_bytes = "0.11"
serde_json = "1.0"
futures = "0.3"

579
src/rpc/layout.rs Normal file
View file

@ -0,0 +1,579 @@
use std::cmp::Ordering;
use std::collections::{HashMap, HashSet};
use serde::{Deserialize, Serialize};
use garage_util::crdt::{AutoCrdt, Crdt, LwwMap};
use garage_util::data::*;
use crate::ring::*;
/// The layout of the cluster, i.e. the list of roles
/// which are assigned to each cluster node
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ClusterLayout {
pub version: u64,
pub replication_factor: usize,
pub roles: LwwMap<Uuid, NodeRoleV>,
/// node_id_vec: a vector of node IDs with a role assigned
/// in the system (this includes gateway nodes).
/// The order here is different than the vec stored by `roles`, because:
/// 1. non-gateway nodes are first so that they have lower numbers
/// 2. nodes that don't have a role are excluded (but they need to
/// stay in the CRDT as tombstones)
pub node_id_vec: Vec<Uuid>,
/// the assignation of data partitions to node, the values
/// are indices in node_id_vec
#[serde(with = "serde_bytes")]
pub ring_assignation_data: Vec<CompactNodeType>,
/// Role changes which are staged for the next version of the layout
pub staging: LwwMap<Uuid, NodeRoleV>,
pub staging_hash: Hash,
}
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)]
pub struct NodeRoleV(pub Option<NodeRole>);
impl AutoCrdt for NodeRoleV {
const WARN_IF_DIFFERENT: bool = true;
}
/// The user-assigned roles of cluster nodes
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)]
pub struct NodeRole {
/// Datacenter at which this entry belong. This information might be used to perform a better
/// geodistribution
pub zone: String,
/// The (relative) capacity of the node
/// If this is set to None, the node does not participate in storing data for the system
/// and is only active as an API gateway to other nodes
pub capacity: Option<u32>,
/// A set of tags to recognize the node
pub tags: Vec<String>,
}
impl NodeRole {
pub fn capacity_string(&self) -> String {
match self.capacity {
Some(c) => format!("{}", c),
None => "gateway".to_string(),
}
}
}
impl ClusterLayout {
pub fn new(replication_factor: usize) -> Self {
let empty_lwwmap = LwwMap::new();
let empty_lwwmap_hash = blake2sum(&rmp_to_vec_all_named(&empty_lwwmap).unwrap()[..]);
ClusterLayout {
version: 0,
replication_factor,
roles: LwwMap::new(),
node_id_vec: Vec::new(),
ring_assignation_data: Vec::new(),
staging: empty_lwwmap,
staging_hash: empty_lwwmap_hash,
}
}
pub fn merge(&mut self, other: &ClusterLayout) -> bool {
match other.version.cmp(&self.version) {
Ordering::Greater => {
*self = other.clone();
true
}
Ordering::Equal => {
self.staging.merge(&other.staging);
let new_staging_hash = blake2sum(&rmp_to_vec_all_named(&self.staging).unwrap()[..]);
let changed = new_staging_hash != self.staging_hash;
self.staging_hash = new_staging_hash;
changed
}
Ordering::Less => false,
}
}
/// Returns a list of IDs of nodes that currently have
/// a role in the cluster
pub fn node_ids(&self) -> &[Uuid] {
&self.node_id_vec[..]
}
pub fn num_nodes(&self) -> usize {
self.node_id_vec.len()
}
/// Returns the role of a node in the layout
pub fn node_role(&self, node: &Uuid) -> Option<&NodeRole> {
match self.roles.get(node) {
Some(NodeRoleV(Some(v))) => Some(v),
_ => None,
}
}
/// Check a cluster layout for internal consistency
/// returns true if consistent, false if error
pub fn check(&self) -> bool {
// Check that the hash of the staging data is correct
let staging_hash = blake2sum(&rmp_to_vec_all_named(&self.staging).unwrap()[..]);
if staging_hash != self.staging_hash {
return false;
}
// Check that node_id_vec contains the correct list of nodes
let mut expected_nodes = self
.roles
.items()
.iter()
.filter(|(_, _, v)| v.0.is_some())
.map(|(id, _, _)| *id)
.collect::<Vec<_>>();
expected_nodes.sort();
let mut node_id_vec = self.node_id_vec.clone();
node_id_vec.sort();
if expected_nodes != node_id_vec {
return false;
}
// Check that the assignation data has the correct length
if self.ring_assignation_data.len() != (1 << PARTITION_BITS) * self.replication_factor {
return false;
}
// Check that the assigned nodes are correct identifiers
// of nodes that are assigned a role
// and that role is not the role of a gateway nodes
for x in self.ring_assignation_data.iter() {
if *x as usize >= self.node_id_vec.len() {
return false;
}
let node = self.node_id_vec[*x as usize];
match self.roles.get(&node) {
Some(NodeRoleV(Some(x))) if x.capacity.is_some() => (),
_ => return false,
}
}
true
}
/// Calculate an assignation of partitions to nodes
pub fn calculate_partition_assignation(&mut self) -> bool {
let (configured_nodes, zones) = self.configured_nodes_and_zones();
let n_zones = zones.len();
println!("Calculating updated partition assignation, this may take some time...");
println!();
let old_partitions = self.parse_assignation_data();
let mut partitions = old_partitions.clone();
for part in partitions.iter_mut() {
part.nodes
.retain(|(_, info)| info.map(|x| x.capacity.is_some()).unwrap_or(false));
}
// When nodes are removed, or when bootstraping an assignation from
// scratch for a new cluster, the old partitions will have holes (or be empty).
// Here we add more nodes to make a complete (sub-optimal) assignation,
// using an initial partition assignation that is calculated using the multi-dc maglev trick
match self.initial_partition_assignation() {
Some(initial_partitions) => {
for (part, ipart) in partitions.iter_mut().zip(initial_partitions.iter()) {
for (id, info) in ipart.nodes.iter() {
if part.nodes.len() < self.replication_factor {
part.add(part.nodes.len() + 1, n_zones, id, info.unwrap());
}
}
assert!(part.nodes.len() == self.replication_factor);
}
}
None => {
return false;
}
}
// Calculate how many partitions each node should ideally store,
// and how many partitions they are storing with the current assignation
// This defines our target for which we will optimize in the following loop.
let total_capacity = configured_nodes
.iter()
.map(|(_, info)| info.capacity.unwrap_or(0))
.sum::<u32>() as usize;
let total_partitions = self.replication_factor * (1 << PARTITION_BITS);
let target_partitions_per_node = configured_nodes
.iter()
.map(|(id, info)| {
(
*id,
info.capacity.unwrap_or(0) as usize * total_partitions / total_capacity,
)
})
.collect::<HashMap<&Uuid, usize>>();
let mut partitions_per_node = self.partitions_per_node(&partitions[..]);
println!("Target number of partitions per node:");
for (node, npart) in target_partitions_per_node.iter() {
println!("{:?}\t{}", node, npart);
}
println!();
// Shuffle partitions between nodes so that nodes will reach (or better approach)
// their target number of stored partitions
loop {
let mut option = None;
for (i, part) in partitions.iter_mut().enumerate() {
for (irm, (idrm, _)) in part.nodes.iter().enumerate() {
let suprm = partitions_per_node.get(*idrm).cloned().unwrap_or(0) as i32
- target_partitions_per_node.get(*idrm).cloned().unwrap_or(0) as i32;
for (idadd, infoadd) in configured_nodes.iter() {
// skip replacing a node by itself
// and skip replacing by gateway nodes
if idadd == idrm || infoadd.capacity.is_none() {
continue;
}
let supadd = partitions_per_node.get(*idadd).cloned().unwrap_or(0) as i32
- target_partitions_per_node.get(*idadd).cloned().unwrap_or(0) as i32;
// We want to try replacing node idrm by node idadd
// if that brings us close to our goal.
let square = |i: i32| i * i;
let oldcost = square(suprm) + square(supadd);
let newcost = square(suprm - 1) + square(supadd + 1);
if newcost >= oldcost {
// not closer to our goal
continue;
}
let gain = oldcost - newcost;
let mut newpart = part.clone();
newpart.nodes.remove(irm);
if !newpart.add(newpart.nodes.len() + 1, n_zones, idadd, infoadd) {
continue;
}
assert!(newpart.nodes.len() == self.replication_factor);
if !old_partitions[i]
.is_valid_transition_to(&newpart, self.replication_factor)
{
continue;
}
if option
.as_ref()
.map(|(old_gain, _, _, _, _)| gain > *old_gain)
.unwrap_or(true)
{
option = Some((gain, i, idadd, idrm, newpart));
}
}
}
}
if let Some((_gain, i, idadd, idrm, newpart)) = option {
*partitions_per_node.entry(idadd).or_insert(0) += 1;
*partitions_per_node.get_mut(idrm).unwrap() -= 1;
partitions[i] = newpart;
} else {
break;
}
}
// Check we completed the assignation correctly
// (this is a set of checks for the algorithm's consistency)
assert!(partitions.len() == (1 << PARTITION_BITS));
assert!(partitions
.iter()
.all(|p| p.nodes.len() == self.replication_factor));
let new_partitions_per_node = self.partitions_per_node(&partitions[..]);
assert!(new_partitions_per_node == partitions_per_node);
// Show statistics
println!("New number of partitions per node:");
for (node, npart) in partitions_per_node.iter() {
println!("{:?}\t{}", node, npart);
}
println!();
let mut diffcount = HashMap::new();
for (oldpart, newpart) in old_partitions.iter().zip(partitions.iter()) {
let nminus = oldpart.txtplus(newpart);
let nplus = newpart.txtplus(oldpart);
if nminus != "[...]" || nplus != "[...]" {
let tup = (nminus, nplus);
*diffcount.entry(tup).or_insert(0) += 1;
}
}
if diffcount.is_empty() {
println!("No data will be moved between nodes.");
} else {
let mut diffcount = diffcount.into_iter().collect::<Vec<_>>();
diffcount.sort();
println!("Number of partitions that move:");
for ((nminus, nplus), npart) in diffcount {
println!("\t{}\t{} -> {}", npart, nminus, nplus);
}
}
println!();
// Calculate and save new assignation data
let (nodes, assignation_data) =
self.compute_assignation_data(&configured_nodes[..], &partitions[..]);
self.node_id_vec = nodes;
self.ring_assignation_data = assignation_data;
true
}
fn initial_partition_assignation(&self) -> Option<Vec<PartitionAss<'_>>> {
let (configured_nodes, zones) = self.configured_nodes_and_zones();
let n_zones = zones.len();
// Create a vector of partition indices (0 to 2**PARTITION_BITS-1)
let partitions_idx = (0usize..(1usize << PARTITION_BITS)).collect::<Vec<_>>();
// Prepare ring
let mut partitions: Vec<PartitionAss> = partitions_idx
.iter()
.map(|_i| PartitionAss::new())
.collect::<Vec<_>>();
// Create MagLev priority queues for each node
let mut queues = configured_nodes
.iter()
.filter(|(_id, info)| info.capacity.is_some())
.map(|(node_id, node_info)| {
let mut parts = partitions_idx
.iter()
.map(|i| {
let part_data =
[&u16::to_be_bytes(*i as u16)[..], node_id.as_slice()].concat();
(*i, fasthash(&part_data[..]))
})
.collect::<Vec<_>>();
parts.sort_by_key(|(_i, h)| *h);
let parts_i = parts.iter().map(|(i, _h)| *i).collect::<Vec<_>>();
(node_id, node_info, parts_i, 0)
})
.collect::<Vec<_>>();
let max_capacity = configured_nodes
.iter()
.filter_map(|(_, node_info)| node_info.capacity)
.fold(0, std::cmp::max);
// Fill up ring
for rep in 0..self.replication_factor {
queues.sort_by_key(|(ni, _np, _q, _p)| {
let queue_data = [&u16::to_be_bytes(rep as u16)[..], ni.as_slice()].concat();
fasthash(&queue_data[..])
});
for (_, _, _, pos) in queues.iter_mut() {
*pos = 0;
}
let mut remaining = partitions_idx.len();
while remaining > 0 {
let remaining0 = remaining;
for i_round in 0..max_capacity {
for (node_id, node_info, q, pos) in queues.iter_mut() {
if i_round >= node_info.capacity.unwrap() {
continue;
}
for (pos2, &qv) in q.iter().enumerate().skip(*pos) {
if partitions[qv].add(rep + 1, n_zones, node_id, node_info) {
remaining -= 1;
*pos = pos2 + 1;
break;
}
}
}
}
if remaining == remaining0 {
// No progress made, exit
return None;
}
}
}
Some(partitions)
}
fn configured_nodes_and_zones(&self) -> (Vec<(&Uuid, &NodeRole)>, HashSet<&str>) {
let configured_nodes = self
.roles
.items()
.iter()
.filter(|(_id, _, info)| info.0.is_some())
.map(|(id, _, info)| (id, info.0.as_ref().unwrap()))
.collect::<Vec<(&Uuid, &NodeRole)>>();
let zones = configured_nodes
.iter()
.filter(|(_id, info)| info.capacity.is_some())
.map(|(_id, info)| info.zone.as_str())
.collect::<HashSet<&str>>();
(configured_nodes, zones)
}
fn compute_assignation_data<'a>(
&self,
configured_nodes: &[(&'a Uuid, &'a NodeRole)],
partitions: &[PartitionAss<'a>],
) -> (Vec<Uuid>, Vec<CompactNodeType>) {
assert!(partitions.len() == (1 << PARTITION_BITS));
// Make a canonical order for nodes
let mut nodes = configured_nodes
.iter()
.filter(|(_id, info)| info.capacity.is_some())
.map(|(id, _)| **id)
.collect::<Vec<_>>();
let nodes_rev = nodes
.iter()
.enumerate()
.map(|(i, id)| (*id, i as CompactNodeType))
.collect::<HashMap<Uuid, CompactNodeType>>();
let mut assignation_data = vec![];
for partition in partitions.iter() {
assert!(partition.nodes.len() == self.replication_factor);
for (id, _) in partition.nodes.iter() {
assignation_data.push(*nodes_rev.get(id).unwrap());
}
}
nodes.extend(
configured_nodes
.iter()
.filter(|(_id, info)| info.capacity.is_none())
.map(|(id, _)| **id),
);
(nodes, assignation_data)
}
fn parse_assignation_data(&self) -> Vec<PartitionAss<'_>> {
if self.ring_assignation_data.len() == self.replication_factor * (1 << PARTITION_BITS) {
// If the previous assignation data is correct, use that
let mut partitions = vec![];
for i in 0..(1 << PARTITION_BITS) {
let mut part = PartitionAss::new();
for node_i in self.ring_assignation_data
[i * self.replication_factor..(i + 1) * self.replication_factor]
.iter()
{
let node_id = &self.node_id_vec[*node_i as usize];
if let Some(NodeRoleV(Some(info))) = self.roles.get(node_id) {
part.nodes.push((node_id, Some(info)));
} else {
part.nodes.push((node_id, None));
}
}
partitions.push(part);
}
partitions
} else {
// Otherwise start fresh
(0..(1 << PARTITION_BITS))
.map(|_| PartitionAss::new())
.collect()
}
}
fn partitions_per_node<'a>(&self, partitions: &[PartitionAss<'a>]) -> HashMap<&'a Uuid, usize> {
let mut partitions_per_node = HashMap::<&Uuid, usize>::new();
for p in partitions.iter() {
for (id, _) in p.nodes.iter() {
*partitions_per_node.entry(*id).or_insert(0) += 1;
}
}
partitions_per_node
}
}
// ---- Internal structs for partition assignation in layout ----
#[derive(Clone)]
struct PartitionAss<'a> {
nodes: Vec<(&'a Uuid, Option<&'a NodeRole>)>,
}
impl<'a> PartitionAss<'a> {
fn new() -> Self {
Self { nodes: Vec::new() }
}
fn nplus(&self, other: &PartitionAss<'a>) -> usize {
self.nodes
.iter()
.filter(|x| !other.nodes.contains(x))
.count()
}
fn txtplus(&self, other: &PartitionAss<'a>) -> String {
let mut nodes = self
.nodes
.iter()
.filter(|x| !other.nodes.contains(x))
.map(|x| format!("{:?}", x.0))
.collect::<Vec<_>>();
nodes.sort();
if self.nodes.iter().any(|x| other.nodes.contains(x)) {
nodes.push("...".into());
}
format!("[{}]", nodes.join(" "))
}
fn is_valid_transition_to(&self, other: &PartitionAss<'a>, replication_factor: usize) -> bool {
let min_keep_nodes_per_part = (replication_factor + 1) / 2;
let n_removed = self.nplus(other);
if self.nodes.len() <= min_keep_nodes_per_part {
n_removed == 0
} else {
n_removed <= self.nodes.len() - min_keep_nodes_per_part
}
}
fn add(
&mut self,
target_len: usize,
n_zones: usize,
node: &'a Uuid,
role: &'a NodeRole,
) -> bool {
if self.nodes.len() != target_len - 1 {
return false;
}
let p_zns = self
.nodes
.iter()
.map(|(_id, info)| info.unwrap().zone.as_str())
.collect::<HashSet<&str>>();
if (p_zns.len() < n_zones && !p_zns.contains(&role.zone.as_str()))
|| (p_zns.len() == n_zones && !self.nodes.iter().any(|(id, _)| *id == node))
{
self.nodes.push((node, Some(role)));
true
} else {
false
}
}
}

View file

@ -5,6 +5,7 @@ extern crate log;
mod consul;
pub mod layout;
pub mod ring;
pub mod system;

View file

@ -1,12 +1,11 @@
//! Module containing types related to computing nodes which should receive a copy of data blocks
//! and metadata
use std::collections::{HashMap, HashSet};
use std::convert::TryInto;
use serde::{Deserialize, Serialize};
use garage_util::data::*;
use crate::layout::ClusterLayout;
/// A partition id, which is stored on 16 bits
/// i.e. we have up to 2**16 partitions.
/// (in practice we have exactly 2**PARTITION_BITS partitions)
@ -22,47 +21,6 @@ pub const PARTITION_BITS: usize = 8;
const PARTITION_MASK_U16: u16 = ((1 << PARTITION_BITS) - 1) << (16 - PARTITION_BITS);
/// The user-defined configuration of the cluster's nodes
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct NetworkConfig {
/// Map of each node's id to it's configuration
pub members: HashMap<Uuid, NetworkConfigEntry>,
/// Version of this config
pub version: u64,
}
impl NetworkConfig {
pub(crate) fn new() -> Self {
Self {
members: HashMap::new(),
version: 0,
}
}
}
/// The overall configuration of one (possibly remote) node
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct NetworkConfigEntry {
/// Datacenter at which this entry belong. This infromation might be used to perform a better
/// geodistribution
pub zone: String,
/// The (relative) capacity of the node
/// If this is set to None, the node does not participate in storing data for the system
/// and is only active as an API gateway to other nodes
pub capacity: Option<u32>,
/// A tag to recognize the entry, not used for other things than display
pub tag: String,
}
impl NetworkConfigEntry {
pub fn capacity_string(&self) -> String {
match self.capacity {
Some(c) => format!("{}", c),
None => "gateway".to_string(),
}
}
}
/// A ring distributing fairly objects to nodes
#[derive(Clone)]
pub struct Ring {
@ -70,7 +28,7 @@ pub struct Ring {
pub replication_factor: usize,
/// The network configuration used to generate this ring
pub config: NetworkConfig,
pub layout: ClusterLayout,
// Internal order of nodes used to make a more compact representation of the ring
nodes: Vec<Uuid>,
@ -81,7 +39,7 @@ pub struct Ring {
// Type to store compactly the id of a node in the system
// Change this to u16 the day we want to have more than 256 nodes in a cluster
type CompactNodeType = u8;
pub type CompactNodeType = u8;
// The maximum number of times an object might get replicated
// This must be at least 3 because Garage supports 3-way replication
@ -102,132 +60,26 @@ struct RingEntry {
}
impl Ring {
// TODO this function MUST be refactored, it's 100 lines long, with a 50 lines loop, going up to 6
// levels of imbrication. It is basically impossible to test, maintain, or understand for an
// outsider.
pub(crate) fn new(config: NetworkConfig, replication_factor: usize) -> Self {
// Create a vector of partition indices (0 to 2**PARTITION_BITS-1)
let partitions_idx = (0usize..(1usize << PARTITION_BITS)).collect::<Vec<_>>();
let zones = config
.members
.iter()
.filter(|(_id, info)| info.capacity.is_some())
.map(|(_id, info)| info.zone.as_str())
.collect::<HashSet<&str>>();
let n_zones = zones.len();
// Prepare ring
let mut partitions: Vec<Vec<(&Uuid, &NetworkConfigEntry)>> = partitions_idx
.iter()
.map(|_i| Vec::new())
.collect::<Vec<_>>();
// Create MagLev priority queues for each node
let mut queues = config
.members
.iter()
.filter(|(_id, info)| info.capacity.is_some())
.map(|(node_id, node_info)| {
let mut parts = partitions_idx
.iter()
.map(|i| {
let part_data =
[&u16::to_be_bytes(*i as u16)[..], node_id.as_slice()].concat();
(*i, fasthash(&part_data[..]))
})
.collect::<Vec<_>>();
parts.sort_by_key(|(_i, h)| *h);
let parts_i = parts.iter().map(|(i, _h)| *i).collect::<Vec<_>>();
(node_id, node_info, parts_i, 0)
})
.collect::<Vec<_>>();
let max_capacity = config
.members
.iter()
.filter_map(|(_, node_info)| node_info.capacity)
.fold(0, std::cmp::max);
assert!(replication_factor <= MAX_REPLICATION);
// Fill up ring
for rep in 0..replication_factor {
queues.sort_by_key(|(ni, _np, _q, _p)| {
let queue_data = [&u16::to_be_bytes(rep as u16)[..], ni.as_slice()].concat();
fasthash(&queue_data[..])
});
for (_, _, _, pos) in queues.iter_mut() {
*pos = 0;
}
let mut remaining = partitions_idx.len();
while remaining > 0 {
let remaining0 = remaining;
for i_round in 0..max_capacity {
for (node_id, node_info, q, pos) in queues.iter_mut() {
if i_round >= node_info.capacity.unwrap() {
continue;
}
for (pos2, &qv) in q.iter().enumerate().skip(*pos) {
if partitions[qv].len() != rep {
continue;
}
let p_zns = partitions[qv]
.iter()
.map(|(_id, info)| info.zone.as_str())
.collect::<HashSet<&str>>();
if (p_zns.len() < n_zones && !p_zns.contains(&node_info.zone.as_str()))
|| (p_zns.len() == n_zones
&& !partitions[qv].iter().any(|(id, _i)| id == node_id))
{
partitions[qv].push((node_id, node_info));
remaining -= 1;
*pos = pos2 + 1;
break;
}
}
}
}
if remaining == remaining0 {
// No progress made, exit
warn!("Could not build ring, not enough nodes configured.");
return Self {
replication_factor,
config,
nodes: vec![],
ring: vec![],
};
}
}
pub(crate) fn new(layout: ClusterLayout, replication_factor: usize) -> Self {
if replication_factor != layout.replication_factor {
warn!("Could not build ring: replication factor does not match between local configuration and network role assignation.");
return Self::empty(layout, replication_factor);
}
// Make a canonical order for nodes
let nodes = config
.members
.iter()
.filter(|(_id, info)| info.capacity.is_some())
.map(|(id, _)| *id)
.collect::<Vec<_>>();
let nodes_rev = nodes
.iter()
.enumerate()
.map(|(i, id)| (*id, i as CompactNodeType))
.collect::<HashMap<Uuid, CompactNodeType>>();
if layout.ring_assignation_data.len() != replication_factor * (1 << PARTITION_BITS) {
warn!("Could not build ring: network role assignation data has invalid length");
return Self::empty(layout, replication_factor);
}
let ring = partitions
.iter()
.enumerate()
.map(|(i, nodes)| {
let nodes = layout.node_id_vec.clone();
let ring = (0..(1 << PARTITION_BITS))
.map(|i| {
let top = (i as u16) << (16 - PARTITION_BITS);
let nodes = nodes
.iter()
.map(|(id, _info)| *nodes_rev.get(id).unwrap())
.collect::<Vec<CompactNodeType>>();
assert!(nodes.len() == replication_factor);
let mut nodes_buf = [0u8; MAX_REPLICATION];
nodes_buf[..replication_factor].copy_from_slice(&nodes[..]);
nodes_buf[..replication_factor].copy_from_slice(
&layout.ring_assignation_data
[replication_factor * i..replication_factor * (i + 1)],
);
RingEntry {
hash_prefix: top,
nodes_buf,
@ -237,12 +89,21 @@ impl Ring {
Self {
replication_factor,
config,
layout,
nodes,
ring,
}
}
fn empty(layout: ClusterLayout, replication_factor: usize) -> Self {
Self {
replication_factor,
layout,
nodes: vec![],
ring: vec![],
}
}
/// Get the partition in which data would fall on
pub fn partition_of(&self, position: &Hash) -> Partition {
let top = u16::from_be_bytes(position.as_slice()[0..2].try_into().unwrap());

View file

@ -225,7 +225,7 @@ impl RpcHelper {
// Retrieve some status variables that we will use to sort requests
let peer_list = self.0.fullmesh.get_peer_list();
let ring: Arc<Ring> = self.0.ring.borrow().clone();
let our_zone = match ring.config.members.get(&self.0.our_node_id) {
let our_zone = match ring.layout.node_role(&self.0.our_node_id) {
Some(pc) => &pc.zone,
None => "",
};
@ -238,7 +238,7 @@ impl RpcHelper {
// and within a same zone we priorize nodes with the lowest latency.
let mut requests = requests
.map(|(to, fut)| {
let peer_zone = match ring.config.members.get(&to) {
let peer_zone = match ring.layout.node_role(&to) {
Some(pc) => &pc.zone,
None => "",
};

View file

@ -23,12 +23,13 @@ use netapp::{NetApp, NetworkKey, NodeID, NodeKey};
use garage_util::background::BackgroundRunner;
use garage_util::config::Config;
use garage_util::data::Uuid;
use garage_util::data::*;
use garage_util::error::*;
use garage_util::persister::Persister;
use garage_util::time::*;
use crate::consul::*;
use crate::layout::*;
use crate::ring::*;
use crate::rpc_helper::*;
@ -48,13 +49,13 @@ pub enum SystemRpc {
Ok,
/// Request to connect to a specific node (in <pubkey>@<host>:<port> format)
Connect(String),
/// Ask other node its config. Answered with AdvertiseConfig
PullConfig,
/// Ask other node its cluster layout. Answered with AdvertiseClusterLayout
PullClusterLayout,
/// Advertise Garage status. Answered with another AdvertiseStatus.
/// Exchanged with every node on a regular basis.
AdvertiseStatus(NodeStatus),
/// Advertisement of nodes config. Sent spontanously or in response to PullConfig
AdvertiseConfig(NetworkConfig),
/// Advertisement of cluster layout. Sent spontanously or in response to PullClusterLayout
AdvertiseClusterLayout(ClusterLayout),
/// Get known nodes states
GetKnownNodes,
/// Return known nodes
@ -70,7 +71,7 @@ pub struct System {
/// The id of this node
pub id: Uuid,
persist_config: Persister<NetworkConfig>,
persist_cluster_layout: Persister<ClusterLayout>,
persist_peer_list: Persister<Vec<(Uuid, SocketAddr)>>,
local_status: ArcSwap<NodeStatus>,
@ -103,8 +104,10 @@ pub struct NodeStatus {
pub hostname: String,
/// Replication factor configured on the node
pub replication_factor: usize,
/// Configuration version
pub config_version: u64,
/// Cluster layout version
pub cluster_layout_version: u64,
/// Hash of cluster layout staging data
pub cluster_layout_staging_hash: Hash,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
@ -187,17 +190,17 @@ impl System {
gen_node_key(&config.metadata_dir).expect("Unable to read or generate node ID");
info!("Node public key: {}", hex::encode(&node_key.public_key()));
let persist_config = Persister::new(&config.metadata_dir, "network_config");
let persist_cluster_layout = Persister::new(&config.metadata_dir, "cluster_layout");
let persist_peer_list = Persister::new(&config.metadata_dir, "peer_list");
let net_config = match persist_config.load() {
let cluster_layout = match persist_cluster_layout.load() {
Ok(x) => x,
Err(e) => {
info!(
"No valid previous network configuration stored ({}), starting fresh.",
"No valid previous cluster layout stored ({}), starting fresh.",
e
);
NetworkConfig::new()
ClusterLayout::new(replication_factor)
}
};
@ -206,10 +209,11 @@ impl System {
.into_string()
.unwrap_or_else(|_| "<invalid utf-8>".to_string()),
replication_factor,
config_version: net_config.version,
cluster_layout_version: cluster_layout.version,
cluster_layout_staging_hash: cluster_layout.staging_hash,
};
let ring = Ring::new(net_config, replication_factor);
let ring = Ring::new(cluster_layout, replication_factor);
let (update_ring, ring) = watch::channel(Arc::new(ring));
if let Some(addr) = config.rpc_public_addr {
@ -229,7 +233,7 @@ impl System {
let sys = Arc::new(System {
id: netapp.id.into(),
persist_config,
persist_cluster_layout,
persist_peer_list,
local_status: ArcSwap::new(Arc::new(local_status)),
node_status: RwLock::new(HashMap::new()),
@ -292,12 +296,12 @@ impl System {
}
/// Save network configuration to disc
async fn save_network_config(self: Arc<Self>) -> Result<(), Error> {
async fn save_cluster_layout(self: Arc<Self>) -> Result<(), Error> {
let ring: Arc<Ring> = self.ring.borrow().clone();
self.persist_config
.save_async(&ring.config)
self.persist_cluster_layout
.save_async(&ring.layout)
.await
.expect("Cannot save current cluster configuration");
.expect("Cannot save current cluster layout");
Ok(())
}
@ -305,7 +309,8 @@ impl System {
let mut new_si: NodeStatus = self.local_status.load().as_ref().clone();
let ring = self.ring.borrow();
new_si.config_version = ring.config.version;
new_si.cluster_layout_version = ring.layout.version;
new_si.cluster_layout_staging_hash = ring.layout.staging_hash;
self.local_status.swap(Arc::new(new_si));
}
@ -337,9 +342,9 @@ impl System {
)));
}
fn handle_pull_config(&self) -> SystemRpc {
fn handle_pull_cluster_layout(&self) -> SystemRpc {
let ring = self.ring.borrow().clone();
SystemRpc::AdvertiseConfig(ring.config.clone())
SystemRpc::AdvertiseClusterLayout(ring.layout.clone())
}
fn handle_get_known_nodes(&self) -> SystemRpc {
@ -360,7 +365,8 @@ impl System {
.unwrap_or(NodeStatus {
hostname: "?".to_string(),
replication_factor: 0,
config_version: 0,
cluster_layout_version: 0,
cluster_layout_staging_hash: Hash::from([0u8; 32]),
}),
})
.collect::<Vec<_>>();
@ -381,10 +387,12 @@ impl System {
std::process::exit(1);
}
if info.config_version > local_info.config_version {
if info.cluster_layout_version > local_info.cluster_layout_version
|| info.cluster_layout_staging_hash != local_info.cluster_layout_staging_hash
{
let self2 = self.clone();
self.background.spawn_cancellable(async move {
self2.pull_config(from).await;
self2.pull_cluster_layout(from).await;
Ok(())
});
}
@ -397,32 +405,39 @@ impl System {
Ok(SystemRpc::Ok)
}
async fn handle_advertise_config(
async fn handle_advertise_cluster_layout(
self: Arc<Self>,
adv: &NetworkConfig,
adv: &ClusterLayout,
) -> Result<SystemRpc, Error> {
let update_ring = self.update_ring.lock().await;
let ring: Arc<Ring> = self.ring.borrow().clone();
let mut layout: ClusterLayout = self.ring.borrow().layout.clone();
if adv.version > ring.config.version {
let ring = Ring::new(adv.clone(), self.replication_factor);
let prev_layout_check = layout.check();
if layout.merge(adv) {
if prev_layout_check && !layout.check() {
error!("New cluster layout is invalid, discarding.");
return Err(Error::Message(
"New cluster layout is invalid, discarding.".into(),
));
}
let ring = Ring::new(layout.clone(), self.replication_factor);
update_ring.send(Arc::new(ring))?;
drop(update_ring);
let self2 = self.clone();
let adv = adv.clone();
self.background.spawn_cancellable(async move {
self2
.rpc
.broadcast(
&self2.system_endpoint,
SystemRpc::AdvertiseConfig(adv),
SystemRpc::AdvertiseClusterLayout(layout),
RequestStrategy::with_priority(PRIO_HIGH),
)
.await;
Ok(())
});
self.background.spawn(self.clone().save_network_config());
self.background.spawn(self.clone().save_cluster_layout());
}
Ok(SystemRpc::Ok)
@ -456,14 +471,15 @@ impl System {
};
while !*stop_signal.borrow() {
let not_configured = self.ring.borrow().config.members.is_empty();
let not_configured = !self.ring.borrow().layout.check();
let no_peers = self.fullmesh.get_peer_list().len() < self.replication_factor;
let expected_n_nodes = self.ring.borrow().layout.num_nodes();
let bad_peers = self
.fullmesh
.get_peer_list()
.iter()
.filter(|p| p.is_up())
.count() != self.ring.borrow().config.members.len();
.count() != expected_n_nodes;
if not_configured || no_peers || bad_peers {
info!("Doing a bootstrap/discovery step (not_configured: {}, no_peers: {}, bad_peers: {})", not_configured, no_peers, bad_peers);
@ -533,18 +549,18 @@ impl System {
self.persist_peer_list.save_async(&peer_list).await
}
async fn pull_config(self: Arc<Self>, peer: Uuid) {
async fn pull_cluster_layout(self: Arc<Self>, peer: Uuid) {
let resp = self
.rpc
.call(
&self.system_endpoint,
peer,
SystemRpc::PullConfig,
SystemRpc::PullClusterLayout,
RequestStrategy::with_priority(PRIO_HIGH).with_timeout(PING_TIMEOUT),
)
.await;
if let Ok(SystemRpc::AdvertiseConfig(config)) = resp {
let _: Result<_, _> = self.handle_advertise_config(&config).await;
if let Ok(SystemRpc::AdvertiseClusterLayout(layout)) = resp {
let _: Result<_, _> = self.handle_advertise_cluster_layout(&layout).await;
}
}
}
@ -554,9 +570,11 @@ impl EndpointHandler<SystemRpc> for System {
async fn handle(self: &Arc<Self>, msg: &SystemRpc, from: NodeID) -> Result<SystemRpc, Error> {
match msg {
SystemRpc::Connect(node) => self.handle_connect(node).await,
SystemRpc::PullConfig => Ok(self.handle_pull_config()),
SystemRpc::PullClusterLayout => Ok(self.handle_pull_cluster_layout()),
SystemRpc::AdvertiseStatus(adv) => self.handle_advertise_status(from.into(), adv).await,
SystemRpc::AdvertiseConfig(adv) => self.clone().handle_advertise_config(adv).await,
SystemRpc::AdvertiseClusterLayout(adv) => {
self.clone().handle_advertise_cluster_layout(adv).await
}
SystemRpc::GetKnownNodes => Ok(self.handle_get_known_nodes()),
_ => Err(Error::BadRpc("Unexpected RPC message".to_string())),
}

View file

@ -1,11 +1,12 @@
[package]
name = "garage_table"
version = "0.4.0"
version = "0.5.0"
authors = ["Alex Auvolat <alex@adnab.me>"]
edition = "2018"
license = "AGPL-3.0"
description = "Table sharding and replication engine (DynamoDB-like) for the Garage object store"
repository = "https://git.deuxfleurs.fr/Deuxfleurs/garage"
readme = "../../README.md"
[lib]
path = "lib.rs"
@ -13,8 +14,8 @@ path = "lib.rs"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
garage_rpc = { version = "0.4.0", path = "../rpc" }
garage_util = { version = "0.4.0", path = "../util" }
garage_rpc = { version = "0.5.0", path = "../rpc" }
garage_util = { version = "0.5.0", path = "../util" }
async-trait = "0.1.7"
bytes = "1.0"

View file

@ -4,7 +4,6 @@
#[macro_use]
extern crate log;
pub mod crdt;
pub mod schema;
pub mod util;
@ -18,3 +17,7 @@ pub mod table;
pub use schema::*;
pub use table::*;
pub use util::*;
pub mod crdt {
pub use garage_util::crdt::*;
}

View file

@ -28,10 +28,10 @@ impl TableReplication for TableFullReplication {
fn write_nodes(&self, _hash: &Hash) -> Vec<Uuid> {
let ring = self.system.ring.borrow();
ring.config.members.keys().cloned().collect::<Vec<_>>()
ring.layout.node_ids().to_vec()
}
fn write_quorum(&self) -> usize {
let nmembers = self.system.ring.borrow().config.members.len();
let nmembers = self.system.ring.borrow().layout.node_ids().len();
if nmembers > self.max_faults {
nmembers - self.max_faults
} else {

View file

@ -1,11 +1,12 @@
[package]
name = "garage_util"
version = "0.4.0"
version = "0.5.0"
authors = ["Alex Auvolat <alex@adnab.me>"]
edition = "2018"
license = "AGPL-3.0"
description = "Utility crate for the Garage object store"
repository = "https://git.deuxfleurs.fr/Deuxfleurs/garage"
readme = "../../README.md"
[lib]
path = "lib.rs"

View file

@ -1,4 +1,4 @@
use garage_util::data::*;
use crate::data::*;
/// Definition of a CRDT - all CRDT Rust types implement this.
///

View file

@ -1,6 +1,8 @@
use std::cmp::Ordering;
use serde::{Deserialize, Serialize};
use garage_util::time::now_msec;
use crate::time::now_msec;
use crate::crdt::crdt::*;
@ -104,11 +106,15 @@ where
T: Clone + Crdt,
{
fn merge(&mut self, other: &Self) {
if other.ts > self.ts {
self.ts = other.ts;
self.v = other.v.clone();
} else if other.ts == self.ts {
self.v.merge(&other.v);
match other.ts.cmp(&self.ts) {
Ordering::Greater => {
self.ts = other.ts;
self.v = other.v.clone();
}
Ordering::Equal => {
self.v.merge(&other.v);
}
Ordering::Less => (),
}
}
}

View file

@ -1,6 +1,8 @@
use std::cmp::Ordering;
use serde::{Deserialize, Serialize};
use garage_util::time::now_msec;
use crate::time::now_msec;
use crate::crdt::crdt::*;
@ -135,11 +137,15 @@ where
match self.vals.binary_search_by(|(k2, _, _)| k2.cmp(k)) {
Ok(i) => {
let (_, ts1, _v1) = &self.vals[i];
if ts2 > ts1 {
self.vals[i].1 = *ts2;
self.vals[i].2 = v2.clone();
} else if ts1 == ts2 {
self.vals[i].2.merge(v2);
match ts2.cmp(ts1) {
Ordering::Greater => {
self.vals[i].1 = *ts2;
self.vals[i].2 = v2.clone();
}
Ordering::Equal => {
self.vals[i].2.merge(v2);
}
Ordering::Less => (),
}
}
Err(i) => {

View file

@ -5,6 +5,7 @@ extern crate log;
pub mod background;
pub mod config;
pub mod crdt;
pub mod data;
pub mod error;
pub mod persister;

View file

@ -1,11 +1,12 @@
[package]
name = "garage_web"
version = "0.4.0"
version = "0.5.0"
authors = ["Alex Auvolat <alex@adnab.me>", "Quentin Dufour <quentin@dufour.io>"]
edition = "2018"
license = "AGPL-3.0"
description = "S3-like website endpoint crate for the Garage object store"
repository = "https://git.deuxfleurs.fr/Deuxfleurs/garage"
readme = "../../README.md"
[lib]
path = "lib.rs"
@ -13,10 +14,10 @@ path = "lib.rs"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
garage_api = { version = "0.4.0", path = "../api" }
garage_model = { version = "0.4.0", path = "../model" }
garage_util = { version = "0.4.0", path = "../util" }
garage_table = { version = "0.4.0", path = "../table" }
garage_api = { version = "0.5.0", path = "../api" }
garage_model = { version = "0.5.0", path = "../model" }
garage_util = { version = "0.5.0", path = "../util" }
garage_table = { version = "0.5.0", path = "../table" }
err-derive = "0.3"
log = "0.4"