Merge branch 'dev-0.2'

This commit is contained in:
Alex Auvolat 2021-03-18 19:27:02 +01:00
commit 4348bde180
71 changed files with 4654 additions and 3631 deletions

View file

@ -2,20 +2,9 @@ kind: pipeline
name: default name: default
workspace: workspace:
base: /drone base: /drone/garage
clone:
disable: true
steps: steps:
- name: clone
image: alpine/git
commands:
- mkdir -p cargo
- git clone https://git.deuxfleurs.fr/Deuxfleurs/garage.git
- cd garage
- git checkout $DRONE_COMMIT
- name: restore-cache - name: restore-cache
image: meltwater/drone-cache:dev image: meltwater/drone-cache:dev
environment: environment:
@ -31,11 +20,11 @@ steps:
cache_key: '{{ .Repo.Name }}_{{ checksum "garage/Cargo.lock" }}_{{ arch }}_{{ os }}_gzip' cache_key: '{{ .Repo.Name }}_{{ checksum "garage/Cargo.lock" }}_{{ arch }}_{{ os }}_gzip'
region: garage region: garage
mount: mount:
- 'garage/target' - 'target'
- 'cargo/registry/index' - '/drone/cargo/registry/index'
- 'cargo/registry/cache' - '/drone/cargo/registry/cache'
- 'cargo/git/db' - '/drone/cargo/bin'
- 'cargo/bin' - '/drone/cargo/git/db'
path_style: true path_style: true
endpoint: https://garage.deuxfleurs.fr endpoint: https://garage.deuxfleurs.fr
@ -47,7 +36,6 @@ steps:
- apt-get update - apt-get update
- apt-get install --yes libsodium-dev - apt-get install --yes libsodium-dev
- pwd - pwd
- cd garage
- cargo build - cargo build
- name: cargo-test - name: cargo-test
@ -57,7 +45,6 @@ steps:
commands: commands:
- apt-get update - apt-get update
- apt-get install --yes libsodium-dev - apt-get install --yes libsodium-dev
- cd garage
- cargo test - cargo test
- name: rebuild-cache - name: rebuild-cache
@ -75,11 +62,11 @@ steps:
cache_key: '{{ .Repo.Name }}_{{ checksum "garage/Cargo.lock" }}_{{ arch }}_{{ os }}_gzip' cache_key: '{{ .Repo.Name }}_{{ checksum "garage/Cargo.lock" }}_{{ arch }}_{{ os }}_gzip'
region: garage region: garage
mount: mount:
- 'garage/target' - 'target'
- 'cargo/registry/index' - '/drone/cargo/registry/index'
- 'cargo/registry/cache' - '/drone/cargo/registry/cache'
- 'cargo/git/db' - '/drone/cargo/git/db'
- 'cargo/bin' - '/drone/cargo/bin'
path_style: true path_style: true
endpoint: https://garage.deuxfleurs.fr endpoint: https://garage.deuxfleurs.fr
@ -91,5 +78,4 @@ steps:
- apt-get update - apt-get update
- apt-get install --yes libsodium-dev awscli python-pip - apt-get install --yes libsodium-dev awscli python-pip
- pip install s3cmd - pip install s3cmd
- cd garage
- ./script/test-smoke.sh || (cat /tmp/garage.log; false) - ./script/test-smoke.sh || (cat /tmp/garage.log; false)

785
Cargo.lock generated

File diff suppressed because it is too large Load diff

142
LICENSE
View file

@ -1,5 +1,5 @@
GNU GENERAL PUBLIC LICENSE GNU AFFERO GENERAL PUBLIC LICENSE
Version 3, 29 June 2007 Version 3, 19 November 2007
Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/> Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
Everyone is permitted to copy and distribute verbatim copies Everyone is permitted to copy and distribute verbatim copies
@ -7,17 +7,15 @@
Preamble Preamble
The GNU General Public License is a free, copyleft license for The GNU Affero General Public License is a free, copyleft license for
software and other kinds of works. software and other kinds of works, specifically designed to ensure
cooperation with the community in the case of network server software.
The licenses for most software and other practical works are designed The licenses for most software and other practical works are designed
to take away your freedom to share and change the works. By contrast, to take away your freedom to share and change the works. By contrast,
the GNU General Public License is intended to guarantee your freedom to our General Public Licenses are intended to guarantee your freedom to
share and change all versions of a program--to make sure it remains free share and change all versions of a program--to make sure it remains free
software for all its users. We, the Free Software Foundation, use the software for all its users.
GNU General Public License for most of our software; it applies also to
any other work released this way by its authors. You can apply it to
your programs, too.
When we speak of free software, we are referring to freedom, not When we speak of free software, we are referring to freedom, not
price. Our General Public Licenses are designed to make sure that you price. Our General Public Licenses are designed to make sure that you
@ -26,44 +24,34 @@ them if you wish), that you receive source code or can get it if you
want it, that you can change the software or use pieces of it in new want it, that you can change the software or use pieces of it in new
free programs, and that you know you can do these things. free programs, and that you know you can do these things.
To protect your rights, we need to prevent others from denying you Developers that use our General Public Licenses protect your rights
these rights or asking you to surrender the rights. Therefore, you have with two steps: (1) assert copyright on the software, and (2) offer
certain responsibilities if you distribute copies of the software, or if you this License which gives you legal permission to copy, distribute
you modify it: responsibilities to respect the freedom of others. and/or modify the software.
For example, if you distribute copies of such a program, whether A secondary benefit of defending all users' freedom is that
gratis or for a fee, you must pass on to the recipients the same improvements made in alternate versions of the program, if they
freedoms that you received. You must make sure that they, too, receive receive widespread use, become available for other developers to
or can get the source code. And you must show them these terms so they incorporate. Many developers of free software are heartened and
know their rights. encouraged by the resulting cooperation. However, in the case of
software used on network servers, this result may fail to come about.
The GNU General Public License permits making a modified version and
letting the public access it on a server without ever releasing its
source code to the public.
Developers that use the GNU GPL protect your rights with two steps: The GNU Affero General Public License is designed specifically to
(1) assert copyright on the software, and (2) offer you this License ensure that, in such cases, the modified source code becomes available
giving you legal permission to copy, distribute and/or modify it. to the community. It requires the operator of a network server to
provide the source code of the modified version running there to the
users of that server. Therefore, public use of a modified version, on
a publicly accessible server, gives the public access to the source
code of the modified version.
For the developers' and authors' protection, the GPL clearly explains An older license, called the Affero General Public License and
that there is no warranty for this free software. For both users' and published by Affero, was designed to accomplish similar goals. This is
authors' sake, the GPL requires that modified versions be marked as a different license, not a version of the Affero GPL, but Affero has
changed, so that their problems will not be attributed erroneously to released a new version of the Affero GPL which permits relicensing under
authors of previous versions. this license.
Some devices are designed to deny users access to install or run
modified versions of the software inside them, although the manufacturer
can do so. This is fundamentally incompatible with the aim of
protecting users' freedom to change the software. The systematic
pattern of such abuse occurs in the area of products for individuals to
use, which is precisely where it is most unacceptable. Therefore, we
have designed this version of the GPL to prohibit the practice for those
products. If such problems arise substantially in other domains, we
stand ready to extend this provision to those domains in future versions
of the GPL, as needed to protect the freedom of users.
Finally, every program is threatened constantly by software patents.
States should not allow patents to restrict development and use of
software on general-purpose computers, but in those that do, we wish to
avoid the special danger that patents applied to a free program could
make it effectively proprietary. To prevent this, the GPL assures that
patents cannot be used to render the program non-free.
The precise terms and conditions for copying, distribution and The precise terms and conditions for copying, distribution and
modification follow. modification follow.
@ -72,7 +60,7 @@ modification follow.
0. Definitions. 0. Definitions.
"This License" refers to version 3 of the GNU General Public License. "This License" refers to version 3 of the GNU Affero General Public License.
"Copyright" also means copyright-like laws that apply to other kinds of "Copyright" also means copyright-like laws that apply to other kinds of
works, such as semiconductor masks. works, such as semiconductor masks.
@ -549,35 +537,45 @@ to collect a royalty for further conveying from those to whom you convey
the Program, the only way you could satisfy both those terms and this the Program, the only way you could satisfy both those terms and this
License would be to refrain entirely from conveying the Program. License would be to refrain entirely from conveying the Program.
13. Use with the GNU Affero General Public License. 13. Remote Network Interaction; Use with the GNU General Public License.
Notwithstanding any other provision of this License, if you modify the
Program, your modified version must prominently offer all users
interacting with it remotely through a computer network (if your version
supports such interaction) an opportunity to receive the Corresponding
Source of your version by providing access to the Corresponding Source
from a network server at no charge, through some standard or customary
means of facilitating copying of software. This Corresponding Source
shall include the Corresponding Source for any work covered by version 3
of the GNU General Public License that is incorporated pursuant to the
following paragraph.
Notwithstanding any other provision of this License, you have Notwithstanding any other provision of this License, you have
permission to link or combine any covered work with a work licensed permission to link or combine any covered work with a work licensed
under version 3 of the GNU Affero General Public License into a single under version 3 of the GNU General Public License into a single
combined work, and to convey the resulting work. The terms of this combined work, and to convey the resulting work. The terms of this
License will continue to apply to the part which is the covered work, License will continue to apply to the part which is the covered work,
but the special requirements of the GNU Affero General Public License, but the work with which it is combined will remain governed by version
section 13, concerning interaction through a network will apply to the 3 of the GNU General Public License.
combination as such.
14. Revised Versions of this License. 14. Revised Versions of this License.
The Free Software Foundation may publish revised and/or new versions of The Free Software Foundation may publish revised and/or new versions of
the GNU General Public License from time to time. Such new versions will the GNU Affero General Public License from time to time. Such new versions
be similar in spirit to the present version, but may differ in detail to will be similar in spirit to the present version, but may differ in detail to
address new problems or concerns. address new problems or concerns.
Each version is given a distinguishing version number. If the Each version is given a distinguishing version number. If the
Program specifies that a certain numbered version of the GNU General Program specifies that a certain numbered version of the GNU Affero General
Public License "or any later version" applies to it, you have the Public License "or any later version" applies to it, you have the
option of following the terms and conditions either of that numbered option of following the terms and conditions either of that numbered
version or of any later version published by the Free Software version or of any later version published by the Free Software
Foundation. If the Program does not specify a version number of the Foundation. If the Program does not specify a version number of the
GNU General Public License, you may choose any version ever published GNU Affero General Public License, you may choose any version ever published
by the Free Software Foundation. by the Free Software Foundation.
If the Program specifies that a proxy can decide which future If the Program specifies that a proxy can decide which future
versions of the GNU General Public License can be used, that proxy's versions of the GNU Affero General Public License can be used, that proxy's
public statement of acceptance of a version permanently authorizes you public statement of acceptance of a version permanently authorizes you
to choose that version for the Program. to choose that version for the Program.
@ -635,41 +633,29 @@ the "copyright" line and a pointer to where the full notice is found.
Copyright (C) <year> <name of author> Copyright (C) <year> <name of author>
This program is free software: you can redistribute it and/or modify This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or the Free Software Foundation, either version 3 of the License, or
(at your option) any later version. (at your option) any later version.
This program is distributed in the hope that it will be useful, This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details. GNU Affero General Public License for more details.
You should have received a copy of the GNU General Public License You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>. along with this program. If not, see <https://www.gnu.org/licenses/>.
Also add information on how to contact you by electronic and paper mail. Also add information on how to contact you by electronic and paper mail.
If the program does terminal interaction, make it output a short If your software can interact with users remotely through a computer
notice like this when it starts in an interactive mode: network, you should also make sure that it provides a way for users to
get its source. For example, if your program is a web application, its
<program> Copyright (C) <year> <name of author> interface could display a "Source" link that leads users to an archive
This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. of the code. There are many ways you could offer source, and different
This is free software, and you are welcome to redistribute it solutions will be better for different programs; see section 13 for the
under certain conditions; type `show c' for details. specific requirements.
The hypothetical commands `show w' and `show c' should show the appropriate
parts of the General Public License. Of course, your program's commands
might be different; for a GUI interface, you would use an "about box".
You should also get your employer (if you work as a programmer) or school, You should also get your employer (if you work as a programmer) or school,
if any, to sign a "copyright disclaimer" for the program, if necessary. if any, to sign a "copyright disclaimer" for the program, if necessary.
For more information on this, and how to apply and follow the GNU GPL, see For more information on this, and how to apply and follow the GNU AGPL, see
<https://www.gnu.org/licenses/>. <https://www.gnu.org/licenses/>.
The GNU General Public License does not permit incorporating your program
into proprietary programs. If your program is a subroutine library, you
may consider it more useful to permit linking proprietary applications with
the library. If this is what you want to do, use the GNU Lesser General
Public License instead of this License. But first, please read
<https://www.gnu.org/licenses/why-not-lgpl.html>.

View file

@ -4,7 +4,7 @@ DOCKER=lxpz/garage_amd64
all: all:
#cargo fmt || true #cargo fmt || true
#RUSTFLAGS="-C link-arg=-fuse-ld=lld" cargo build #RUSTFLAGS="-C link-arg=-fuse-ld=lld" cargo build
cargo build clear; cargo build
$(BIN): $(BIN):
#RUSTFLAGS="-C link-arg=-fuse-ld=lld" cargo build --release #RUSTFLAGS="-C link-arg=-fuse-ld=lld" cargo build --release

View file

@ -1,6 +1,11 @@
# Garage Garage [![Build Status](https://drone.deuxfleurs.fr/api/badges/Deuxfleurs/garage/status.svg)](https://drone.deuxfleurs.fr/Deuxfleurs/garage)
===
[![Build Status](https://drone.deuxfleurs.fr/api/badges/Deuxfleurs/garage/status.svg)](https://drone.deuxfleurs.fr/Deuxfleurs/garage) <p align="center" style="text-align:center;">
<a href="https://git.deuxfleurs.fr/Deuxfleurs/garage">
<img alt="Garage logo" src="doc/logo/garage.png" height="200" />
</a>
</p>
Garage is a lightweight S3-compatible distributed object store, with the following goals: Garage is a lightweight S3-compatible distributed object store, with the following goals:

Binary file not shown.

After

Width:  |  Height:  |  Size: 8.1 KiB

View file

@ -0,0 +1,113 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<svg
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:cc="http://creativecommons.org/ns#"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:svg="http://www.w3.org/2000/svg"
xmlns="http://www.w3.org/2000/svg"
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
width="250"
height="250"
viewBox="0 0 66.145832 66.145831"
version="1.1"
id="svg916"
inkscape:version="1.0.2 (e86c870879, 2021-01-15)"
sodipodi:docname="garage-dark-notext.svg"
inkscape:export-filename="/home/lx/Deuxfleurs/garage/garage-dark-notext.png"
inkscape:export-xdpi="96"
inkscape:export-ydpi="96">
<defs
id="defs910" />
<sodipodi:namedview
id="base"
pagecolor="#ffffff"
bordercolor="#666666"
borderopacity="1.0"
inkscape:pageopacity="0.0"
inkscape:pageshadow="2"
inkscape:zoom="2.3640695"
inkscape:cx="127.28732"
inkscape:cy="150.37984"
inkscape:document-units="mm"
inkscape:current-layer="layer1"
inkscape:document-rotation="0"
showgrid="false"
fit-margin-top="0"
fit-margin-left="0"
fit-margin-right="0"
fit-margin-bottom="0"
units="px"
inkscape:window-width="1920"
inkscape:window-height="1039"
inkscape:window-x="0"
inkscape:window-y="20"
inkscape:window-maximized="0" />
<metadata
id="metadata913">
<rdf:RDF>
<cc:Work
rdf:about="">
<dc:format>image/svg+xml</dc:format>
<dc:type
rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
<dc:title></dc:title>
</cc:Work>
</rdf:RDF>
</metadata>
<g
inkscape:label="Layer 1"
inkscape:groupmode="layer"
id="layer1"
transform="translate(-141.5009,-98.254059)">
<rect
style="fill:#4e4e4e;fill-opacity:1;stroke-width:1.01574"
id="rect858"
width="66.592186"
height="66.832306"
x="141.5009"
y="98.056282" />
<g
id="g1775"
transform="matrix(1.9019239,0,0,1.9019239,-157.45231,-108.13709)">
<path
class="cls-2"
d="m 187.70646,127.72029 a 0.39366647,0.39366647 0 0 1 -0.0176,0.1366 0.02790919,0.02790919 0 0 1 0,0.0117 l -0.0176,0.0455 v 0 l -0.0176,0.0338 -2.83058,5.59653 c -0.39367,0.77705 -1.11784,0.75355 -0.99592,-0.0323 l 0.56994,-3.18164 c 0.0191,-0.1043 0.18655,-0.83875 0.34666,-1.37049 l -5.46286,1.7054 c -0.85784,5.57155 -8.18914,5.66409 -9.38483,0 l -5.47461,-1.70981 c 0.16011,0.53174 0.32904,1.2706 0.34813,1.3749 l 0.56994,3.18164 c 0.12192,0.78587 -0.60225,0.80937 -0.99592,0.0323 l -2.84822,-5.63031 a 0.20417776,0.20417776 0 0 1 -0.0176,-0.047 0.42304456,0.42304456 0 0 1 0.22181,-0.56552 l 11.69689,-5.17495 a 2.9113691,2.9113691 0 0 1 2.35024,0 l 11.69689,5.17495 a 0.41863785,0.41863785 0 0 1 0.26293,0.41864 z"
id="path24-31"
style="stroke-width:0.146891" />
<path
class="cls-3"
d="m 178.30988,128.69564 5.05744,-2.03591 a 0.21446009,0.21446009 0 0 0 0,-0.39807 c -0.58756,-0.2453 -1.3132,-0.52733 -2.02415,-0.82259 -0.13073,-0.0543 -1.36902,0.83434 -1.48213,0.92542 l -2.17985,1.74212 c -0.52734,0.44214 -0.0705,0.86959 0.62869,0.58903 z"
id="path26-9"
style="stroke-width:0.146891" />
<circle
class="cls-3"
cx="174.64349"
cy="130.68452"
r="2.6366842"
id="circle28-4"
style="stroke-width:0.146891" />
<path
id="path24-3-6-9-0"
style="fill:#ff9329;fill-opacity:1;stroke-width:0.146891"
d="m 174.54269,116.93385 a 2.9113691,2.9113691 0 0 0 -1.14618,0.24753 l -11.69696,5.17488 a 0.42304456,0.42304456 0 0 0 -0.22169,0.56586 0.20417776,0.20417776 0 0 0 0.0176,0.047 l 0.79634,1.57355 11.10475,-4.91288 a 2.9113691,2.9113691 0 0 1 1.14618,-0.24753 2.9113691,2.9113691 0 0 1 1.20406,0.24753 l 11.12387,4.92115 0.7829,-1.54823 0.0176,-0.0336 0.0181,-0.0455 a 0.02790919,0.02790919 0 0 0 0,-0.0119 0.39366647,0.39366647 0 0 0 0.0176,-0.13642 0.41863785,0.41863785 0 0 0 -0.26303,-0.4191 l -11.69697,-5.17488 a 2.9113691,2.9113691 0 0 0 -1.20406,-0.24753 z m -10.12134,9.52449 c 0.0218,0.0723 0.0408,0.14674 0.0615,0.22066 h 0.51831 l -0.008,-0.0419 z m 20.32227,0.005 -0.57103,0.17828 -0.007,0.0377 h 0.5178 c 0.0202,-0.0723 0.0386,-0.14514 0.0599,-0.216 z" />
<path
class="cls-2"
d="m 187.70647,127.72029 a 0.39366647,0.39366647 0 0 1 -0.0176,0.13661 0.02790919,0.02790919 0 0 1 0,0.0117 l -0.0176,0.0455 v 0 l -0.0176,0.0338 -2.83058,5.59652 c -0.39366,0.77705 -1.11783,0.75355 -0.99591,-0.0323 l 0.56993,-3.18165 c 0.0191,-0.10429 0.18655,-0.83874 0.34666,-1.37049 l -5.46285,1.7054 c -0.85784,5.57156 -8.18915,5.6641 -9.38484,0 l -5.4746,-1.70981 c 0.16011,0.53175 0.32903,1.27061 0.34813,1.3749 l 0.56993,3.18165 c 0.12192,0.78586 -0.60225,0.80936 -0.99592,0.0323 l -2.84822,-5.63031 a 0.20417776,0.20417776 0 0 1 -0.0176,-0.047 0.42304456,0.42304456 0 0 1 0.22181,-0.56553 l 11.69688,-5.17495 a 2.9113691,2.9113691 0 0 1 2.35025,0 l 11.69689,5.17495 a 0.41863785,0.41863785 0 0 1 0.26293,0.41864 z"
id="path24-0-3"
style="fill:#ff9329;fill-opacity:1;stroke-width:0.146891" />
<path
class="cls-3"
d="m 178.30988,128.69564 5.05744,-2.0359 a 0.21446009,0.21446009 0 0 0 0,-0.39807 c -0.58756,-0.24531 -1.3132,-0.52734 -2.02415,-0.82259 -0.13073,-0.0543 -1.36902,0.83434 -1.48212,0.92541 l -2.17986,1.74212 c -0.52734,0.44214 -0.0705,0.86959 0.62869,0.58903 z"
id="path26-2-2"
style="fill:#4e4e4e;fill-opacity:1;stroke-width:0.146891" />
<circle
class="cls-3"
cx="174.64349"
cy="130.68452"
r="2.6366842"
id="circle28-3-0"
style="fill:#4e4e4e;fill-opacity:1;stroke-width:0.146891" />
</g>
</g>
</svg>

After

Width:  |  Height:  |  Size: 5.8 KiB

174
doc/logo/garage-dark.svg Normal file
View file

@ -0,0 +1,174 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<svg
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:cc="http://creativecommons.org/ns#"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:svg="http://www.w3.org/2000/svg"
xmlns="http://www.w3.org/2000/svg"
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
width="250"
height="250"
viewBox="0 0 66.145832 66.145831"
version="1.1"
id="svg916"
inkscape:version="1.0.2 (e86c870879, 2021-01-15)"
sodipodi:docname="garage-dark.svg">
<defs
id="defs910" />
<sodipodi:namedview
id="base"
pagecolor="#ffffff"
bordercolor="#666666"
borderopacity="1.0"
inkscape:pageopacity="0.0"
inkscape:pageshadow="2"
inkscape:zoom="2.3640695"
inkscape:cx="132.7426"
inkscape:cy="151.74366"
inkscape:document-units="mm"
inkscape:current-layer="layer1"
inkscape:document-rotation="0"
showgrid="false"
fit-margin-top="0"
fit-margin-left="0"
fit-margin-right="0"
fit-margin-bottom="0"
units="px"
inkscape:window-width="1920"
inkscape:window-height="1039"
inkscape:window-x="0"
inkscape:window-y="20"
inkscape:window-maximized="0" />
<metadata
id="metadata913">
<rdf:RDF>
<cc:Work
rdf:about="">
<dc:format>image/svg+xml</dc:format>
<dc:type
rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
<dc:title></dc:title>
</cc:Work>
</rdf:RDF>
</metadata>
<g
inkscape:label="Layer 1"
inkscape:groupmode="layer"
id="layer1"
transform="translate(-141.5009,-98.254059)">
<rect
style="fill:#4e4e4e;fill-opacity:1;stroke-width:1.01574"
id="rect858"
width="66.592186"
height="66.832306"
x="141.5009"
y="98.056282" />
<g
id="g1637"
transform="translate(1.5164686,-0.22143797)">
<g
id="g1034-5"
transform="matrix(0.26458333,0,0,0.26458333,140.0054,98.562655)">
<path
class="cls-1"
d="m 85.377935,159.38378 5.163143,-0.0333 h 0.06662 q 2.864711,0 2.864711,2.69816 v 8.69407 a 24.849705,24.849705 0 0 1 -8.649651,1.43235 q -4.730105,0 -7.128468,-3.21447 -2.398363,-3.21447 -2.398363,-8.76068 0,-5.55177 2.981299,-8.62745 a 9.7600046,9.7600046 0 0 1 7.29502,-3.08123 13.368653,13.368653 0 0 1 7.811335,2.43167 3.9250986,3.9250986 0 0 1 -0.682867,1.76547 4.7634152,4.7634152 0 0 1 -1.282458,1.33242 9.798867,9.798867 0 0 0 -5.679457,-1.96533 5.3574542,5.3574542 0 0 0 -4.480275,2.04861 q -1.598909,2.03749 -1.598909,6.41229 0,8.22771 6.062529,8.22771 a 16.910679,16.910679 0 0 0 3.697476,-0.43303 v -3.16451 q 0,-1.49898 0.06662,-2.22071 h -2.442777 a 2.2873276,2.2873276 0 0 1 -1.515632,-0.41638 1.6655298,1.6655298 0 0 1 -0.483004,-1.33242 5.7072154,5.7072154 0 0 1 0.333106,-1.79322 z"
id="path8-2"
style="stroke-width:0.555177" />
<path
class="cls-1"
d="m 111.07151,169.8433 a 4.3137222,4.3137222 0 0 1 -0.55518,1.18253 4.0305821,4.0305821 0 0 1 -0.84942,0.94935 3.7640973,3.7640973 0 0 1 -3.05902,-1.95422 6.7453957,6.7453957 0 0 1 -4.76342,2.13188 q -2.564913,0 -3.886233,-1.49898 a 5.1298318,5.1298318 0 0 1 -1.299113,-3.4643 q 0,-2.77588 1.815427,-4.21379 a 7.3338829,7.3338829 0 0 1 4.669039,-1.3935 q 1.53228,0 2.89802,0.13325 v -0.99932 q 0,-2.63154 -2.53161,-2.63154 -1.79877,0 -5.096518,1.19918 a 4.674587,4.674587 0 0 1 -1.110353,-2.96464 18.581761,18.581761 0 0 1 7.217291,-1.49898 5.8682167,5.8682167 0 0 1 4.0639,1.39905 q 1.56559,1.39904 1.56559,4.23044 v 6.79537 q -0.0111,1.83208 0.9216,2.59822 z m -8.36096,-0.83276 a 4.7134493,4.7134493 0 0 0 3.33106,-1.59891 v -2.94244 a 22.368065,22.368065 0 0 0 -2.53161,-0.13324 2.775883,2.775883 0 0 0 -2.06525,0.68842 2.3928111,2.3928111 0 0 0 -0.69953,1.76546 2.3539488,2.3539488 0 0 0 0.55518,1.66553 1.8431863,1.8431863 0 0 0 1.41015,0.55518 z"
id="path10-28"
style="stroke-width:0.555177" />
<path
class="cls-1"
d="m 113.76966,157.11865 a 3.986168,3.986168 0 0 1 0.55518,-1.21583 3.3310596,3.3310596 0 0 1 0.84942,-0.94935 4.1638245,4.1638245 0 0 1 3.51427,2.96464 q 1.33242,-2.96464 4.29707,-2.96464 a 10.215249,10.215249 0 0 1 1.93201,0.23317 7.4782288,7.4782288 0 0 1 -0.99932,3.88624 8.4497879,8.4497879 0 0 0 -1.49897,-0.19987 q -2.03195,0 -3.26444,2.16519 v 10.64829 a 11.575432,11.575432 0 0 1 -2.03195,0.16655 12.769062,12.769062 0 0 1 -2.09857,-0.16655 v -11.15905 q -0.0222,-2.40947 -1.2547,-3.40879 z"
id="path12-9"
style="stroke-width:0.555177" />
<path
class="cls-1"
d="m 140.38483,169.8433 a 4.3137222,4.3137222 0 0 1 -0.58293,1.18253 4.0305821,4.0305821 0 0 1 -0.84942,0.94935 3.7640973,3.7640973 0 0 1 -3.05348,-1.95422 6.7453957,6.7453957 0 0 1 -4.76341,2.13188 q -2.56492,0 -3.88624,-1.49898 a 5.1298318,5.1298318 0 0 1 -1.29911,-3.4643 q 0,-2.77588 1.81543,-4.21379 a 7.3338829,7.3338829 0 0 1 4.64682,-1.4157 q 1.53229,0 2.89803,0.13324 v -0.99932 q 0,-2.63153 -2.53161,-2.63153 -1.79877,0 -5.09652,1.19918 a 4.674587,4.674587 0 0 1 -1.11035,-2.96465 18.581761,18.581761 0 0 1 7.21729,-1.49897 5.8682167,5.8682167 0 0 1 4.0639,1.39904 q 1.56559,1.39905 1.56559,4.23045 v 6.81757 q 0.0333,1.83208 0.96601,2.59822 z m -8.37206,-0.83276 a 4.7134493,4.7134493 0 0 0 3.33106,-1.59891 v -2.94244 a 22.368065,22.368065 0 0 0 -2.53161,-0.13324 2.775883,2.775883 0 0 0 -2.06526,0.69952 2.3928111,2.3928111 0 0 0 -0.69952,1.76546 2.3539488,2.3539488 0 0 0 0.55518,1.66553 1.8431863,1.8431863 0 0 0 1.41015,0.54408 z"
id="path14-7"
style="stroke-width:0.555177" />
<path
class="cls-1"
d="m 144.48203,169.71006 q -1.49897,-2.29843 -1.49897,-6.34567 0,-4.04724 1.8987,-6.34567 a 5.740526,5.740526 0 0 1 4.56355,-2.29843 6.4400486,6.4400486 0 0 1 4.49693,1.66553 3.7696491,3.7696491 0 0 1 2.63154,-1.43235 3.1200925,3.1200925 0 0 1 0.88273,0.93269 3.8862362,3.8862362 0 0 1 0.55518,1.16587 q -0.9327,0.79946 -0.9327,2.86472 v 9.438 q 0,5.29638 -1.73215,7.49488 -1.73215,2.1985 -5.69611,2.22071 a 16.100121,16.100121 0 0 1 -5.9626,-1.11036 4.4802752,4.4802752 0 0 1 1.03263,-3.03126 10.892565,10.892565 0 0 0 4.48028,1.03263 q 2.18184,0 3.0146,-1.11035 a 4.9965894,4.9965894 0 0 0 0.83277,-3.06458 V 170.454 a 6.4011862,6.4011862 0 0 1 -4.16383,1.56559 4.9188647,4.9188647 0 0 1 -4.40255,-2.30953 z m 8.56083,-2.69816 v -7.72806 a 4.2915151,4.2915151 0 0 0 -2.86471,-1.36573 2.4039147,2.4039147 0 0 0 -2.18185,1.43235 8.6885138,8.6885138 0 0 0 -0.7828,4.09721 q 0,2.66485 0.71618,3.93065 a 2.1318781,2.1318781 0 0 0 1.88205,1.2658 4.2304457,4.2304457 0 0 0 3.23113,-1.63222 z"
id="path16-3"
style="stroke-width:0.555177" />
<path
class="cls-1"
d="m 174.20619,164.78009 h -9.32697 a 5.6405943,5.6405943 0 0 0 0.88273,3.04792 q 0.7828,1.0826 2.74813,1.0826 a 10.120869,10.120869 0 0 0 4.36369,-1.16587 4.3803434,4.3803434 0 0 1 1.19918,2.5316 10.759323,10.759323 0 0 1 -6.41229,1.8987 q -3.74744,0 -5.37966,-2.43167 -1.63222,-2.43167 -1.63222,-6.2957 0,-3.88624 1.79877,-6.2957 a 6.0181143,6.0181143 0 0 1 5.14649,-2.43168 q 3.33106,0 5.14648,2.01529 a 7.3449864,7.3449864 0 0 1 1.79878,5.07987 13.04665,13.04665 0 0 1 -0.33311,2.96464 z m -6.42895,-7.06184 q -2.73146,0 -2.93133,4.13051 h 5.79605 v -0.39973 a 4.7245529,4.7245529 0 0 0 -0.69953,-2.69816 2.4316735,2.4316735 0 0 0 -2.14298,-1.03262 z"
id="path18-6"
style="stroke-width:0.555177" />
<path
class="cls-2"
d="m 174.55595,111.039 a 1.4878733,1.4878733 0 0 1 -0.0666,0.51631 0.10548355,0.10548355 0 0 1 0,0.0444 l -0.0666,0.17211 v 0 l -0.0666,0.12769 -10.69826,21.15223 c -1.48787,2.93688 -4.22489,2.84806 -3.76409,-0.12214 l 2.15408,-12.02512 c 0.0722,-0.39418 0.70508,-3.17006 1.31022,-5.1798 l -20.64702,6.4456 c -3.24223,21.05785 -30.95109,21.40761 -35.47023,0 l -20.691432,-6.46226 c 0.605143,2.00974 1.243596,4.80228 1.315769,5.19646 l 2.154085,12.02512 c 0.460796,2.9702 -2.276224,3.05902 -3.764098,0.12214 L 75.49024,111.77183 a 0.77169547,0.77169547 0 0 1 -0.06662,-0.17766 1.5989086,1.5989086 0 0 1 0.838317,-2.13743 L 120.47065,89.897871 a 11.0036,11.0036 0 0 1 8.88282,0 l 44.20871,19.558869 a 1.5822533,1.5822533 0 0 1 0.99377,1.58226 z"
id="path24-31"
style="stroke-width:0.555177" />
<path
class="cls-3"
d="m 139.0413,114.72537 19.11473,-7.69475 a 0.81055784,0.81055784 0 0 0 0,-1.50453 c -2.2207,-0.92714 -4.96328,-1.99308 -7.65033,-3.10899 -0.49411,-0.20541 -5.17425,3.15341 -5.60173,3.49762 l -8.23882,6.58439 c -1.99309,1.67108 -0.26649,3.28665 2.37615,2.22626 z"
id="path26-9"
style="stroke-width:0.555177" />
<circle
class="cls-3"
cx="125.18409"
cy="122.24245"
r="9.9654207"
id="circle28-4"
style="stroke-width:0.555177" />
</g>
<path
class="cls-1"
d="m 162.59498,140.73295 1.36608,-0.009 h 0.0176 q 0.75796,0 0.75796,0.71389 v 2.30031 a 6.5748177,6.5748177 0 0 1 -2.28855,0.37897 q -1.25151,0 -1.88608,-0.85049 -0.63456,-0.8505 -0.63456,-2.31793 0,-1.46891 0.7888,-2.28268 a 2.5823345,2.5823345 0 0 1 1.93014,-0.81524 3.5371227,3.5371227 0 0 1 2.06675,0.64338 1.0385157,1.0385157 0 0 1 -0.18068,0.46711 1.2603203,1.2603203 0 0 1 -0.33931,0.35254 2.5926169,2.5926169 0 0 0 -1.5027,-0.52 1.4174931,1.4174931 0 0 0 -1.1854,0.54203 q -0.42305,0.53909 -0.42305,1.69658 0,2.17692 1.60405,2.17692 a 4.4742838,4.4742838 0 0 0 0.97829,-0.11457 v -0.83728 q 0,-0.3966 0.0176,-0.58756 h -0.64632 a 0.60518875,0.60518875 0 0 1 -0.40101,-0.11017 0.44067142,0.44067142 0 0 1 -0.12779,-0.35254 1.5100341,1.5100341 0 0 1 0.0881,-0.47445 z"
id="path8-6-4"
style="fill:#c3c3c3;fill-opacity:1;stroke-width:0.146891" />
<path
class="cls-1"
d="m 169.39307,143.50037 a 1.141339,1.141339 0 0 1 -0.14689,0.31288 1.0664248,1.0664248 0 0 1 -0.22474,0.25118 0.9959174,0.9959174 0 0 1 -0.80937,-0.51706 1.7847193,1.7847193 0 0 1 -1.26032,0.56406 q -0.67863,0 -1.02823,-0.3966 a 1.357268,1.357268 0 0 1 -0.34373,-0.9166 q 0,-0.73445 0.48034,-1.1149 a 1.9404232,1.9404232 0 0 1 1.23535,-0.36869 q 0.40541,0 0.76676,0.0352 v -0.2644 q 0,-0.69626 -0.66982,-0.69626 -0.47592,0 -1.34845,0.31728 a 1.2368178,1.2368178 0 0 1 -0.29378,-0.78439 4.9164242,4.9164242 0 0 1 1.90957,-0.39661 1.5526323,1.5526323 0 0 1 1.07524,0.37017 q 0.41423,0.37016 0.41423,1.1193 v 1.79794 q -0.003,0.48474 0.24384,0.68745 z m -2.21217,-0.22034 a 1.2471001,1.2471001 0 0 0 0.88134,-0.42304 v -0.77852 a 5.9182171,5.9182171 0 0 0 -0.66982,-0.0353 0.73445237,0.73445237 0 0 0 -0.54643,0.18215 0.63309793,0.63309793 0 0 0 -0.18508,0.46711 0.62281561,0.62281561 0 0 0 0.14689,0.44067 0.48767637,0.48767637 0 0 0 0.3731,0.14689 z"
id="path10-2-5"
style="fill:#c3c3c3;fill-opacity:1;stroke-width:0.146891" />
<path
class="cls-1"
d="m 170.10696,140.13364 a 1.0546736,1.0546736 0 0 1 0.14689,-0.32169 0.88134284,0.88134284 0 0 1 0.22474,-0.25118 1.1016786,1.1016786 0 0 1 0.92982,0.78439 q 0.35254,-0.78439 1.13693,-0.78439 a 2.7027846,2.7027846 0 0 1 0.51118,0.0617 1.9786147,1.9786147 0 0 1 -0.2644,1.02823 2.235673,2.235673 0 0 0 -0.39661,-0.0529 q -0.53762,0 -0.86371,0.57287 v 2.81736 a 3.0626663,3.0626663 0 0 1 -0.53762,0.0441 3.3784809,3.3784809 0 0 1 -0.55525,-0.0441 v -2.95249 q -0.006,-0.63751 -0.33197,-0.90191 z"
id="path12-6-0"
style="fill:#c3c3c3;fill-opacity:1;stroke-width:0.146891" />
<path
class="cls-1"
d="m 177.14889,143.50037 a 1.141339,1.141339 0 0 1 -0.15424,0.31288 1.0664248,1.0664248 0 0 1 -0.22474,0.25118 0.9959174,0.9959174 0 0 1 -0.8079,-0.51706 1.7847193,1.7847193 0 0 1 -1.26032,0.56406 q -0.67863,0 -1.02823,-0.3966 a 1.357268,1.357268 0 0 1 -0.34372,-0.9166 q 0,-0.73445 0.48033,-1.1149 a 1.9404232,1.9404232 0 0 1 1.22947,-0.37457 q 0.40542,0 0.76677,0.0353 v -0.26441 q 0,-0.69626 -0.66982,-0.69626 -0.47593,0 -1.34846,0.31729 a 1.2368178,1.2368178 0 0 1 -0.29378,-0.7844 4.9164242,4.9164242 0 0 1 1.90958,-0.3966 1.5526323,1.5526323 0 0 1 1.07524,0.37016 q 0.41423,0.37017 0.41423,1.11931 v 1.80381 q 0.009,0.48474 0.25559,0.68745 z m -2.21511,-0.22034 a 1.2471001,1.2471001 0 0 0 0.88134,-0.42304 v -0.77852 a 5.9182171,5.9182171 0 0 0 -0.66982,-0.0353 0.73445237,0.73445237 0 0 0 -0.54643,0.18509 0.63309793,0.63309793 0 0 0 -0.18508,0.46711 0.62281561,0.62281561 0 0 0 0.14689,0.44067 0.48767637,0.48767637 0 0 0 0.3731,0.14395 z"
id="path14-1-3"
style="fill:#c3c3c3;fill-opacity:1;stroke-width:0.146891" />
<path
class="cls-1"
d="m 178.23294,143.46511 q -0.3966,-0.60812 -0.3966,-1.67895 0,-1.07084 0.50236,-1.67896 a 1.5188475,1.5188475 0 0 1 1.20744,-0.60813 1.7039295,1.7039295 0 0 1 1.18981,0.44067 0.99738631,0.99738631 0 0 1 0.69626,-0.37897 0.82552446,0.82552446 0 0 1 0.23356,0.24677 1.0282333,1.0282333 0 0 1 0.14689,0.30847 q -0.24678,0.21152 -0.24678,0.75796 v 2.49714 q 0,1.40133 -0.45829,1.98302 -0.4583,0.58168 -1.5071,0.58756 a 4.2598236,4.2598236 0 0 1 -1.5776,-0.29378 1.1854061,1.1854061 0 0 1 0.27321,-0.80203 2.8819911,2.8819911 0 0 0 1.18541,0.27322 q 0.57728,0 0.79761,-0.29378 a 1.3220143,1.3220143 0 0 0 0.22034,-0.81084 v -0.35253 a 1.6936472,1.6936472 0 0 1 -1.10168,0.41423 1.3014496,1.3014496 0 0 1 -1.16484,-0.61107 z m 2.26505,-0.71388 v -2.04472 a 1.1354634,1.1354634 0 0 0 -0.75795,-0.36135 0.63603576,0.63603576 0 0 0 -0.57728,0.37898 2.2988359,2.2988359 0 0 0 -0.20712,1.08405 q 0,0.70508 0.18949,1.03998 a 0.56405941,0.56405941 0 0 0 0.49796,0.33491 1.1193054,1.1193054 0 0 0 0.8549,-0.43185 z"
id="path16-8-6"
style="fill:#c3c3c3;fill-opacity:1;stroke-width:0.146891" />
<path
class="cls-1"
d="m 186.09746,142.16073 h -2.46776 a 1.4924072,1.4924072 0 0 0 0.23355,0.80643 q 0.20712,0.28643 0.72711,0.28643 a 2.6778132,2.6778132 0 0 0 1.15456,-0.30847 1.1589658,1.1589658 0 0 1 0.31728,0.66982 2.8467375,2.8467375 0 0 1 -1.69658,0.50237 q -0.99151,0 -1.42337,-0.64338 -0.43186,-0.64338 -0.43186,-1.66574 0,-1.02823 0.47593,-1.66574 a 1.5922927,1.5922927 0 0 1 1.36167,-0.64338 q 0.88134,0 1.36167,0.53321 a 1.943361,1.943361 0 0 1 0.47593,1.34405 3.4519261,3.4519261 0 0 1 -0.0881,0.7844 z m -1.701,-1.86845 q -0.7227,0 -0.77558,1.09287 h 1.53354 v -0.10577 a 1.2500379,1.2500379 0 0 0 -0.18508,-0.71388 0.64338027,0.64338027 0 0 0 -0.567,-0.27322 z"
id="path18-7-1"
style="fill:#c3c3c3;fill-opacity:1;stroke-width:0.146891" />
<path
id="path24-3-6-9-0"
style="fill:#ff9329;fill-opacity:1;stroke-width:0.146891"
d="m 173.02622,117.15529 a 2.9113691,2.9113691 0 0 0 -1.14618,0.24753 l -11.69696,5.17488 a 0.42304456,0.42304456 0 0 0 -0.22169,0.56586 0.20417776,0.20417776 0 0 0 0.0176,0.047 l 0.79634,1.57355 11.10475,-4.91288 a 2.9113691,2.9113691 0 0 1 1.14618,-0.24753 2.9113691,2.9113691 0 0 1 1.20406,0.24753 l 11.12387,4.92115 0.7829,-1.54823 0.0176,-0.0336 0.0181,-0.0455 a 0.02790919,0.02790919 0 0 0 0,-0.0119 0.39366647,0.39366647 0 0 0 0.0176,-0.13642 0.41863785,0.41863785 0 0 0 -0.26303,-0.4191 l -11.69697,-5.17488 a 2.9113691,2.9113691 0 0 0 -1.20406,-0.24753 z m -10.12134,9.52449 c 0.0218,0.0723 0.0408,0.14674 0.0615,0.22066 h 0.51831 l -0.008,-0.0419 z m 20.32227,0.005 -0.57103,0.17828 -0.007,0.0377 h 0.5178 c 0.0202,-0.0723 0.0386,-0.14514 0.0599,-0.216 z" />
<path
class="cls-2"
d="m 186.19,127.94173 a 0.39366647,0.39366647 0 0 1 -0.0176,0.13661 0.02790919,0.02790919 0 0 1 0,0.0117 l -0.0176,0.0455 v 0 l -0.0176,0.0338 -2.83058,5.59652 c -0.39366,0.77705 -1.11783,0.75355 -0.99591,-0.0323 l 0.56993,-3.18165 c 0.0191,-0.10429 0.18655,-0.83874 0.34666,-1.37049 l -5.46285,1.7054 c -0.85784,5.57156 -8.18915,5.6641 -9.38484,0 l -5.4746,-1.70981 c 0.16011,0.53175 0.32903,1.27061 0.34813,1.3749 l 0.56993,3.18165 c 0.12192,0.78586 -0.60225,0.80936 -0.99592,0.0323 l -2.84822,-5.63031 a 0.20417776,0.20417776 0 0 1 -0.0176,-0.047 0.42304456,0.42304456 0 0 1 0.22181,-0.56553 l 11.69688,-5.17495 a 2.9113691,2.9113691 0 0 1 2.35025,0 l 11.69689,5.17495 a 0.41863785,0.41863785 0 0 1 0.26293,0.41864 z"
id="path24-0-3"
style="fill:#ff9329;fill-opacity:1;stroke-width:0.146891" />
<path
class="cls-3"
d="m 176.79341,128.91708 5.05744,-2.0359 a 0.21446009,0.21446009 0 0 0 0,-0.39807 c -0.58756,-0.24531 -1.3132,-0.52734 -2.02415,-0.82259 -0.13073,-0.0543 -1.36902,0.83434 -1.48212,0.92541 l -2.17986,1.74212 c -0.52734,0.44214 -0.0705,0.86959 0.62869,0.58903 z"
id="path26-2-2"
style="fill:#4e4e4e;fill-opacity:1;stroke-width:0.146891" />
<circle
class="cls-3"
cx="173.12703"
cy="130.90596"
r="2.6366842"
id="circle28-3-0"
style="fill:#4e4e4e;fill-opacity:1;stroke-width:0.146891" />
</g>
</g>
</svg>

After

Width:  |  Height:  |  Size: 17 KiB

BIN
doc/logo/garage-notext.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 7.8 KiB

146
doc/logo/garage-notext.svg Normal file
View file

@ -0,0 +1,146 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<svg
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:cc="http://creativecommons.org/ns#"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:svg="http://www.w3.org/2000/svg"
xmlns="http://www.w3.org/2000/svg"
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
id="Calque_1"
data-name="Calque 1"
width="250"
height="250"
viewBox="0 0 249.99999 250"
version="1.1"
sodipodi:docname="garage-notext.svg"
inkscape:version="1.0.2 (e86c870879, 2021-01-15)"
inkscape:export-filename="/home/lx/Deuxfleurs/garage/garage-notext.png"
inkscape:export-xdpi="96"
inkscape:export-ydpi="96">
<metadata
id="metadata33">
<rdf:RDF>
<cc:Work
rdf:about="">
<dc:format>image/svg+xml</dc:format>
<dc:type
rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
<dc:title></dc:title>
</cc:Work>
</rdf:RDF>
</metadata>
<sodipodi:namedview
pagecolor="#ffffff"
bordercolor="#666666"
borderopacity="1"
objecttolerance="10"
gridtolerance="10"
guidetolerance="10"
inkscape:pageopacity="1"
inkscape:pageshadow="2"
inkscape:window-width="1920"
inkscape:window-height="1039"
id="namedview31"
showgrid="false"
inkscape:zoom="2.1842656"
inkscape:cx="143.86571"
inkscape:cy="118.5836"
inkscape:window-x="0"
inkscape:window-y="20"
inkscape:window-maximized="0"
inkscape:current-layer="Calque_1"
inkscape:document-rotation="0"
units="px"
showguides="false"
inkscape:guide-bbox="true"
inkscape:snap-global="false"
width="250mm">
<sodipodi:guide
position="102.90662,161.07694"
orientation="0,-1"
id="guide1016" />
<sodipodi:guide
position="122.45269,170.65683"
orientation="0,-1"
id="guide1018" />
<sodipodi:guide
position="128.86504,180.08221"
orientation="0,-1"
id="guide1020" />
</sodipodi:namedview>
<defs
id="defs4">
<style
id="style2">.cls-1{fill:#3b2100;}.cls-2{fill:#ffd952;}.cls-3{fill:#45c8ff;}</style>
</defs>
<rect
style="fill:#ffffff;stroke-width:3.60793"
id="rect3824"
width="251.68179"
height="250.98253"
x="-0.59092933"
y="-0.31321606" />
<g
id="g1719"
transform="matrix(1.9099251,0,0,1.9099251,-113.74064,-74.610597)">
<path
d="m 138.41049,100.63656 a 8.327649,8.327649 0 0 1 -2.77589,-0.28869 l -34.78736,-9.388039 a 8.4442361,8.4442361 0 0 1 -2.620438,-1.238044 z"
id="path6"
style="stroke-width:0.555177" />
<path
id="path24-3-6"
style="fill:#ffd952;fill-opacity:1;stroke-width:0.555177"
d="m 124.88254,70.600847 a 11.0036,11.0036 0 0 0 -4.33203,0.935547 L 76.341524,91.094987 a 1.5989086,1.5989086 0 0 0 -0.837891,2.138672 0.77169547,0.77169547 0 0 0 0.06641,0.177735 l 7.09375,14.021486 h 6.15625 l -0.875,-4.88867 c -0.07217,-0.39418 -0.711263,-3.187537 -1.316406,-5.197269 l 20.691403,6.462899 c 0.27198,1.28839 0.63292,2.49204 1.0625,3.62304 h 33.54883 c 0.36964,-1.13128 0.66138,-2.33705 0.85938,-3.62304 l 20.64648,-6.445321 c -0.60514,2.009734 -1.23639,4.785511 -1.30859,5.179691 l -0.875,4.88867 h 6.15429 l 7.02735,-13.894533 0.0664,-0.126953 0.0684,-0.171875 a 0.10548355,0.10548355 0 0 0 0,-0.04492 1.4878733,1.4878733 0 0 0 0.0664,-0.515625 1.5822533,1.5822533 0 0 0 -0.99414,-1.583985 L 129.43333,71.536394 a 11.0036,11.0036 0 0 0 -4.55079,-0.935547 z" />
<path
id="path24-3"
style="fill:#49c8fa;fill-opacity:1;stroke-width:0.555177"
d="m 124.88254,79.854518 a 11.0036,11.0036 0 0 0 -4.33203,0.935547 L 76.341524,100.34866 a 1.5989086,1.5989086 0 0 0 -0.837891,2.13672 0.77169547,0.77169547 0 0 0 0.06641,0.17773 l 3.847657,7.60352 h 8.175781 c -0.257897,-1.08856 -0.591943,-2.42953 -0.964844,-3.66797 l 11.744141,3.66797 h 53.371092 l 11.69336,-3.65039 c -0.37193,1.23522 -0.70076,2.56719 -0.95703,3.65039 h 8.17383 l 3.78125,-7.47656 0.0664,-0.12696 0.0684,-0.17187 a 0.10548355,0.10548355 0 0 0 0,-0.0449 1.4878733,1.4878733 0 0 0 0.0664,-0.51563 1.5822533,1.5822533 0 0 0 -0.99414,-1.58203 L 129.43333,80.790065 a 11.0036,11.0036 0 0 0 -4.55079,-0.935547 z" />
<path
class="cls-2"
d="m 174.63576,111.36813 a 1.4878733,1.4878733 0 0 1 -0.0666,0.51631 0.10548355,0.10548355 0 0 1 0,0.0444 l -0.0666,0.17211 v 0 l -0.0666,0.12769 -10.69826,21.15223 c -1.48787,2.93688 -4.22489,2.84806 -3.76409,-0.12214 l 2.15408,-12.02512 c 0.0722,-0.39418 0.70508,-3.17006 1.31022,-5.1798 l -20.64702,6.4456 c -3.24223,21.05785 -30.95109,21.40761 -35.47023,0 l -20.691437,-6.46226 c 0.605143,2.00974 1.243596,4.80228 1.315769,5.19646 l 2.154085,12.02512 c 0.460796,2.9702 -2.276224,3.05902 -3.764098,0.12214 L 75.570045,112.10096 a 0.77169547,0.77169547 0 0 1 -0.06662,-0.17766 1.5989086,1.5989086 0 0 1 0.838317,-2.13743 L 120.55046,90.226998 a 11.0036,11.0036 0 0 1 8.88282,0 l 44.20871,19.558872 a 1.5822533,1.5822533 0 0 1 0.99377,1.58226 z"
id="path24"
style="stroke-width:0.555177" />
<path
class="cls-3"
d="m 139.12111,115.0545 19.11473,-7.69475 a 0.81055784,0.81055784 0 0 0 0,-1.50453 c -2.2207,-0.92714 -4.96328,-1.99308 -7.65033,-3.10899 -0.49411,-0.20541 -5.17425,3.15341 -5.60173,3.49762 l -8.23882,6.58439 c -1.99309,1.67108 -0.26649,3.28665 2.37615,2.22626 z"
id="path26"
style="stroke-width:0.555177" />
<circle
class="cls-3"
cx="125.26389"
cy="122.57157"
r="9.9654207"
id="circle28"
style="stroke-width:0.555177" />
<path
d="m 138.41049,100.63656 a 8.327649,8.327649 0 0 1 -2.77589,-0.28869 l -34.78736,-9.388039 a 8.4442361,8.4442361 0 0 1 -2.620438,-1.238044 z"
id="path6-0"
style="stroke-width:0.555177" />
<path
id="path24-3-6-9"
style="fill:#ff9329;fill-opacity:1;stroke-width:0.555177"
d="m 124.88254,70.600847 a 11.0036,11.0036 0 0 0 -4.33203,0.935547 L 76.341524,91.094987 a 1.5989086,1.5989086 0 0 0 -0.837891,2.138672 0.77169547,0.77169547 0 0 0 0.06641,0.177735 l 7.09375,14.021486 h 6.15625 l -0.875,-4.88867 c -0.07217,-0.39418 -0.711263,-3.187537 -1.316406,-5.197269 l 20.691403,6.462899 c 0.27198,1.28839 0.63292,2.49204 1.0625,3.62304 h 33.54883 c 0.36964,-1.13128 0.66138,-2.33705 0.85938,-3.62304 l 20.64648,-6.445321 c -0.60514,2.009734 -1.23639,4.785511 -1.30859,5.179691 l -0.875,4.88867 h 6.15429 l 7.02735,-13.894533 0.0664,-0.126953 0.0684,-0.171875 a 0.10548355,0.10548355 0 0 0 0,-0.04492 1.4878733,1.4878733 0 0 0 0.0664,-0.515625 1.5822533,1.5822533 0 0 0 -0.99414,-1.583985 L 129.43333,71.536394 a 11.0036,11.0036 0 0 0 -4.55079,-0.935547 z" />
<path
id="path24-3-2"
style="fill:#4e4e4e;fill-opacity:1;stroke-width:0.555177"
d="m 124.88254,79.854518 a 11.0036,11.0036 0 0 0 -4.33203,0.935547 L 76.341524,100.34866 a 1.5989086,1.5989086 0 0 0 -0.837891,2.13672 0.77169547,0.77169547 0 0 0 0.06641,0.17773 l 3.847657,7.60352 h 8.175781 c -0.257897,-1.08856 -0.591943,-2.42953 -0.964844,-3.66797 l 11.744141,3.66797 h 53.371092 l 11.69336,-3.65039 c -0.37193,1.23522 -0.70076,2.56719 -0.95703,3.65039 h 8.17383 l 3.78125,-7.47656 0.0664,-0.12696 0.0684,-0.17187 a 0.10548355,0.10548355 0 0 0 0,-0.0449 1.4878733,1.4878733 0 0 0 0.0664,-0.51563 1.5822533,1.5822533 0 0 0 -0.99414,-1.58203 L 129.43333,80.790065 a 11.0036,11.0036 0 0 0 -4.55079,-0.935547 z" />
<path
class="cls-2"
d="m 174.63576,111.36813 a 1.4878733,1.4878733 0 0 1 -0.0666,0.51631 0.10548355,0.10548355 0 0 1 0,0.0444 l -0.0666,0.17211 v 0 l -0.0666,0.12769 -10.69826,21.15223 c -1.48787,2.93688 -4.22489,2.84806 -3.76409,-0.12214 l 2.15408,-12.02512 c 0.0722,-0.39418 0.70508,-3.17006 1.31022,-5.1798 l -20.64702,6.4456 c -3.24223,21.05785 -30.95109,21.40761 -35.47023,0 l -20.691437,-6.46226 c 0.605143,2.00974 1.243596,4.80228 1.315769,5.19646 l 2.154085,12.02512 c 0.460796,2.9702 -2.276224,3.05902 -3.764098,0.12214 L 75.570045,112.10096 a 0.77169547,0.77169547 0 0 1 -0.06662,-0.17766 1.5989086,1.5989086 0 0 1 0.838317,-2.13743 L 120.55046,90.226998 a 11.0036,11.0036 0 0 1 8.88282,0 l 44.20871,19.558872 a 1.5822533,1.5822533 0 0 1 0.99377,1.58226 z"
id="path24-0"
style="fill:#ff9329;fill-opacity:1;stroke-width:0.555177" />
<path
class="cls-3"
d="m 139.12111,115.0545 19.11473,-7.69475 a 0.81055784,0.81055784 0 0 0 0,-1.50453 c -2.2207,-0.92714 -4.96328,-1.99308 -7.65033,-3.10899 -0.49411,-0.20541 -5.17425,3.15341 -5.60173,3.49762 l -8.23882,6.58439 c -1.99309,1.67108 -0.26649,3.28665 2.37615,2.22626 z"
id="path26-2"
style="fill:#4e4e4e;fill-opacity:1;stroke-width:0.555177" />
<circle
class="cls-3"
cx="125.26389"
cy="122.57157"
r="9.9654207"
id="circle28-3"
style="fill:#4e4e4e;fill-opacity:1;stroke-width:0.555177" />
</g>
</svg>

After

Width:  |  Height:  |  Size: 8.8 KiB

BIN
doc/logo/garage.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 12 KiB

206
doc/logo/garage.svg Normal file
View file

@ -0,0 +1,206 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<svg
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:cc="http://creativecommons.org/ns#"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:svg="http://www.w3.org/2000/svg"
xmlns="http://www.w3.org/2000/svg"
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
id="Calque_1"
data-name="Calque 1"
width="250"
height="250"
viewBox="0 0 249.99999 250"
version="1.1"
sodipodi:docname="garage.svg"
inkscape:version="1.0.2 (e86c870879, 2021-01-15)"
inkscape:export-filename="/home/lx/Deuxfleurs/garage/doc/logo/garage.png"
inkscape:export-xdpi="96"
inkscape:export-ydpi="96">
<metadata
id="metadata33">
<rdf:RDF>
<cc:Work
rdf:about="">
<dc:format>image/svg+xml</dc:format>
<dc:type
rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
<dc:title />
</cc:Work>
</rdf:RDF>
</metadata>
<sodipodi:namedview
pagecolor="#ffffff"
bordercolor="#666666"
borderopacity="1"
objecttolerance="10"
gridtolerance="10"
guidetolerance="10"
inkscape:pageopacity="1"
inkscape:pageshadow="2"
inkscape:window-width="1920"
inkscape:window-height="1080"
id="namedview31"
showgrid="false"
inkscape:zoom="2.1842656"
inkscape:cx="90.853672"
inkscape:cy="123.63257"
inkscape:window-x="0"
inkscape:window-y="0"
inkscape:window-maximized="0"
inkscape:current-layer="Calque_1"
inkscape:document-rotation="0"
units="px"
showguides="false"
inkscape:guide-bbox="true"
inkscape:snap-global="false"
width="250mm">
<sodipodi:guide
position="102.90662,161.07694"
orientation="0,-1"
id="guide1016" />
<sodipodi:guide
position="122.45269,170.65683"
orientation="0,-1"
id="guide1018" />
<sodipodi:guide
position="128.86504,180.08221"
orientation="0,-1"
id="guide1020" />
</sodipodi:namedview>
<defs
id="defs4">
<style
id="style2">.cls-1{fill:#3b2100;}.cls-2{fill:#ffd952;}.cls-3{fill:#45c8ff;}</style>
</defs>
<rect
style="fill:#ffffff;stroke-width:3.60793"
id="rect3824"
width="251.68179"
height="250.98253"
x="-0.59092933"
y="-0.31321606" />
<g
id="g1663"
transform="matrix(1.7099534,0,0,1.7099534,-88.607712,-87.994557)">
<path
d="m 138.33068,100.19817 a 8.327649,8.327649 0 0 1 -2.77589,-0.288688 l -34.78736,-9.388036 a 8.4442361,8.4442361 0 0 1 -2.620433,-1.238044 z"
id="path6"
style="stroke-width:0.555177" />
<path
class="cls-1"
d="m 85.377935,159.27452 5.163143,-0.0333 h 0.06662 q 2.864711,0 2.864711,2.69816 v 8.69407 a 24.849705,24.849705 0 0 1 -8.649651,1.43235 q -4.730105,0 -7.128468,-3.21447 -2.398363,-3.21447 -2.398363,-8.76068 0,-5.55177 2.981299,-8.62745 a 9.7600046,9.7600046 0 0 1 7.29502,-3.08123 13.368653,13.368653 0 0 1 7.811335,2.43167 3.9250986,3.9250986 0 0 1 -0.682867,1.76547 4.7634152,4.7634152 0 0 1 -1.282458,1.33242 9.798867,9.798867 0 0 0 -5.679457,-1.96533 5.3574542,5.3574542 0 0 0 -4.480275,2.04861 q -1.598909,2.03749 -1.598909,6.41229 0,8.22771 6.062529,8.22771 a 16.910679,16.910679 0 0 0 3.697476,-0.43303 v -3.16451 q 0,-1.49898 0.06662,-2.22071 h -2.442777 a 2.2873276,2.2873276 0 0 1 -1.515632,-0.41638 1.6655298,1.6655298 0 0 1 -0.483004,-1.33242 5.7072154,5.7072154 0 0 1 0.333106,-1.79322 z"
id="path8"
style="fill:#4e4e4e;fill-opacity:1;stroke-width:0.555177" />
<path
class="cls-1"
d="m 111.07151,169.73404 a 4.3137222,4.3137222 0 0 1 -0.55518,1.18253 4.0305821,4.0305821 0 0 1 -0.84942,0.94935 3.7640973,3.7640973 0 0 1 -3.05902,-1.95422 6.7453957,6.7453957 0 0 1 -4.76342,2.13188 q -2.564913,0 -3.886233,-1.49898 a 5.1298318,5.1298318 0 0 1 -1.299113,-3.4643 q 0,-2.77588 1.815427,-4.21379 a 7.3338829,7.3338829 0 0 1 4.669039,-1.3935 q 1.53228,0 2.89802,0.13325 v -0.99932 q 0,-2.63154 -2.53161,-2.63154 -1.79877,0 -5.096518,1.19918 a 4.674587,4.674587 0 0 1 -1.110353,-2.96464 18.581761,18.581761 0 0 1 7.217291,-1.49898 5.8682167,5.8682167 0 0 1 4.0639,1.39905 q 1.56559,1.39904 1.56559,4.23044 v 6.79537 q -0.0111,1.83208 0.9216,2.59822 z m -8.36096,-0.83276 a 4.7134493,4.7134493 0 0 0 3.33106,-1.59891 v -2.94244 a 22.368065,22.368065 0 0 0 -2.53161,-0.13324 2.775883,2.775883 0 0 0 -2.06525,0.68842 2.3928111,2.3928111 0 0 0 -0.69953,1.76546 2.3539488,2.3539488 0 0 0 0.55518,1.66553 1.8431863,1.8431863 0 0 0 1.41015,0.55518 z"
id="path10"
style="fill:#4e4e4e;fill-opacity:1;stroke-width:0.555177" />
<path
class="cls-1"
d="m 113.76966,157.00939 a 3.986168,3.986168 0 0 1 0.55518,-1.21583 3.3310596,3.3310596 0 0 1 0.84942,-0.94935 4.1638245,4.1638245 0 0 1 3.51427,2.96464 q 1.33242,-2.96464 4.29707,-2.96464 a 10.215249,10.215249 0 0 1 1.93201,0.23317 7.4782288,7.4782288 0 0 1 -0.99932,3.88624 8.4497879,8.4497879 0 0 0 -1.49897,-0.19987 q -2.03195,0 -3.26444,2.16519 v 10.64829 a 11.575432,11.575432 0 0 1 -2.03195,0.16655 12.769062,12.769062 0 0 1 -2.09857,-0.16655 v -11.15905 q -0.0222,-2.40947 -1.2547,-3.40879 z"
id="path12"
style="fill:#4e4e4e;fill-opacity:1;stroke-width:0.555177" />
<path
class="cls-1"
d="m 140.38483,169.73404 a 4.3137222,4.3137222 0 0 1 -0.58293,1.18253 4.0305821,4.0305821 0 0 1 -0.84942,0.94935 3.7640973,3.7640973 0 0 1 -3.05348,-1.95422 6.7453957,6.7453957 0 0 1 -4.76341,2.13188 q -2.56492,0 -3.88624,-1.49898 a 5.1298318,5.1298318 0 0 1 -1.29911,-3.4643 q 0,-2.77588 1.81543,-4.21379 a 7.3338829,7.3338829 0 0 1 4.64682,-1.4157 q 1.53229,0 2.89803,0.13324 v -0.99932 q 0,-2.63153 -2.53161,-2.63153 -1.79877,0 -5.09652,1.19918 a 4.674587,4.674587 0 0 1 -1.11035,-2.96465 18.581761,18.581761 0 0 1 7.21729,-1.49897 5.8682167,5.8682167 0 0 1 4.0639,1.39904 q 1.56559,1.39905 1.56559,4.23045 v 6.81757 q 0.0333,1.83208 0.96601,2.59822 z m -8.37206,-0.83276 a 4.7134493,4.7134493 0 0 0 3.33106,-1.59891 v -2.94244 a 22.368065,22.368065 0 0 0 -2.53161,-0.13324 2.775883,2.775883 0 0 0 -2.06526,0.69952 2.3928111,2.3928111 0 0 0 -0.69952,1.76546 2.3539488,2.3539488 0 0 0 0.55518,1.66553 1.8431863,1.8431863 0 0 0 1.41015,0.54408 z"
id="path14"
style="fill:#4e4e4e;fill-opacity:1;stroke-width:0.555177" />
<path
class="cls-1"
d="m 144.48203,169.6008 q -1.49897,-2.29843 -1.49897,-6.34567 0,-4.04724 1.8987,-6.34567 a 5.740526,5.740526 0 0 1 4.56355,-2.29843 6.4400486,6.4400486 0 0 1 4.49693,1.66553 3.7696491,3.7696491 0 0 1 2.63154,-1.43235 3.1200925,3.1200925 0 0 1 0.88273,0.93269 3.8862362,3.8862362 0 0 1 0.55518,1.16587 q -0.9327,0.79946 -0.9327,2.86472 v 9.438 q 0,5.29638 -1.73215,7.49488 -1.73215,2.1985 -5.69611,2.22071 a 16.100121,16.100121 0 0 1 -5.9626,-1.11036 4.4802752,4.4802752 0 0 1 1.03263,-3.03126 10.892565,10.892565 0 0 0 4.48028,1.03263 q 2.18184,0 3.0146,-1.11035 a 4.9965894,4.9965894 0 0 0 0.83277,-3.06458 v -1.33242 a 6.4011862,6.4011862 0 0 1 -4.16383,1.56559 4.9188647,4.9188647 0 0 1 -4.40255,-2.30953 z m 8.56083,-2.69816 v -7.72806 a 4.2915151,4.2915151 0 0 0 -2.86471,-1.36573 2.4039147,2.4039147 0 0 0 -2.18185,1.43235 8.6885138,8.6885138 0 0 0 -0.7828,4.09721 q 0,2.66485 0.71618,3.93065 a 2.1318781,2.1318781 0 0 0 1.88205,1.2658 4.2304457,4.2304457 0 0 0 3.23113,-1.63222 z"
id="path16"
style="fill:#4e4e4e;fill-opacity:1;stroke-width:0.555177" />
<path
class="cls-1"
d="m 174.20619,164.67083 h -9.32697 a 5.6405943,5.6405943 0 0 0 0.88273,3.04792 q 0.7828,1.0826 2.74813,1.0826 a 10.120869,10.120869 0 0 0 4.36369,-1.16587 4.3803434,4.3803434 0 0 1 1.19918,2.5316 10.759323,10.759323 0 0 1 -6.41229,1.8987 q -3.74744,0 -5.37966,-2.43167 -1.63222,-2.43167 -1.63222,-6.2957 0,-3.88624 1.79877,-6.2957 a 6.0181143,6.0181143 0 0 1 5.14649,-2.43168 q 3.33106,0 5.14648,2.01529 a 7.3449864,7.3449864 0 0 1 1.79878,5.07987 13.04665,13.04665 0 0 1 -0.33311,2.96464 z m -6.42895,-7.06184 q -2.73146,0 -2.93133,4.13051 h 5.79605 v -0.39973 a 4.7245529,4.7245529 0 0 0 -0.69953,-2.69816 2.4316735,2.4316735 0 0 0 -2.14298,-1.03262 z"
id="path18"
style="fill:#4e4e4e;fill-opacity:1;stroke-width:0.555177" />
<path
id="path24-3-6"
style="fill:#ffd952;fill-opacity:1;stroke-width:0.555177"
d="m 124.80273,70.162462 a 11.0036,11.0036 0 0 0 -4.33203,0.935547 L 76.261719,90.656602 a 1.5989086,1.5989086 0 0 0 -0.837891,2.138672 0.77169547,0.77169547 0 0 0 0.06641,0.177735 l 7.09375,14.021481 h 6.15625 l -0.875,-4.88867 c -0.07217,-0.39418 -0.711263,-3.187532 -1.316406,-5.197264 l 20.691398,6.462894 c 0.27198,1.28839 0.63292,2.49204 1.0625,3.62304 h 33.54883 c 0.36964,-1.13128 0.66138,-2.33705 0.85938,-3.62304 l 20.64648,-6.445316 c -0.60514,2.009734 -1.23639,4.785506 -1.30859,5.179686 l -0.875,4.88867 h 6.15429 l 7.02735,-13.894528 0.0664,-0.126953 0.0684,-0.171875 a 0.10548355,0.10548355 0 0 0 0,-0.04492 1.4878733,1.4878733 0 0 0 0.0664,-0.515625 1.5822533,1.5822533 0 0 0 -0.99414,-1.583985 L 129.35352,71.098009 a 11.0036,11.0036 0 0 0 -4.55079,-0.935547 z" />
<path
id="path24-3"
style="fill:#49c8fa;fill-opacity:1;stroke-width:0.555177"
d="M 124.80273,79.416133 A 11.0036,11.0036 0 0 0 120.4707,80.35168 L 76.261719,99.910272 a 1.5989086,1.5989086 0 0 0 -0.837891,2.136718 0.77169547,0.77169547 0 0 0 0.06641,0.17773 l 3.847657,7.60352 h 8.175781 c -0.257897,-1.08856 -0.591943,-2.42953 -0.964844,-3.66797 l 11.744141,3.66797 h 53.371087 l 11.69336,-3.65039 c -0.37193,1.23522 -0.70076,2.56719 -0.95703,3.65039 h 8.17383 l 3.78125,-7.47656 0.0664,-0.12696 0.0684,-0.17187 a 0.10548355,0.10548355 0 0 0 0,-0.0449 1.4878733,1.4878733 0 0 0 0.0664,-0.51563 1.5822533,1.5822533 0 0 0 -0.99414,-1.582028 L 129.35352,80.35168 a 11.0036,11.0036 0 0 0 -4.55079,-0.935547 z" />
<path
class="cls-2"
d="m 174.55595,110.92974 a 1.4878733,1.4878733 0 0 1 -0.0666,0.51631 0.10548355,0.10548355 0 0 1 0,0.0444 l -0.0666,0.17211 v 0 l -0.0666,0.12769 -10.69826,21.15223 c -1.48787,2.93688 -4.22489,2.84806 -3.76409,-0.12214 l 2.15408,-12.02512 c 0.0722,-0.39418 0.70508,-3.17006 1.31022,-5.1798 l -20.64702,6.4456 c -3.24223,21.05785 -30.95109,21.40761 -35.47023,0 l -20.691432,-6.46226 c 0.605143,2.00974 1.243596,4.80228 1.315769,5.19646 l 2.154085,12.02512 c 0.460796,2.9702 -2.276224,3.05902 -3.764098,0.12214 L 75.49024,111.66257 a 0.77169547,0.77169547 0 0 1 -0.06662,-0.17766 1.5989086,1.5989086 0 0 1 0.838317,-2.13743 L 120.47065,89.788613 a 11.0036,11.0036 0 0 1 8.88282,0 l 44.20871,19.558867 a 1.5822533,1.5822533 0 0 1 0.99377,1.58226 z"
id="path24"
style="stroke-width:0.555177" />
<path
class="cls-3"
d="m 139.0413,114.61611 19.11473,-7.69475 a 0.81055784,0.81055784 0 0 0 0,-1.50453 c -2.2207,-0.92714 -4.96328,-1.99308 -7.65033,-3.10899 -0.49411,-0.20541 -5.17425,3.15341 -5.60173,3.49762 l -8.23882,6.58439 c -1.99309,1.67108 -0.26649,3.28665 2.37615,2.22626 z"
id="path26"
style="stroke-width:0.555177" />
<circle
class="cls-3"
cx="125.18409"
cy="122.13319"
r="9.9654207"
id="circle28"
style="stroke-width:0.555177" />
<path
d="m 138.33068,100.19817 a 8.327649,8.327649 0 0 1 -2.77589,-0.288688 l -34.78736,-9.388036 a 8.4442361,8.4442361 0 0 1 -2.620433,-1.238044 z"
id="path6-0"
style="stroke-width:0.555177" />
<path
class="cls-1"
d="m 85.377935,159.27452 5.163143,-0.0333 h 0.06662 q 2.864711,0 2.864711,2.69816 v 8.69407 a 24.849705,24.849705 0 0 1 -8.649651,1.43235 q -4.730105,0 -7.128468,-3.21447 -2.398363,-3.21447 -2.398363,-8.76068 0,-5.55177 2.981299,-8.62745 a 9.7600046,9.7600046 0 0 1 7.29502,-3.08123 13.368653,13.368653 0 0 1 7.811335,2.43167 3.9250986,3.9250986 0 0 1 -0.682867,1.76547 4.7634152,4.7634152 0 0 1 -1.282458,1.33242 9.798867,9.798867 0 0 0 -5.679457,-1.96533 5.3574542,5.3574542 0 0 0 -4.480275,2.04861 q -1.598909,2.03749 -1.598909,6.41229 0,8.22771 6.062529,8.22771 a 16.910679,16.910679 0 0 0 3.697476,-0.43303 v -3.16451 q 0,-1.49898 0.06662,-2.22071 h -2.442777 a 2.2873276,2.2873276 0 0 1 -1.515632,-0.41638 1.6655298,1.6655298 0 0 1 -0.483004,-1.33242 5.7072154,5.7072154 0 0 1 0.333106,-1.79322 z"
id="path8-6"
style="fill:#4e4e4e;fill-opacity:1;stroke-width:0.555177" />
<path
class="cls-1"
d="m 111.07151,169.73404 a 4.3137222,4.3137222 0 0 1 -0.55518,1.18253 4.0305821,4.0305821 0 0 1 -0.84942,0.94935 3.7640973,3.7640973 0 0 1 -3.05902,-1.95422 6.7453957,6.7453957 0 0 1 -4.76342,2.13188 q -2.564913,0 -3.886233,-1.49898 a 5.1298318,5.1298318 0 0 1 -1.299113,-3.4643 q 0,-2.77588 1.815427,-4.21379 a 7.3338829,7.3338829 0 0 1 4.669039,-1.3935 q 1.53228,0 2.89802,0.13325 v -0.99932 q 0,-2.63154 -2.53161,-2.63154 -1.79877,0 -5.096518,1.19918 a 4.674587,4.674587 0 0 1 -1.110353,-2.96464 18.581761,18.581761 0 0 1 7.217291,-1.49898 5.8682167,5.8682167 0 0 1 4.0639,1.39905 q 1.56559,1.39904 1.56559,4.23044 v 6.79537 q -0.0111,1.83208 0.9216,2.59822 z m -8.36096,-0.83276 a 4.7134493,4.7134493 0 0 0 3.33106,-1.59891 v -2.94244 a 22.368065,22.368065 0 0 0 -2.53161,-0.13324 2.775883,2.775883 0 0 0 -2.06525,0.68842 2.3928111,2.3928111 0 0 0 -0.69953,1.76546 2.3539488,2.3539488 0 0 0 0.55518,1.66553 1.8431863,1.8431863 0 0 0 1.41015,0.55518 z"
id="path10-2"
style="fill:#4e4e4e;fill-opacity:1;stroke-width:0.555177" />
<path
class="cls-1"
d="m 113.76966,157.00939 a 3.986168,3.986168 0 0 1 0.55518,-1.21583 3.3310596,3.3310596 0 0 1 0.84942,-0.94935 4.1638245,4.1638245 0 0 1 3.51427,2.96464 q 1.33242,-2.96464 4.29707,-2.96464 a 10.215249,10.215249 0 0 1 1.93201,0.23317 7.4782288,7.4782288 0 0 1 -0.99932,3.88624 8.4497879,8.4497879 0 0 0 -1.49897,-0.19987 q -2.03195,0 -3.26444,2.16519 v 10.64829 a 11.575432,11.575432 0 0 1 -2.03195,0.16655 12.769062,12.769062 0 0 1 -2.09857,-0.16655 v -11.15905 q -0.0222,-2.40947 -1.2547,-3.40879 z"
id="path12-6"
style="fill:#4e4e4e;fill-opacity:1;stroke-width:0.555177" />
<path
class="cls-1"
d="m 140.38483,169.73404 a 4.3137222,4.3137222 0 0 1 -0.58293,1.18253 4.0305821,4.0305821 0 0 1 -0.84942,0.94935 3.7640973,3.7640973 0 0 1 -3.05348,-1.95422 6.7453957,6.7453957 0 0 1 -4.76341,2.13188 q -2.56492,0 -3.88624,-1.49898 a 5.1298318,5.1298318 0 0 1 -1.29911,-3.4643 q 0,-2.77588 1.81543,-4.21379 a 7.3338829,7.3338829 0 0 1 4.64682,-1.4157 q 1.53229,0 2.89803,0.13324 v -0.99932 q 0,-2.63153 -2.53161,-2.63153 -1.79877,0 -5.09652,1.19918 a 4.674587,4.674587 0 0 1 -1.11035,-2.96465 18.581761,18.581761 0 0 1 7.21729,-1.49897 5.8682167,5.8682167 0 0 1 4.0639,1.39904 q 1.56559,1.39905 1.56559,4.23045 v 6.81757 q 0.0333,1.83208 0.96601,2.59822 z m -8.37206,-0.83276 a 4.7134493,4.7134493 0 0 0 3.33106,-1.59891 v -2.94244 a 22.368065,22.368065 0 0 0 -2.53161,-0.13324 2.775883,2.775883 0 0 0 -2.06526,0.69952 2.3928111,2.3928111 0 0 0 -0.69952,1.76546 2.3539488,2.3539488 0 0 0 0.55518,1.66553 1.8431863,1.8431863 0 0 0 1.41015,0.54408 z"
id="path14-1"
style="fill:#4e4e4e;fill-opacity:1;stroke-width:0.555177" />
<path
class="cls-1"
d="m 144.48203,169.6008 q -1.49897,-2.29843 -1.49897,-6.34567 0,-4.04724 1.8987,-6.34567 a 5.740526,5.740526 0 0 1 4.56355,-2.29843 6.4400486,6.4400486 0 0 1 4.49693,1.66553 3.7696491,3.7696491 0 0 1 2.63154,-1.43235 3.1200925,3.1200925 0 0 1 0.88273,0.93269 3.8862362,3.8862362 0 0 1 0.55518,1.16587 q -0.9327,0.79946 -0.9327,2.86472 v 9.438 q 0,5.29638 -1.73215,7.49488 -1.73215,2.1985 -5.69611,2.22071 a 16.100121,16.100121 0 0 1 -5.9626,-1.11036 4.4802752,4.4802752 0 0 1 1.03263,-3.03126 10.892565,10.892565 0 0 0 4.48028,1.03263 q 2.18184,0 3.0146,-1.11035 a 4.9965894,4.9965894 0 0 0 0.83277,-3.06458 v -1.33242 a 6.4011862,6.4011862 0 0 1 -4.16383,1.56559 4.9188647,4.9188647 0 0 1 -4.40255,-2.30953 z m 8.56083,-2.69816 v -7.72806 a 4.2915151,4.2915151 0 0 0 -2.86471,-1.36573 2.4039147,2.4039147 0 0 0 -2.18185,1.43235 8.6885138,8.6885138 0 0 0 -0.7828,4.09721 q 0,2.66485 0.71618,3.93065 a 2.1318781,2.1318781 0 0 0 1.88205,1.2658 4.2304457,4.2304457 0 0 0 3.23113,-1.63222 z"
id="path16-8"
style="fill:#4e4e4e;fill-opacity:1;stroke-width:0.555177" />
<path
class="cls-1"
d="m 174.20619,164.67083 h -9.32697 a 5.6405943,5.6405943 0 0 0 0.88273,3.04792 q 0.7828,1.0826 2.74813,1.0826 a 10.120869,10.120869 0 0 0 4.36369,-1.16587 4.3803434,4.3803434 0 0 1 1.19918,2.5316 10.759323,10.759323 0 0 1 -6.41229,1.8987 q -3.74744,0 -5.37966,-2.43167 -1.63222,-2.43167 -1.63222,-6.2957 0,-3.88624 1.79877,-6.2957 a 6.0181143,6.0181143 0 0 1 5.14649,-2.43168 q 3.33106,0 5.14648,2.01529 a 7.3449864,7.3449864 0 0 1 1.79878,5.07987 13.04665,13.04665 0 0 1 -0.33311,2.96464 z m -6.42895,-7.06184 q -2.73146,0 -2.93133,4.13051 h 5.79605 v -0.39973 a 4.7245529,4.7245529 0 0 0 -0.69953,-2.69816 2.4316735,2.4316735 0 0 0 -2.14298,-1.03262 z"
id="path18-7"
style="fill:#4e4e4e;fill-opacity:1;stroke-width:0.555177" />
<path
id="path24-3-6-9"
style="fill:#ff9329;fill-opacity:1;stroke-width:0.555177"
d="m 124.80273,70.162462 a 11.0036,11.0036 0 0 0 -4.33203,0.935547 L 76.261719,90.656602 a 1.5989086,1.5989086 0 0 0 -0.837891,2.138672 0.77169547,0.77169547 0 0 0 0.06641,0.177735 l 7.09375,14.021481 h 6.15625 l -0.875,-4.88867 c -0.07217,-0.39418 -0.711263,-3.187532 -1.316406,-5.197264 l 20.691398,6.462894 c 0.27198,1.28839 0.63292,2.49204 1.0625,3.62304 h 33.54883 c 0.36964,-1.13128 0.66138,-2.33705 0.85938,-3.62304 l 20.64648,-6.445316 c -0.60514,2.009734 -1.23639,4.785506 -1.30859,5.179686 l -0.875,4.88867 h 6.15429 l 7.02735,-13.894528 0.0664,-0.126953 0.0684,-0.171875 a 0.10548355,0.10548355 0 0 0 0,-0.04492 1.4878733,1.4878733 0 0 0 0.0664,-0.515625 1.5822533,1.5822533 0 0 0 -0.99414,-1.583985 L 129.35352,71.098009 a 11.0036,11.0036 0 0 0 -4.55079,-0.935547 z" />
<path
id="path24-3-2"
style="fill:#4e4e4e;fill-opacity:1;stroke-width:0.555177"
d="M 124.80273,79.416133 A 11.0036,11.0036 0 0 0 120.4707,80.35168 L 76.261719,99.910272 a 1.5989086,1.5989086 0 0 0 -0.837891,2.136718 0.77169547,0.77169547 0 0 0 0.06641,0.17773 l 3.847657,7.60352 h 8.175781 c -0.257897,-1.08856 -0.591943,-2.42953 -0.964844,-3.66797 l 11.744141,3.66797 h 53.371087 l 11.69336,-3.65039 c -0.37193,1.23522 -0.70076,2.56719 -0.95703,3.65039 h 8.17383 l 3.78125,-7.47656 0.0664,-0.12696 0.0684,-0.17187 a 0.10548355,0.10548355 0 0 0 0,-0.0449 1.4878733,1.4878733 0 0 0 0.0664,-0.51563 1.5822533,1.5822533 0 0 0 -0.99414,-1.582028 L 129.35352,80.35168 a 11.0036,11.0036 0 0 0 -4.55079,-0.935547 z" />
<path
class="cls-2"
d="m 174.55595,110.92974 a 1.4878733,1.4878733 0 0 1 -0.0666,0.51631 0.10548355,0.10548355 0 0 1 0,0.0444 l -0.0666,0.17211 v 0 l -0.0666,0.12769 -10.69826,21.15223 c -1.48787,2.93688 -4.22489,2.84806 -3.76409,-0.12214 l 2.15408,-12.02512 c 0.0722,-0.39418 0.70508,-3.17006 1.31022,-5.1798 l -20.64702,6.4456 c -3.24223,21.05785 -30.95109,21.40761 -35.47023,0 l -20.691432,-6.46226 c 0.605143,2.00974 1.243596,4.80228 1.315769,5.19646 l 2.154085,12.02512 c 0.460796,2.9702 -2.276224,3.05902 -3.764098,0.12214 L 75.49024,111.66257 a 0.77169547,0.77169547 0 0 1 -0.06662,-0.17766 1.5989086,1.5989086 0 0 1 0.838317,-2.13743 L 120.47065,89.788613 a 11.0036,11.0036 0 0 1 8.88282,0 l 44.20871,19.558867 a 1.5822533,1.5822533 0 0 1 0.99377,1.58226 z"
id="path24-0"
style="fill:#ff9329;fill-opacity:1;stroke-width:0.555177" />
<path
class="cls-3"
d="m 139.0413,114.61611 19.11473,-7.69475 a 0.81055784,0.81055784 0 0 0 0,-1.50453 c -2.2207,-0.92714 -4.96328,-1.99308 -7.65033,-3.10899 -0.49411,-0.20541 -5.17425,3.15341 -5.60173,3.49762 l -8.23882,6.58439 c -1.99309,1.67108 -0.26649,3.28665 2.37615,2.22626 z"
id="path26-2"
style="fill:#4e4e4e;fill-opacity:1;stroke-width:0.555177" />
<circle
class="cls-3"
cx="125.18409"
cy="122.13319"
r="9.9654207"
id="circle28-3"
style="fill:#4e4e4e;fill-opacity:1;stroke-width:0.555177" />
</g>
</svg>

After

Width:  |  Height:  |  Size: 20 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 15 KiB

View file

@ -1,119 +0,0 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<svg
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:cc="http://creativecommons.org/ns#"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:svg="http://www.w3.org/2000/svg"
xmlns="http://www.w3.org/2000/svg"
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
width="108.2099mm"
height="108.00987mm"
viewBox="0 0 108.2099 108.00987"
version="1.1"
id="svg8"
inkscape:version="1.0.1 (3bc2e813f5, 2020-09-07)"
sodipodi:docname="garage.svg"
inkscape:export-filename="/home/lx/garage.png"
inkscape:export-xdpi="96"
inkscape:export-ydpi="96">
<defs
id="defs2" />
<sodipodi:namedview
id="base"
pagecolor="#ffffff"
bordercolor="#666666"
borderopacity="1.0"
inkscape:pageopacity="1"
inkscape:pageshadow="2"
inkscape:zoom="0.5"
inkscape:cx="-212.52783"
inkscape:cy="204.9547"
inkscape:document-units="mm"
inkscape:current-layer="layer1"
inkscape:document-rotation="0"
showgrid="false"
fit-margin-top="20"
fit-margin-left="20"
fit-margin-right="20"
fit-margin-bottom="20"
inkscape:window-width="1404"
inkscape:window-height="1016"
inkscape:window-x="103"
inkscape:window-y="27"
inkscape:window-maximized="0" />
<metadata
id="metadata5">
<rdf:RDF>
<cc:Work
rdf:about="">
<dc:format>image/svg+xml</dc:format>
<dc:type
rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
<dc:title></dc:title>
</cc:Work>
</rdf:RDF>
</metadata>
<g
inkscape:label="Layer 1"
inkscape:groupmode="layer"
id="layer1"
transform="translate(-45.667412,-33.028536)">
<path
style="fill:none;stroke:#000000;stroke-width:2.065;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
d="M 66.78016,73.340623 99.921832,54.219898 132.84481,73.130965 V 120.00591 H 66.701651 Z"
id="path124"
sodipodi:nodetypes="cccccc" />
<g
id="g1106-5"
transform="matrix(0,0.95201267,-0.95201267,0,194.01664,-65.058377)"
style="stroke-width:2.17959;stroke-miterlimit:4;stroke-dasharray:none">
<g
id="g1061-3"
style="stroke-width:2.17959;stroke-miterlimit:4;stroke-dasharray:none">
<circle
style="fill:none;stroke:#000000;stroke-width:2.17959;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
id="path956-5"
cx="168.8569"
cy="92.889587"
r="13.125794" />
<circle
style="fill:none;stroke:#000000;stroke-width:2.17959;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
id="path958-6"
cx="168.77444"
cy="92.702293"
r="3.0778286" />
<path
id="path960-2"
style="fill:none;stroke:#000000;stroke-width:2.17959;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
d="m 169.46072,82.84435 c 4.95795,0.336608 8.87296,4.341959 9.09638,9.306301"
sodipodi:nodetypes="cc" />
</g>
<path
style="fill:none;stroke:#000000;stroke-width:2.17959;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
d="m 154.67824,112.84018 11.89881,-13.038071 c 1.46407,-1.552664 3.79541,0.878511 2.81832,2.089181 l -10.57965,14.481 c -1.8851,2.02632 -6.10786,-1.06119 -4.13748,-3.53211 z"
id="path964-9"
sodipodi:nodetypes="ccccc" />
<g
id="g1071-1"
style="stroke-width:2.17959;stroke-miterlimit:4;stroke-dasharray:none" />
<g
id="g1065-3"
style="stroke-width:2.17959;stroke-miterlimit:4;stroke-dasharray:none">
<rect
style="fill:none;stroke:#000000;stroke-width:2.17959;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
id="rect949-6"
width="35.576611"
height="48.507355"
x="150.9623"
y="74.698929"
ry="2.7302756" />
<path
style="fill:none;stroke:#000000;stroke-width:2.17959;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
d="m 150.76919,106.16944 6.36181,-0.0223 c 2.53845,3.46232 6.29787,4.20243 10.1055,4.40362 l 0.0176,13.09251"
id="path1033-0"
sodipodi:nodetypes="cccc" />
</g>
</g>
</g>
</svg>

Before

Width:  |  Height:  |  Size: 4.5 KiB

View file

@ -11,7 +11,7 @@ PATH="${GARAGE_DEBUG}:${GARAGE_RELEASE}:$PATH"
garage bucket create eprouvette garage bucket create eprouvette
KEY_INFO=`garage key new --name opérateur` KEY_INFO=`garage key new --name opérateur`
ACCESS_KEY=`echo $KEY_INFO|grep -Po 'GK[a-f0-9]+'` ACCESS_KEY=`echo $KEY_INFO|grep -Po 'GK[a-f0-9]+'`
SECRET_KEY=`echo $KEY_INFO|grep -Po 'secret_key: "[a-f0-9]+'|grep -Po '[a-f0-9]+$'` SECRET_KEY=`echo $KEY_INFO|grep -Po 'Secret key: [a-f0-9]+'|grep -Po '[a-f0-9]+$'`
garage bucket allow eprouvette --read --write --key $ACCESS_KEY garage bucket allow eprouvette --read --write --key $ACCESS_KEY
echo "$ACCESS_KEY $SECRET_KEY" > /tmp/garage.s3 echo "$ACCESS_KEY $SECRET_KEY" > /tmp/garage.s3

View file

@ -17,26 +17,25 @@ garage_util = { version = "0.1.1", path = "../util" }
garage_table = { version = "0.1.1", path = "../table" } garage_table = { version = "0.1.1", path = "../table" }
garage_model = { version = "0.1.1", path = "../model" } garage_model = { version = "0.1.1", path = "../model" }
err-derive = "0.2.3" err-derive = "0.3"
bytes = "0.4" bytes = "1.0"
hex = "0.3" hex = "0.4"
base64 = "0.13" base64 = "0.13"
log = "0.4" log = "0.4"
chrono = "0.4" chrono = "0.4"
md-5 = "0.9.1" md-5 = "0.9"
sha2 = "0.8" sha2 = "0.9"
hmac = "0.7" hmac = "0.10"
crypto-mac = "0.7" crypto-mac = "0.10"
rand = "0.7"
futures = "0.3" futures = "0.3"
futures-util = "0.3" futures-util = "0.3"
tokio = { version = "0.2", default-features = false, features = ["rt-core", "rt-threaded", "io-driver", "net", "tcp", "time", "macros", "sync", "signal", "fs"] } tokio = { version = "1.0", default-features = false, features = ["rt", "rt-multi-thread", "io-util", "net", "time", "macros", "sync", "signal", "fs"] }
http = "0.2" http = "0.2"
hyper = "^0.13.6" hyper = "0.14"
url = "2.1" url = "2.1"
httpdate = "0.3" httpdate = "0.3"
percent-encoding = "2.1.0" percent-encoding = "2.1.0"
roxmltree = "0.11" roxmltree = "0.14"
http-range = "0.1" http-range = "0.1"

View file

@ -137,7 +137,10 @@ async fn handler_inner(garage: Arc<Garage>, req: Request<Body>) -> Result<Respon
))); )));
} }
let source_key = source_key.ok_or_bad_request("No source key specified")?; let source_key = source_key.ok_or_bad_request("No source key specified")?;
Ok(handle_copy(garage, &bucket, &key, &source_bucket, &source_key).await?) Ok(
handle_copy(garage, &req, &bucket, &key, &source_bucket, &source_key)
.await?,
)
} else { } else {
// PutObject query // PutObject query
Ok(handle_put(garage, req, &bucket, &key, content_sha256).await?) Ok(handle_put(garage, req, &bucket, &key, content_sha256).await?)

View file

@ -33,7 +33,7 @@ pub enum Error {
InvalidBase64(#[error(source)] base64::DecodeError), InvalidBase64(#[error(source)] base64::DecodeError),
#[error(display = "Invalid XML: {}", _0)] #[error(display = "Invalid XML: {}", _0)]
InvalidXML(#[error(source)] roxmltree::Error), InvalidXML(String),
#[error(display = "Invalid header value: {}", _0)] #[error(display = "Invalid header value: {}", _0)]
InvalidHeader(#[error(source)] hyper::header::ToStrError), InvalidHeader(#[error(source)] hyper::header::ToStrError),
@ -45,6 +45,12 @@ pub enum Error {
BadRequest(String), BadRequest(String),
} }
impl From<roxmltree::Error> for Error {
fn from(err: roxmltree::Error) -> Self {
Self::InvalidXML(format!("{}", err))
}
}
impl Error { impl Error {
pub fn http_status_code(&self) -> StatusCode { pub fn http_status_code(&self) -> StatusCode {
match self { match self {

View file

@ -1,11 +1,11 @@
use std::fmt::Write; use std::fmt::Write;
use std::sync::Arc; use std::sync::Arc;
use chrono::{SecondsFormat, Utc}; use hyper::{Body, Request, Response};
use hyper::{Body, Response};
use garage_table::*; use garage_table::*;
use garage_util::data::*; use garage_util::data::*;
use garage_util::time::*;
use garage_model::block_ref_table::*; use garage_model::block_ref_table::*;
use garage_model::garage::Garage; use garage_model::garage::Garage;
@ -13,9 +13,11 @@ use garage_model::object_table::*;
use garage_model::version_table::*; use garage_model::version_table::*;
use crate::error::*; use crate::error::*;
use crate::s3_put::get_headers;
pub async fn handle_copy( pub async fn handle_copy(
garage: Arc<Garage>, garage: Arc<Garage>,
req: &Request<Body>,
dest_bucket: &str, dest_bucket: &str,
dest_key: &str, dest_key: &str,
source_bucket: &str, source_bucket: &str,
@ -41,17 +43,37 @@ pub async fn handle_copy(
}; };
let new_uuid = gen_uuid(); let new_uuid = gen_uuid();
let dest_object_version = ObjectVersion { let new_timestamp = now_msec();
uuid: new_uuid,
timestamp: now_msec(),
state: ObjectVersionState::Complete(source_last_state.clone()),
};
match &source_last_state { // Implement x-amz-metadata-directive: REPLACE
let old_meta = match source_last_state {
ObjectVersionData::DeleteMarker => { ObjectVersionData::DeleteMarker => {
return Err(Error::NotFound); return Err(Error::NotFound);
} }
ObjectVersionData::Inline(_meta, _bytes) => { ObjectVersionData::Inline(meta, _bytes) => meta,
ObjectVersionData::FirstBlock(meta, _fbh) => meta,
};
let new_meta = match req.headers().get("x-amz-metadata-directive") {
Some(v) if v == hyper::header::HeaderValue::from_static("REPLACE") => ObjectVersionMeta {
headers: get_headers(req)?,
size: old_meta.size,
etag: old_meta.etag.clone(),
},
_ => old_meta.clone(),
};
// Save object copy
match source_last_state {
ObjectVersionData::DeleteMarker => unreachable!(),
ObjectVersionData::Inline(_meta, bytes) => {
let dest_object_version = ObjectVersion {
uuid: new_uuid,
timestamp: new_timestamp,
state: ObjectVersionState::Complete(ObjectVersionData::Inline(
new_meta,
bytes.clone(),
)),
};
let dest_object = Object::new( let dest_object = Object::new(
dest_bucket.to_string(), dest_bucket.to_string(),
dest_key.to_string(), dest_key.to_string(),
@ -59,44 +81,84 @@ pub async fn handle_copy(
); );
garage.object_table.insert(&dest_object).await?; garage.object_table.insert(&dest_object).await?;
} }
ObjectVersionData::FirstBlock(_meta, _first_block_hash) => { ObjectVersionData::FirstBlock(_meta, first_block_hash) => {
// Get block list from source version
let source_version = garage let source_version = garage
.version_table .version_table
.get(&source_last_v.uuid, &EmptyKey) .get(&source_last_v.uuid, &EmptyKey)
.await?; .await?;
let source_version = source_version.ok_or(Error::NotFound)?; let source_version = source_version.ok_or(Error::NotFound)?;
let dest_version = Version::new( // Write an "uploading" marker in Object table
// This holds a reference to the object in the Version table
// so that it won't be deleted, e.g. by repair_versions.
let tmp_dest_object_version = ObjectVersion {
uuid: new_uuid,
timestamp: new_timestamp,
state: ObjectVersionState::Uploading(new_meta.headers.clone()),
};
let tmp_dest_object = Object::new(
dest_bucket.to_string(),
dest_key.to_string(),
vec![tmp_dest_object_version],
);
garage.object_table.insert(&tmp_dest_object).await?;
// Write version in the version table. Even with empty block list,
// this means that the BlockRef entries linked to this version cannot be
// marked as deleted (they are marked as deleted only if the Version
// doesn't exist or is marked as deleted).
let mut dest_version = Version::new(
new_uuid, new_uuid,
dest_bucket.to_string(), dest_bucket.to_string(),
dest_key.to_string(), dest_key.to_string(),
false, false,
source_version.blocks().to_vec(),
); );
garage.version_table.insert(&dest_version).await?;
// Fill in block list for version and insert block refs
for (bk, bv) in source_version.blocks.items().iter() {
dest_version.blocks.put(*bk, *bv);
}
let dest_block_refs = dest_version
.blocks
.items()
.iter()
.map(|b| BlockRef {
block: b.1.hash,
version: new_uuid,
deleted: false.into(),
})
.collect::<Vec<_>>();
futures::try_join!(
garage.version_table.insert(&dest_version),
garage.block_ref_table.insert_many(&dest_block_refs[..]),
)?;
// Insert final object
// We do this last because otherwise there is a race condition in the case where
// the copy call has the same source and destination (this happens, rclone does
// it to update the modification timestamp for instance). If we did this concurrently
// with the stuff before, the block's reference counts could be decremented before
// they are incremented again for the new version, leading to data being deleted.
let dest_object_version = ObjectVersion {
uuid: new_uuid,
timestamp: new_timestamp,
state: ObjectVersionState::Complete(ObjectVersionData::FirstBlock(
new_meta,
*first_block_hash,
)),
};
let dest_object = Object::new( let dest_object = Object::new(
dest_bucket.to_string(), dest_bucket.to_string(),
dest_key.to_string(), dest_key.to_string(),
vec![dest_object_version], vec![dest_object_version],
); );
let dest_block_refs = dest_version garage.object_table.insert(&dest_object).await?;
.blocks()
.iter()
.map(|b| BlockRef {
block: b.hash,
version: new_uuid,
deleted: false,
})
.collect::<Vec<_>>();
futures::try_join!(
garage.object_table.insert(&dest_object),
garage.version_table.insert(&dest_version),
garage.block_ref_table.insert_many(&dest_block_refs[..]),
)?;
} }
} }
let now = Utc::now(); let last_modified = msec_to_rfc3339(new_timestamp);
let last_modified = now.to_rfc3339_opts(SecondsFormat::Secs, true);
let mut xml = String::new(); let mut xml = String::new();
writeln!(&mut xml, r#"<?xml version="1.0" encoding="UTF-8"?>"#).unwrap(); writeln!(&mut xml, r#"<?xml version="1.0" encoding="UTF-8"?>"#).unwrap();
writeln!(&mut xml, r#"<CopyObjectResult>"#).unwrap(); writeln!(&mut xml, r#"<CopyObjectResult>"#).unwrap();

View file

@ -4,6 +4,7 @@ use std::sync::Arc;
use hyper::{Body, Request, Response}; use hyper::{Body, Request, Response};
use garage_util::data::*; use garage_util::data::*;
use garage_util::time::*;
use garage_model::garage::Garage; use garage_model::garage::Garage;
use garage_model::object_table::*; use garage_model::object_table::*;
@ -29,16 +30,16 @@ async fn handle_delete_internal(
_ => true, _ => true,
}); });
let mut must_delete = None; let mut version_to_delete = None;
let mut timestamp = now_msec(); let mut timestamp = now_msec();
for v in interesting_versions { for v in interesting_versions {
if v.timestamp + 1 > timestamp || must_delete.is_none() { if v.timestamp + 1 > timestamp || version_to_delete.is_none() {
must_delete = Some(v.uuid); version_to_delete = Some(v.uuid);
} }
timestamp = std::cmp::max(timestamp, v.timestamp + 1); timestamp = std::cmp::max(timestamp, v.timestamp + 1);
} }
let deleted_version = must_delete.ok_or(Error::NotFound)?; let deleted_version = version_to_delete.ok_or(Error::NotFound)?;
let version_uuid = gen_uuid(); let version_uuid = gen_uuid();
@ -47,7 +48,7 @@ async fn handle_delete_internal(
key.into(), key.into(),
vec![ObjectVersion { vec![ObjectVersion {
uuid: version_uuid, uuid: version_uuid,
timestamp: now_msec(), timestamp,
state: ObjectVersionState::Complete(ObjectVersionData::DeleteMarker), state: ObjectVersionState::Complete(ObjectVersionData::DeleteMarker),
}], }],
); );

View file

@ -146,9 +146,10 @@ pub async fn handle_get(
let version = version.ok_or(Error::NotFound)?; let version = version.ok_or(Error::NotFound)?;
let mut blocks = version let mut blocks = version
.blocks() .blocks
.items()
.iter() .iter()
.map(|vb| (vb.hash, None)) .map(|(_, vb)| (vb.hash, None))
.collect::<Vec<_>>(); .collect::<Vec<_>>();
blocks[0].1 = Some(first_block); blocks[0].1 = Some(first_block);
@ -219,11 +220,12 @@ pub async fn handle_get_range(
// file (whereas block.offset designates the offset of the block WITHIN THE PART // file (whereas block.offset designates the offset of the block WITHIN THE PART
// block.part_number, which is not the same in the case of a multipart upload) // block.part_number, which is not the same in the case of a multipart upload)
let mut blocks = Vec::with_capacity(std::cmp::min( let mut blocks = Vec::with_capacity(std::cmp::min(
version.blocks().len(), version.blocks.len(),
4 + ((end - begin) / std::cmp::max(version.blocks()[0].size as u64, 1024)) as usize, 4 + ((end - begin) / std::cmp::max(version.blocks.items()[0].1.size as u64, 1024))
as usize,
)); ));
let mut true_offset = 0; let mut true_offset = 0;
for b in version.blocks().iter() { for (_, b) in version.blocks.items().iter() {
if true_offset >= end { if true_offset >= end {
break; break;
} }

View file

@ -2,10 +2,10 @@ use std::collections::{BTreeMap, BTreeSet, HashMap};
use std::fmt::Write; use std::fmt::Write;
use std::sync::Arc; use std::sync::Arc;
use chrono::{DateTime, NaiveDateTime, SecondsFormat, Utc};
use hyper::{Body, Response}; use hyper::{Body, Response};
use garage_util::error::Error as GarageError; use garage_util::error::Error as GarageError;
use garage_util::time::*;
use garage_model::garage::Garage; use garage_model::garage::Garage;
use garage_model::object_table::*; use garage_model::object_table::*;
@ -42,7 +42,7 @@ pub fn parse_list_objects_query(
Ok(ListObjectsQuery { Ok(ListObjectsQuery {
is_v2: params.get("list-type").map(|x| x == "2").unwrap_or(false), is_v2: params.get("list-type").map(|x| x == "2").unwrap_or(false),
bucket: bucket.to_string(), bucket: bucket.to_string(),
delimiter: params.get("delimiter").cloned(), delimiter: params.get("delimiter").filter(|x| !x.is_empty()).cloned(),
max_keys: params max_keys: params
.get("max-keys") .get("max-keys")
.map(|x| { .map(|x| {
@ -247,9 +247,7 @@ pub async fn handle_list(
} }
for (key, info) in result_keys.iter() { for (key, info) in result_keys.iter() {
let last_modif = NaiveDateTime::from_timestamp(info.last_modified as i64 / 1000, 0); let last_modif = msec_to_rfc3339(info.last_modified);
let last_modif = DateTime::<Utc>::from_utc(last_modif, Utc);
let last_modif = last_modif.to_rfc3339_opts(SecondsFormat::Millis, true);
writeln!(&mut xml, "\t<Contents>").unwrap(); writeln!(&mut xml, "\t<Contents>").unwrap();
writeln!( writeln!(
&mut xml, &mut xml,

View file

@ -5,11 +5,12 @@ use std::sync::Arc;
use futures::stream::*; use futures::stream::*;
use hyper::{Body, Request, Response}; use hyper::{Body, Request, Response};
use md5::{digest::generic_array::*, Digest as Md5Digest, Md5}; use md5::{digest::generic_array::*, Digest as Md5Digest, Md5};
use sha2::{Digest as Sha256Digest, Sha256}; use sha2::Sha256;
use garage_table::*; use garage_table::*;
use garage_util::data::*; use garage_util::data::*;
use garage_util::error::Error as GarageError; use garage_util::error::Error as GarageError;
use garage_util::time::*;
use garage_model::block::INLINE_THRESHOLD; use garage_model::block::INLINE_THRESHOLD;
use garage_model::block_ref_table::*; use garage_model::block_ref_table::*;
@ -52,14 +53,14 @@ pub async fn handle_put(
if first_block.len() < INLINE_THRESHOLD { if first_block.len() < INLINE_THRESHOLD {
let mut md5sum = Md5::new(); let mut md5sum = Md5::new();
md5sum.update(&first_block[..]); md5sum.update(&first_block[..]);
let md5sum_arr = md5sum.finalize(); let data_md5sum = md5sum.finalize();
let md5sum_hex = hex::encode(md5sum_arr); let data_md5sum_hex = hex::encode(data_md5sum);
let sha256sum_hash = sha256sum(&first_block[..]); let data_sha256sum = sha256sum(&first_block[..]);
ensure_checksum_matches( ensure_checksum_matches(
md5sum_arr.as_slice(), data_md5sum.as_slice(),
sha256sum_hash, data_sha256sum,
content_md5.as_deref(), content_md5.as_deref(),
content_sha256, content_sha256,
)?; )?;
@ -71,7 +72,7 @@ pub async fn handle_put(
ObjectVersionMeta { ObjectVersionMeta {
headers, headers,
size: first_block.len() as u64, size: first_block.len() as u64,
etag: md5sum_hex.clone(), etag: data_md5sum_hex.clone(),
}, },
first_block, first_block,
)), )),
@ -80,41 +81,45 @@ pub async fn handle_put(
let object = Object::new(bucket.into(), key.into(), vec![object_version]); let object = Object::new(bucket.into(), key.into(), vec![object_version]);
garage.object_table.insert(&object).await?; garage.object_table.insert(&object).await?;
return Ok(put_response(version_uuid, md5sum_hex)); return Ok(put_response(version_uuid, data_md5sum_hex));
} }
// Write version identifier in object table so that we have a trace // Write version identifier in object table so that we have a trace
// that we are uploading something // that we are uploading something
let mut object_version = ObjectVersion { let mut object_version = ObjectVersion {
uuid: version_uuid, uuid: version_uuid,
timestamp: now_msec(), timestamp: version_timestamp,
state: ObjectVersionState::Uploading(headers.clone()), state: ObjectVersionState::Uploading(headers.clone()),
}; };
let object = Object::new(bucket.into(), key.into(), vec![object_version.clone()]); let object = Object::new(bucket.into(), key.into(), vec![object_version.clone()]);
garage.object_table.insert(&object).await?; garage.object_table.insert(&object).await?;
// Initialize corresponding entry in version table // Initialize corresponding entry in version table
let version = Version::new(version_uuid, bucket.into(), key.into(), false, vec![]); // Write this entry now, even with empty block list,
let first_block_hash = sha256sum(&first_block[..]); // to prevent block_ref entries from being deleted (they can be deleted
// if the reference a version that isn't found in the version table)
let version = Version::new(version_uuid, bucket.into(), key.into(), false);
garage.version_table.insert(&version).await?;
// Transfer data and verify checksum // Transfer data and verify checksum
let first_block_hash = blake2sum(&first_block[..]);
let tx_result = read_and_put_blocks( let tx_result = read_and_put_blocks(
&garage, &garage,
version, &version,
1, 1,
first_block, first_block,
first_block_hash, first_block_hash,
&mut chunker, &mut chunker,
) )
.await .await
.and_then(|(total_size, md5sum_arr, sha256sum)| { .and_then(|(total_size, data_md5sum, data_sha256sum)| {
ensure_checksum_matches( ensure_checksum_matches(
md5sum_arr.as_slice(), data_md5sum.as_slice(),
sha256sum, data_sha256sum,
content_md5.as_deref(), content_md5.as_deref(),
content_sha256, content_sha256,
) )
.map(|()| (total_size, md5sum_arr)) .map(|()| (total_size, data_md5sum))
}); });
// If something went wrong, clean up // If something went wrong, clean up
@ -148,13 +153,13 @@ pub async fn handle_put(
/// Validate MD5 sum against content-md5 header /// Validate MD5 sum against content-md5 header
/// and sha256sum against signed content-sha256 /// and sha256sum against signed content-sha256
fn ensure_checksum_matches( fn ensure_checksum_matches(
md5sum: &[u8], data_md5sum: &[u8],
sha256sum: garage_util::data::FixedBytes32, data_sha256sum: garage_util::data::FixedBytes32,
content_md5: Option<&str>, content_md5: Option<&str>,
content_sha256: Option<garage_util::data::FixedBytes32>, content_sha256: Option<garage_util::data::FixedBytes32>,
) -> Result<(), Error> { ) -> Result<(), Error> {
if let Some(expected_sha256) = content_sha256 { if let Some(expected_sha256) = content_sha256 {
if expected_sha256 != sha256sum { if expected_sha256 != data_sha256sum {
return Err(Error::BadRequest(format!( return Err(Error::BadRequest(format!(
"Unable to validate x-amz-content-sha256" "Unable to validate x-amz-content-sha256"
))); )));
@ -163,7 +168,7 @@ fn ensure_checksum_matches(
} }
} }
if let Some(expected_md5) = content_md5 { if let Some(expected_md5) = content_md5 {
if expected_md5.trim_matches('"') != base64::encode(md5sum) { if expected_md5.trim_matches('"') != base64::encode(data_md5sum) {
return Err(Error::BadRequest(format!("Unable to validate content-md5"))); return Err(Error::BadRequest(format!("Unable to validate content-md5")));
} else { } else {
trace!("Successfully validated content-md5"); trace!("Successfully validated content-md5");
@ -173,8 +178,8 @@ fn ensure_checksum_matches(
} }
async fn read_and_put_blocks( async fn read_and_put_blocks(
garage: &Arc<Garage>, garage: &Garage,
version: Version, version: &Version,
part_number: u64, part_number: u64,
first_block: Vec<u8>, first_block: Vec<u8>,
first_block_hash: Hash, first_block_hash: Hash,
@ -183,11 +188,11 @@ async fn read_and_put_blocks(
let mut md5hasher = Md5::new(); let mut md5hasher = Md5::new();
let mut sha256hasher = Sha256::new(); let mut sha256hasher = Sha256::new();
md5hasher.update(&first_block[..]); md5hasher.update(&first_block[..]);
sha256hasher.input(&first_block[..]); sha256hasher.update(&first_block[..]);
let mut next_offset = first_block.len(); let mut next_offset = first_block.len();
let mut put_curr_version_block = put_block_meta( let mut put_curr_version_block = put_block_meta(
garage.clone(), &garage,
&version, &version,
part_number, part_number,
0, 0,
@ -203,11 +208,11 @@ async fn read_and_put_blocks(
futures::try_join!(put_curr_block, put_curr_version_block, chunker.next())?; futures::try_join!(put_curr_block, put_curr_version_block, chunker.next())?;
if let Some(block) = next_block { if let Some(block) = next_block {
md5hasher.update(&block[..]); md5hasher.update(&block[..]);
sha256hasher.input(&block[..]); sha256hasher.update(&block[..]);
let block_hash = sha256sum(&block[..]); let block_hash = blake2sum(&block[..]);
let block_len = block.len(); let block_len = block.len();
put_curr_version_block = put_block_meta( put_curr_version_block = put_block_meta(
garage.clone(), &garage,
&version, &version,
part_number, part_number,
next_offset as u64, next_offset as u64,
@ -222,39 +227,35 @@ async fn read_and_put_blocks(
} }
let total_size = next_offset as u64; let total_size = next_offset as u64;
let md5sum_arr = md5hasher.finalize(); let data_md5sum = md5hasher.finalize();
let sha256sum_arr = sha256hasher.result(); let data_sha256sum = sha256hasher.finalize();
let mut hash = [0u8; 32]; let data_sha256sum = Hash::try_from(&data_sha256sum[..]).unwrap();
hash.copy_from_slice(&sha256sum_arr[..]);
let sha256sum_arr = Hash::from(hash);
Ok((total_size, md5sum_arr, sha256sum_arr)) Ok((total_size, data_md5sum, data_sha256sum))
} }
async fn put_block_meta( async fn put_block_meta(
garage: Arc<Garage>, garage: &Garage,
version: &Version, version: &Version,
part_number: u64, part_number: u64,
offset: u64, offset: u64,
hash: Hash, hash: Hash,
size: u64, size: u64,
) -> Result<(), GarageError> { ) -> Result<(), GarageError> {
// TODO: don't clone, restart from empty block list ??
let mut version = version.clone(); let mut version = version.clone();
version version.blocks.put(
.add_block(VersionBlock { VersionBlockKey {
part_number, part_number,
offset, offset,
hash, },
size, VersionBlock { hash, size },
}) );
.unwrap();
let block_ref = BlockRef { let block_ref = BlockRef {
block: hash, block: hash,
version: version.uuid, version: version.uuid,
deleted: false, deleted: false.into(),
}; };
futures::try_join!( futures::try_join!(
@ -319,6 +320,7 @@ pub async fn handle_create_multipart_upload(
let version_uuid = gen_uuid(); let version_uuid = gen_uuid();
let headers = get_headers(req)?; let headers = get_headers(req)?;
// Create object in object table
let object_version = ObjectVersion { let object_version = ObjectVersion {
uuid: version_uuid, uuid: version_uuid,
timestamp: now_msec(), timestamp: now_msec(),
@ -327,6 +329,14 @@ pub async fn handle_create_multipart_upload(
let object = Object::new(bucket.to_string(), key.to_string(), vec![object_version]); let object = Object::new(bucket.to_string(), key.to_string(), vec![object_version]);
garage.object_table.insert(&object).await?; garage.object_table.insert(&object).await?;
// Insert empty version so that block_ref entries refer to something
// (they are inserted concurrently with blocks in the version table, so
// there is the possibility that they are inserted before the version table
// is created, in which case it is allowed to delete them, e.g. in repair_*)
let version = Version::new(version_uuid, bucket.into(), key.into(), false);
garage.version_table.insert(&version).await?;
// Send success response
let mut xml = String::new(); let mut xml = String::new();
writeln!(&mut xml, r#"<?xml version="1.0" encoding="UTF-8"?>"#).unwrap(); writeln!(&mut xml, r#"<?xml version="1.0" encoding="UTF-8"?>"#).unwrap();
writeln!( writeln!(
@ -389,11 +399,11 @@ pub async fn handle_put_part(
} }
// Copy block to store // Copy block to store
let version = Version::new(version_uuid, bucket, key, false, vec![]); let version = Version::new(version_uuid, bucket, key, false);
let first_block_hash = sha256sum(&first_block[..]); let first_block_hash = blake2sum(&first_block[..]);
let (_, md5sum_arr, sha256sum) = read_and_put_blocks( let (_, data_md5sum, data_sha256sum) = read_and_put_blocks(
&garage, &garage,
version, &version,
part_number, part_number,
first_block, first_block,
first_block_hash, first_block_hash,
@ -401,15 +411,24 @@ pub async fn handle_put_part(
) )
.await?; .await?;
// Verify that checksums map
ensure_checksum_matches( ensure_checksum_matches(
md5sum_arr.as_slice(), data_md5sum.as_slice(),
sha256sum, data_sha256sum,
content_md5.as_deref(), content_md5.as_deref(),
content_sha256, content_sha256,
)?; )?;
// Store part etag in version
let data_md5sum_hex = hex::encode(data_md5sum);
let mut version = version;
version
.parts_etags
.put(part_number, data_md5sum_hex.clone());
garage.version_table.insert(&version).await?;
let response = Response::builder() let response = Response::builder()
.header("ETag", format!("\"{}\"", hex::encode(md5sum_arr))) .header("ETag", format!("\"{}\"", data_md5sum_hex))
.body(Body::from(vec![])) .body(Body::from(vec![]))
.unwrap(); .unwrap();
Ok(response) Ok(response)
@ -444,17 +463,15 @@ pub async fn handle_complete_multipart_upload(
)?; )?;
let object = object.ok_or(Error::BadRequest(format!("Object not found")))?; let object = object.ok_or(Error::BadRequest(format!("Object not found")))?;
let object_version = object let mut object_version = object
.versions() .versions()
.iter() .iter()
.find(|v| v.uuid == version_uuid && v.is_uploading()); .find(|v| v.uuid == version_uuid && v.is_uploading())
let mut object_version = match object_version { .cloned()
None => return Err(Error::NotFound), .ok_or(Error::BadRequest(format!("Version not found")))?;
Some(x) => x.clone(),
};
let version = version.ok_or(Error::BadRequest(format!("Version not found")))?; let version = version.ok_or(Error::BadRequest(format!("Version not found")))?;
if version.blocks().len() == 0 { if version.blocks.len() == 0 {
return Err(Error::BadRequest(format!("No data was uploaded"))); return Err(Error::BadRequest(format!("No data was uploaded")));
} }
@ -464,53 +481,50 @@ pub async fn handle_complete_multipart_upload(
}; };
// Check that the list of parts they gave us corresponds to the parts we have here // Check that the list of parts they gave us corresponds to the parts we have here
// TODO: check MD5 sum of all uploaded parts? but that would mean we have to store them somewhere... debug!("Expected parts from request: {:?}", body_list_of_parts);
let mut parts = version debug!("Parts stored in version: {:?}", version.parts_etags.items());
.blocks() let parts = version
.parts_etags
.items()
.iter() .iter()
.map(|x| x.part_number) .map(|pair| (&pair.0, &pair.1));
.collect::<Vec<_>>();
parts.dedup();
let same_parts = body_list_of_parts let same_parts = body_list_of_parts
.iter() .iter()
.map(|x| &x.part_number) .map(|x| (&x.part_number, &x.etag))
.eq(parts.iter()); .eq(parts);
if !same_parts { if !same_parts {
return Err(Error::BadRequest(format!("We don't have the same parts"))); return Err(Error::BadRequest(format!("We don't have the same parts")));
} }
// ETag calculation: we produce ETags that have the same form as // Calculate etag of final object
// those of S3 multipart uploads, but we don't use their actual // To understand how etags are calculated, read more here:
// calculation for the first part (we use random bytes). This // https://teppen.io/2018/06/23/aws_s3_etags/
// shouldn't impact compatibility as the S3 docs specify that let num_parts = version.blocks.items().last().unwrap().0.part_number
// the ETag is an opaque value in case of a multipart upload. - version.blocks.items().first().unwrap().0.part_number
// See also: https://teppen.io/2018/06/23/aws_s3_etags/
let num_parts = version.blocks().last().unwrap().part_number
- version.blocks().first().unwrap().part_number
+ 1; + 1;
let etag = format!( let mut etag_md5_hasher = Md5::new();
"{}-{}", for (_, etag) in version.parts_etags.items().iter() {
hex::encode(&rand::random::<[u8; 16]>()[..]), etag_md5_hasher.update(etag.as_bytes());
num_parts }
); let etag = format!("{}-{}", hex::encode(etag_md5_hasher.finalize()), num_parts);
let total_size = version // Calculate total size of final object
.blocks() let total_size = version.blocks.items().iter().map(|x| x.1.size).sum();
.iter()
.map(|x| x.size) // Write final object version
.fold(0, |x, y| x + y);
object_version.state = ObjectVersionState::Complete(ObjectVersionData::FirstBlock( object_version.state = ObjectVersionState::Complete(ObjectVersionData::FirstBlock(
ObjectVersionMeta { ObjectVersionMeta {
headers, headers,
size: total_size, size: total_size,
etag: etag, etag,
}, },
version.blocks()[0].hash, version.blocks.items()[0].1.hash,
)); ));
let final_object = Object::new(bucket.clone(), key.clone(), vec![object_version]); let final_object = Object::new(bucket.clone(), key.clone(), vec![object_version]);
garage.object_table.insert(&final_object).await?; garage.object_table.insert(&final_object).await?;
// Send response saying ok we're done
let mut xml = String::new(); let mut xml = String::new();
writeln!(&mut xml, r#"<?xml version="1.0" encoding="UTF-8"?>"#).unwrap(); writeln!(&mut xml, r#"<?xml version="1.0" encoding="UTF-8"?>"#).unwrap();
writeln!( writeln!(
@ -570,17 +584,19 @@ fn get_mime_type(req: &Request<Body>) -> Result<String, Error> {
.to_string()) .to_string())
} }
fn get_headers(req: &Request<Body>) -> Result<ObjectVersionHeaders, Error> { pub(crate) fn get_headers(req: &Request<Body>) -> Result<ObjectVersionHeaders, Error> {
let content_type = get_mime_type(req)?; let content_type = get_mime_type(req)?;
let other_headers = vec![ let mut other = BTreeMap::new();
// Preserve standard headers
let standard_header = vec![
hyper::header::CACHE_CONTROL, hyper::header::CACHE_CONTROL,
hyper::header::CONTENT_DISPOSITION, hyper::header::CONTENT_DISPOSITION,
hyper::header::CONTENT_ENCODING, hyper::header::CONTENT_ENCODING,
hyper::header::CONTENT_LANGUAGE, hyper::header::CONTENT_LANGUAGE,
hyper::header::EXPIRES, hyper::header::EXPIRES,
]; ];
let mut other = BTreeMap::new(); for h in standard_header.iter() {
for h in other_headers.iter() {
if let Some(v) = req.headers().get(h) { if let Some(v) = req.headers().get(h) {
match v.to_str() { match v.to_str() {
Ok(v_str) => { Ok(v_str) => {
@ -592,6 +608,21 @@ fn get_headers(req: &Request<Body>) -> Result<ObjectVersionHeaders, Error> {
} }
} }
} }
// Preserve x-amz-meta- headers
for (k, v) in req.headers().iter() {
if k.as_str().starts_with("x-amz-meta-") {
match v.to_str() {
Ok(v_str) => {
other.insert(k.to_string(), v_str.to_string());
}
Err(e) => {
warn!("Discarding header {}, error in .to_str(): {}", k, e);
}
}
}
}
Ok(ObjectVersionHeaders { Ok(ObjectVersionHeaders {
content_type, content_type,
other, other,

View file

@ -1,7 +1,7 @@
use std::collections::HashMap; use std::collections::HashMap;
use chrono::{DateTime, Duration, NaiveDateTime, Utc}; use chrono::{DateTime, Duration, NaiveDateTime, Utc};
use hmac::{Hmac, Mac}; use hmac::{Hmac, Mac, NewMac};
use hyper::{Body, Method, Request}; use hyper::{Body, Method, Request};
use sha2::{Digest, Sha256}; use sha2::{Digest, Sha256};
@ -91,8 +91,8 @@ pub async fn check_signature(
"s3", "s3",
) )
.ok_or_internal_error("Unable to build signing HMAC")?; .ok_or_internal_error("Unable to build signing HMAC")?;
hmac.input(string_to_sign.as_bytes()); hmac.update(string_to_sign.as_bytes());
let signature = hex::encode(hmac.result().code()); let signature = hex::encode(hmac.finalize().into_bytes());
if authorization.signature != signature { if authorization.signature != signature {
trace!("Canonical request: ``{}``", canonical_request); trace!("Canonical request: ``{}``", canonical_request);
@ -106,12 +106,10 @@ pub async fn check_signature(
} else { } else {
let bytes = hex::decode(authorization.content_sha256) let bytes = hex::decode(authorization.content_sha256)
.ok_or_bad_request("Invalid content sha256 hash")?; .ok_or_bad_request("Invalid content sha256 hash")?;
let mut hash = [0u8; 32]; Some(
if bytes.len() != 32 { Hash::try_from(&bytes[..])
return Err(Error::BadRequest(format!("Invalid content sha256 hash"))); .ok_or(Error::BadRequest(format!("Invalid content sha256 hash")))?,
} )
hash.copy_from_slice(&bytes[..]);
Some(Hash::from(hash))
}; };
Ok((key, content_sha256)) Ok((key, content_sha256))
@ -220,12 +218,12 @@ fn parse_credential(cred: &str) -> Result<(String, String), Error> {
fn string_to_sign(datetime: &DateTime<Utc>, scope_string: &str, canonical_req: &str) -> String { fn string_to_sign(datetime: &DateTime<Utc>, scope_string: &str, canonical_req: &str) -> String {
let mut hasher = Sha256::default(); let mut hasher = Sha256::default();
hasher.input(canonical_req.as_bytes()); hasher.update(canonical_req.as_bytes());
[ [
"AWS4-HMAC-SHA256", "AWS4-HMAC-SHA256",
&datetime.format(LONG_DATETIME).to_string(), &datetime.format(LONG_DATETIME).to_string(),
scope_string, scope_string,
&hex::encode(hasher.result().as_slice()), &hex::encode(hasher.finalize().as_slice()),
] ]
.join("\n") .join("\n")
} }
@ -238,14 +236,14 @@ fn signing_hmac(
) -> Result<HmacSha256, crypto_mac::InvalidKeyLength> { ) -> Result<HmacSha256, crypto_mac::InvalidKeyLength> {
let secret = String::from("AWS4") + secret_key; let secret = String::from("AWS4") + secret_key;
let mut date_hmac = HmacSha256::new_varkey(secret.as_bytes())?; let mut date_hmac = HmacSha256::new_varkey(secret.as_bytes())?;
date_hmac.input(datetime.format(SHORT_DATE).to_string().as_bytes()); date_hmac.update(datetime.format(SHORT_DATE).to_string().as_bytes());
let mut region_hmac = HmacSha256::new_varkey(&date_hmac.result().code())?; let mut region_hmac = HmacSha256::new_varkey(&date_hmac.finalize().into_bytes())?;
region_hmac.input(region.as_bytes()); region_hmac.update(region.as_bytes());
let mut service_hmac = HmacSha256::new_varkey(&region_hmac.result().code())?; let mut service_hmac = HmacSha256::new_varkey(&region_hmac.finalize().into_bytes())?;
service_hmac.input(service.as_bytes()); service_hmac.update(service.as_bytes());
let mut signing_hmac = HmacSha256::new_varkey(&service_hmac.result().code())?; let mut signing_hmac = HmacSha256::new_varkey(&service_hmac.finalize().into_bytes())?;
signing_hmac.input(b"aws4_request"); signing_hmac.update(b"aws4_request");
let hmac = HmacSha256::new_varkey(&signing_hmac.result().code())?; let hmac = HmacSha256::new_varkey(&signing_hmac.finalize().into_bytes())?;
Ok(hmac) Ok(hmac)
} }

View file

@ -21,21 +21,20 @@ garage_model = { version = "0.1.1", path = "../model" }
garage_api = { version = "0.1.1", path = "../api" } garage_api = { version = "0.1.1", path = "../api" }
garage_web = { version = "0.1.1", path = "../web" } garage_web = { version = "0.1.1", path = "../web" }
bytes = "0.4" bytes = "1.0"
rand = "0.7" rand = "0.8"
hex = "0.3" hex = "0.4"
sha2 = "0.8"
log = "0.4" log = "0.4"
pretty_env_logger = "0.4" pretty_env_logger = "0.4"
git-version = "0.3.4"
sled = "0.34" sled = "0.34"
old_sled = { package = "sled", version = "0.31" }
structopt = { version = "0.3", default-features = false } structopt = { version = "0.3", default-features = false }
toml = "0.5" toml = "0.5"
rmp-serde = "0.14.3" rmp-serde = "0.15"
serde = { version = "1.0", default-features = false, features = ["derive", "rc"] } serde = { version = "1.0", default-features = false, features = ["derive", "rc"] }
futures = "0.3" futures = "0.3"
futures-util = "0.3" futures-util = "0.3"
tokio = { version = "0.2", default-features = false, features = ["rt-core", "rt-threaded", "io-driver", "net", "tcp", "time", "macros", "sync", "signal", "fs"] } tokio = { version = "1.0", default-features = false, features = ["rt", "rt-multi-thread", "io-util", "net", "time", "macros", "sync", "signal", "fs"] }

View file

@ -1,3 +1,5 @@
use std::collections::HashMap;
use std::fmt::Write;
use std::sync::Arc; use std::sync::Arc;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
@ -5,6 +7,7 @@ use serde::{Deserialize, Serialize};
use garage_util::error::Error; use garage_util::error::Error;
use garage_table::crdt::CRDT; use garage_table::crdt::CRDT;
use garage_table::replication::*;
use garage_table::*; use garage_table::*;
use garage_rpc::rpc_client::*; use garage_rpc::rpc_client::*;
@ -14,6 +17,7 @@ use garage_model::bucket_table::*;
use garage_model::garage::Garage; use garage_model::garage::Garage;
use garage_model::key_table::*; use garage_model::key_table::*;
use crate::cli::*;
use crate::repair::Repair; use crate::repair::Repair;
use crate::*; use crate::*;
@ -25,6 +29,7 @@ pub enum AdminRPC {
BucketOperation(BucketOperation), BucketOperation(BucketOperation),
KeyOperation(KeyOperation), KeyOperation(KeyOperation),
LaunchRepair(RepairOpt), LaunchRepair(RepairOpt),
Stats(StatsOpt),
// Replies // Replies
Ok(String), Ok(String),
@ -55,6 +60,7 @@ impl AdminRpcHandler {
AdminRPC::BucketOperation(bo) => self2.handle_bucket_cmd(bo).await, AdminRPC::BucketOperation(bo) => self2.handle_bucket_cmd(bo).await,
AdminRPC::KeyOperation(ko) => self2.handle_key_cmd(ko).await, AdminRPC::KeyOperation(ko) => self2.handle_key_cmd(ko).await,
AdminRPC::LaunchRepair(opt) => self2.handle_launch_repair(opt).await, AdminRPC::LaunchRepair(opt) => self2.handle_launch_repair(opt).await,
AdminRPC::Stats(opt) => self2.handle_stats(opt).await,
_ => Err(Error::BadRPC(format!("Invalid RPC"))), _ => Err(Error::BadRPC(format!("Invalid RPC"))),
} }
} }
@ -116,7 +122,7 @@ impl AdminRpcHandler {
for (key_id, _, _) in bucket.authorized_keys() { for (key_id, _, _) in bucket.authorized_keys() {
if let Some(key) = self.garage.key_table.get(&EmptyKey, key_id).await? { if let Some(key) = self.garage.key_table.get(&EmptyKey, key_id).await? {
if !key.deleted.get() { if !key.deleted.get() {
self.update_key_bucket(key, &bucket.name, false, false) self.update_key_bucket(&key, &bucket.name, false, false)
.await?; .await?;
} }
} else { } else {
@ -128,31 +134,31 @@ impl AdminRpcHandler {
Ok(AdminRPC::Ok(format!("Bucket {} was deleted.", query.name))) Ok(AdminRPC::Ok(format!("Bucket {} was deleted.", query.name)))
} }
BucketOperation::Allow(query) => { BucketOperation::Allow(query) => {
let key = self.get_existing_key(&query.key_id).await?; let key = self.get_existing_key(&query.key_pattern).await?;
let bucket = self.get_existing_bucket(&query.bucket).await?; let bucket = self.get_existing_bucket(&query.bucket).await?;
let allow_read = query.read || key.allow_read(&query.bucket); let allow_read = query.read || key.allow_read(&query.bucket);
let allow_write = query.write || key.allow_write(&query.bucket); let allow_write = query.write || key.allow_write(&query.bucket);
self.update_key_bucket(key, &query.bucket, allow_read, allow_write) self.update_key_bucket(&key, &query.bucket, allow_read, allow_write)
.await?; .await?;
self.update_bucket_key(bucket, &query.key_id, allow_read, allow_write) self.update_bucket_key(bucket, &key.key_id, allow_read, allow_write)
.await?; .await?;
Ok(AdminRPC::Ok(format!( Ok(AdminRPC::Ok(format!(
"New permissions for {} on {}: read {}, write {}.", "New permissions for {} on {}: read {}, write {}.",
&query.key_id, &query.bucket, allow_read, allow_write &key.key_id, &query.bucket, allow_read, allow_write
))) )))
} }
BucketOperation::Deny(query) => { BucketOperation::Deny(query) => {
let key = self.get_existing_key(&query.key_id).await?; let key = self.get_existing_key(&query.key_pattern).await?;
let bucket = self.get_existing_bucket(&query.bucket).await?; let bucket = self.get_existing_bucket(&query.bucket).await?;
let allow_read = !query.read && key.allow_read(&query.bucket); let allow_read = !query.read && key.allow_read(&query.bucket);
let allow_write = !query.write && key.allow_write(&query.bucket); let allow_write = !query.write && key.allow_write(&query.bucket);
self.update_key_bucket(key, &query.bucket, allow_read, allow_write) self.update_key_bucket(&key, &query.bucket, allow_read, allow_write)
.await?; .await?;
self.update_bucket_key(bucket, &query.key_id, allow_read, allow_write) self.update_bucket_key(bucket, &key.key_id, allow_read, allow_write)
.await?; .await?;
Ok(AdminRPC::Ok(format!( Ok(AdminRPC::Ok(format!(
"New permissions for {} on {}: read {}, write {}.", "New permissions for {} on {}: read {}, write {}.",
&query.key_id, &query.bucket, allow_read, allow_write &key.key_id, &query.bucket, allow_read, allow_write
))) )))
} }
BucketOperation::Website(query) => { BucketOperation::Website(query) => {
@ -187,7 +193,12 @@ impl AdminRpcHandler {
let key_ids = self let key_ids = self
.garage .garage
.key_table .key_table
.get_range(&EmptyKey, None, Some(DeletedFilter::NotDeleted), 10000) .get_range(
&EmptyKey,
None,
Some(KeyFilter::Deleted(DeletedFilter::NotDeleted)),
10000,
)
.await? .await?
.iter() .iter()
.map(|k| (k.key_id.to_string(), k.name.get().clone())) .map(|k| (k.key_id.to_string(), k.name.get().clone()))
@ -195,7 +206,7 @@ impl AdminRpcHandler {
Ok(AdminRPC::KeyList(key_ids)) Ok(AdminRPC::KeyList(key_ids))
} }
KeyOperation::Info(query) => { KeyOperation::Info(query) => {
let key = self.get_existing_key(&query.key_id).await?; let key = self.get_existing_key(&query.key_pattern).await?;
Ok(AdminRPC::KeyInfo(key)) Ok(AdminRPC::KeyInfo(key))
} }
KeyOperation::New(query) => { KeyOperation::New(query) => {
@ -204,13 +215,13 @@ impl AdminRpcHandler {
Ok(AdminRPC::KeyInfo(key)) Ok(AdminRPC::KeyInfo(key))
} }
KeyOperation::Rename(query) => { KeyOperation::Rename(query) => {
let mut key = self.get_existing_key(&query.key_id).await?; let mut key = self.get_existing_key(&query.key_pattern).await?;
key.name.update(query.new_name); key.name.update(query.new_name);
self.garage.key_table.insert(&key).await?; self.garage.key_table.insert(&key).await?;
Ok(AdminRPC::KeyInfo(key)) Ok(AdminRPC::KeyInfo(key))
} }
KeyOperation::Delete(query) => { KeyOperation::Delete(query) => {
let key = self.get_existing_key(&query.key_id).await?; let key = self.get_existing_key(&query.key_pattern).await?;
if !query.yes { if !query.yes {
return Err(Error::BadRPC(format!( return Err(Error::BadRPC(format!(
"Add --yes flag to really perform this operation" "Add --yes flag to really perform this operation"
@ -227,13 +238,24 @@ impl AdminRpcHandler {
return Err(Error::Message(format!("Bucket not found: {}", ab_name))); return Err(Error::Message(format!("Bucket not found: {}", ab_name)));
} }
} }
let del_key = Key::delete(key.key_id); let del_key = Key::delete(key.key_id.to_string());
self.garage.key_table.insert(&del_key).await?; self.garage.key_table.insert(&del_key).await?;
Ok(AdminRPC::Ok(format!( Ok(AdminRPC::Ok(format!(
"Key {} was deleted successfully.", "Key {} was deleted successfully.",
query.key_id key.key_id
))) )))
} }
KeyOperation::Import(query) => {
let prev_key = self.garage.key_table.get(&EmptyKey, &query.key_id)
.await?;
if prev_key.is_some() {
return Err(Error::Message(format!("Key {} already exists in data store. Even if it is deleted, we can't let you create a new key with the same ID. Sorry.", query.key_id)));
}
let imported_key = Key::import(&query.key_id, &query.secret_key, &query.name);
self.garage.key_table.insert(&imported_key).await?;
Ok(AdminRPC::KeyInfo(imported_key))
}
} }
} }
@ -250,14 +272,28 @@ impl AdminRpcHandler {
)))) ))))
} }
async fn get_existing_key(&self, id: &String) -> Result<Key, Error> { async fn get_existing_key(&self, pattern: &str) -> Result<Key, Error> {
self.garage let candidates = self
.garage
.key_table .key_table
.get(&EmptyKey, id) .get_range(
&EmptyKey,
None,
Some(KeyFilter::Matches(pattern.to_string())),
10,
)
.await? .await?
.into_iter()
.filter(|k| !k.deleted.get()) .filter(|k| !k.deleted.get())
.map(Ok) .collect::<Vec<_>>();
.unwrap_or(Err(Error::BadRPC(format!("Key {} does not exist", id)))) if candidates.len() != 1 {
Err(Error::Message(format!(
"{} matching keys",
candidates.len()
)))
} else {
Ok(candidates.into_iter().next().unwrap())
}
} }
/// Update **bucket table** to inform of the new linked key /// Update **bucket table** to inform of the new linked key
@ -290,11 +326,12 @@ impl AdminRpcHandler {
/// Update **key table** to inform of the new linked bucket /// Update **key table** to inform of the new linked bucket
async fn update_key_bucket( async fn update_key_bucket(
&self, &self,
mut key: Key, key: &Key,
bucket: &String, bucket: &String,
allow_read: bool, allow_read: bool,
allow_write: bool, allow_write: bool,
) -> Result<(), Error> { ) -> Result<(), Error> {
let mut key = key.clone();
let old_map = key.authorized_buckets.take_and_clear(); let old_map = key.authorized_buckets.take_and_clear();
key.authorized_buckets.merge(&old_map.update_mutator( key.authorized_buckets.merge(&old_map.update_mutator(
bucket.clone(), bucket.clone(),
@ -350,12 +387,118 @@ impl AdminRpcHandler {
.background .background
.spawn_worker("Repair worker".into(), move |must_exit| async move { .spawn_worker("Repair worker".into(), move |must_exit| async move {
repair.repair_worker(opt, must_exit).await repair.repair_worker(opt, must_exit).await
}) });
.await;
Ok(AdminRPC::Ok(format!( Ok(AdminRPC::Ok(format!(
"Repair launched on {:?}", "Repair launched on {:?}",
self.garage.system.id self.garage.system.id
))) )))
} }
} }
async fn handle_stats(&self, opt: StatsOpt) -> Result<AdminRPC, Error> {
if opt.all_nodes {
let mut ret = String::new();
let ring = self.garage.system.ring.borrow().clone();
for node in ring.config.members.keys() {
let mut opt = opt.clone();
opt.all_nodes = false;
writeln!(&mut ret, "\n======================").unwrap();
writeln!(&mut ret, "Stats for node {:?}:", node).unwrap();
match self
.rpc_client
.call(*node, AdminRPC::Stats(opt), ADMIN_RPC_TIMEOUT)
.await
{
Ok(AdminRPC::Ok(s)) => writeln!(&mut ret, "{}", s).unwrap(),
Ok(x) => writeln!(&mut ret, "Bad answer: {:?}", x).unwrap(),
Err(e) => writeln!(&mut ret, "Error: {}", e).unwrap(),
}
}
Ok(AdminRPC::Ok(ret))
} else {
Ok(AdminRPC::Ok(self.gather_stats_local(opt)?))
}
}
fn gather_stats_local(&self, opt: StatsOpt) -> Result<String, Error> {
let mut ret = String::new();
writeln!(
&mut ret,
"\nGarage version: {}",
git_version::git_version!()
)
.unwrap();
// Gather ring statistics
let ring = self.garage.system.ring.borrow().clone();
let mut ring_nodes = HashMap::new();
for r in ring.ring.iter() {
for n in r.nodes.iter() {
if !ring_nodes.contains_key(n) {
ring_nodes.insert(*n, 0usize);
}
*ring_nodes.get_mut(n).unwrap() += 1;
}
}
writeln!(&mut ret, "\nRing nodes & partition count:").unwrap();
for (n, c) in ring_nodes.iter() {
writeln!(&mut ret, " {:?} {}", n, c).unwrap();
}
self.gather_table_stats(&mut ret, &self.garage.bucket_table, &opt)?;
self.gather_table_stats(&mut ret, &self.garage.key_table, &opt)?;
self.gather_table_stats(&mut ret, &self.garage.object_table, &opt)?;
self.gather_table_stats(&mut ret, &self.garage.version_table, &opt)?;
self.gather_table_stats(&mut ret, &self.garage.block_ref_table, &opt)?;
writeln!(&mut ret, "\nBlock manager stats:").unwrap();
if opt.detailed {
writeln!(
&mut ret,
" number of blocks: {}",
self.garage.block_manager.rc_len()
)
.unwrap();
}
writeln!(
&mut ret,
" resync queue length: {}",
self.garage.block_manager.resync_queue_len()
)
.unwrap();
Ok(ret)
}
fn gather_table_stats<F, R>(
&self,
to: &mut String,
t: &Arc<Table<F, R>>,
opt: &StatsOpt,
) -> Result<(), Error>
where
F: TableSchema + 'static,
R: TableReplication + 'static,
{
writeln!(to, "\nTable stats for {}", t.data.name).unwrap();
if opt.detailed {
writeln!(to, " number of items: {}", t.data.store.len()).unwrap();
writeln!(
to,
" Merkle tree size: {}",
t.merkle_updater.merkle_tree_len()
)
.unwrap();
}
writeln!(
to,
" Merkle updater todo queue length: {}",
t.merkle_updater.todo_len()
)
.unwrap();
writeln!(to, " GC todo queue length: {}", t.data.gc_todo_len()).unwrap();
Ok(())
}
} }

552
src/garage/cli.rs Normal file
View file

@ -0,0 +1,552 @@
use std::collections::HashSet;
use std::net::SocketAddr;
use std::path::PathBuf;
use serde::{Deserialize, Serialize};
use structopt::StructOpt;
use garage_util::error::Error;
use garage_util::time::*;
use garage_rpc::membership::*;
use garage_rpc::ring::*;
use garage_rpc::rpc_client::*;
use garage_model::bucket_table::*;
use garage_model::key_table::*;
use crate::admin_rpc::*;
#[derive(StructOpt, Debug)]
pub enum Command {
/// Run Garage server
#[structopt(name = "server")]
Server(ServerOpt),
/// Get network status
#[structopt(name = "status")]
Status,
/// Garage node operations
#[structopt(name = "node")]
Node(NodeOperation),
/// Bucket operations
#[structopt(name = "bucket")]
Bucket(BucketOperation),
/// Key operations
#[structopt(name = "key")]
Key(KeyOperation),
/// Start repair of node data
#[structopt(name = "repair")]
Repair(RepairOpt),
/// Gather node statistics
#[structopt(name = "stats")]
Stats(StatsOpt),
}
#[derive(StructOpt, Debug)]
pub struct ServerOpt {
/// Configuration file
#[structopt(short = "c", long = "config", default_value = "./config.toml")]
pub config_file: PathBuf,
}
#[derive(StructOpt, Debug)]
pub enum NodeOperation {
/// Configure Garage node
#[structopt(name = "configure")]
Configure(ConfigureNodeOpt),
/// Remove Garage node from cluster
#[structopt(name = "remove")]
Remove(RemoveNodeOpt),
}
#[derive(StructOpt, Debug)]
pub struct ConfigureNodeOpt {
/// Node to configure (prefix of hexadecimal node id)
node_id: String,
/// Location (datacenter) of the node
#[structopt(short = "d", long = "datacenter")]
datacenter: Option<String>,
/// Capacity (in relative terms, use 1 to represent your smallest server)
#[structopt(short = "c", long = "capacity")]
capacity: Option<u32>,
/// Optionnal node tag
#[structopt(short = "t", long = "tag")]
tag: Option<String>,
}
#[derive(StructOpt, Debug)]
pub struct RemoveNodeOpt {
/// Node to configure (prefix of hexadecimal node id)
node_id: String,
/// If this flag is not given, the node won't be removed
#[structopt(long = "yes")]
yes: bool,
}
#[derive(Serialize, Deserialize, StructOpt, Debug)]
pub enum BucketOperation {
/// List buckets
#[structopt(name = "list")]
List,
/// Get bucket info
#[structopt(name = "info")]
Info(BucketOpt),
/// Create bucket
#[structopt(name = "create")]
Create(BucketOpt),
/// Delete bucket
#[structopt(name = "delete")]
Delete(DeleteBucketOpt),
/// Allow key to read or write to bucket
#[structopt(name = "allow")]
Allow(PermBucketOpt),
/// Allow key to read or write to bucket
#[structopt(name = "deny")]
Deny(PermBucketOpt),
/// Expose as website or not
#[structopt(name = "website")]
Website(WebsiteOpt),
}
#[derive(Serialize, Deserialize, StructOpt, Debug)]
pub struct WebsiteOpt {
/// Create
#[structopt(long = "allow")]
pub allow: bool,
/// Delete
#[structopt(long = "deny")]
pub deny: bool,
/// Bucket name
pub bucket: String,
}
#[derive(Serialize, Deserialize, StructOpt, Debug)]
pub struct BucketOpt {
/// Bucket name
pub name: String,
}
#[derive(Serialize, Deserialize, StructOpt, Debug)]
pub struct DeleteBucketOpt {
/// Bucket name
pub name: String,
/// If this flag is not given, the bucket won't be deleted
#[structopt(long = "yes")]
pub yes: bool,
}
#[derive(Serialize, Deserialize, StructOpt, Debug)]
pub struct PermBucketOpt {
/// Access key name or ID
#[structopt(long = "key")]
pub key_pattern: String,
/// Allow/deny read operations
#[structopt(long = "read")]
pub read: bool,
/// Allow/deny write operations
#[structopt(long = "write")]
pub write: bool,
/// Bucket name
pub bucket: String,
}
#[derive(Serialize, Deserialize, StructOpt, Debug)]
pub enum KeyOperation {
/// List keys
#[structopt(name = "list")]
List,
/// Get key info
#[structopt(name = "info")]
Info(KeyOpt),
/// Create new key
#[structopt(name = "new")]
New(KeyNewOpt),
/// Rename key
#[structopt(name = "rename")]
Rename(KeyRenameOpt),
/// Delete key
#[structopt(name = "delete")]
Delete(KeyDeleteOpt),
/// Import key
#[structopt(name = "import")]
Import(KeyImportOpt),
}
#[derive(Serialize, Deserialize, StructOpt, Debug)]
pub struct KeyOpt {
/// ID or name of the key
pub key_pattern: String,
}
#[derive(Serialize, Deserialize, StructOpt, Debug)]
pub struct KeyNewOpt {
/// Name of the key
#[structopt(long = "name", default_value = "Unnamed key")]
pub name: String,
}
#[derive(Serialize, Deserialize, StructOpt, Debug)]
pub struct KeyRenameOpt {
/// ID or name of the key
pub key_pattern: String,
/// New name of the key
pub new_name: String,
}
#[derive(Serialize, Deserialize, StructOpt, Debug)]
pub struct KeyDeleteOpt {
/// ID or name of the key
pub key_pattern: String,
/// Confirm deletion
#[structopt(long = "yes")]
pub yes: bool,
}
#[derive(Serialize, Deserialize, StructOpt, Debug)]
pub struct KeyImportOpt {
/// Access key ID
pub key_id: String,
/// Secret access key
pub secret_key: String,
/// Key name
#[structopt(short = "n", default_value = "Imported key")]
pub name: String,
}
#[derive(Serialize, Deserialize, StructOpt, Debug, Clone)]
pub struct RepairOpt {
/// Launch repair operation on all nodes
#[structopt(short = "a", long = "all-nodes")]
pub all_nodes: bool,
/// Confirm the launch of the repair operation
#[structopt(long = "yes")]
pub yes: bool,
#[structopt(subcommand)]
pub what: Option<RepairWhat>,
}
#[derive(Serialize, Deserialize, StructOpt, Debug, Eq, PartialEq, Clone)]
pub enum RepairWhat {
/// Only do a full sync of metadata tables
#[structopt(name = "tables")]
Tables,
/// Only repair (resync/rebalance) the set of stored blocks
#[structopt(name = "blocks")]
Blocks,
/// Only redo the propagation of object deletions to the version table (slow)
#[structopt(name = "versions")]
Versions,
/// Only redo the propagation of version deletions to the block ref table (extremely slow)
#[structopt(name = "block_refs")]
BlockRefs,
}
#[derive(Serialize, Deserialize, StructOpt, Debug, Clone)]
pub struct StatsOpt {
/// Gather statistics from all nodes
#[structopt(short = "a", long = "all-nodes")]
pub all_nodes: bool,
/// Gather detailed statistics (this can be long)
#[structopt(short = "d", long = "detailed")]
pub detailed: bool,
}
pub async fn cli_cmd(
cmd: Command,
membership_rpc_cli: RpcAddrClient<Message>,
admin_rpc_cli: RpcAddrClient<AdminRPC>,
rpc_host: SocketAddr,
) -> Result<(), Error> {
match cmd {
Command::Status => cmd_status(membership_rpc_cli, rpc_host).await,
Command::Node(NodeOperation::Configure(configure_opt)) => {
cmd_configure(membership_rpc_cli, rpc_host, configure_opt).await
}
Command::Node(NodeOperation::Remove(remove_opt)) => {
cmd_remove(membership_rpc_cli, rpc_host, remove_opt).await
}
Command::Bucket(bo) => {
cmd_admin(admin_rpc_cli, rpc_host, AdminRPC::BucketOperation(bo)).await
}
Command::Key(ko) => cmd_admin(admin_rpc_cli, rpc_host, AdminRPC::KeyOperation(ko)).await,
Command::Repair(ro) => cmd_admin(admin_rpc_cli, rpc_host, AdminRPC::LaunchRepair(ro)).await,
Command::Stats(so) => cmd_admin(admin_rpc_cli, rpc_host, AdminRPC::Stats(so)).await,
_ => unreachable!(),
}
}
pub async fn cmd_status(
rpc_cli: RpcAddrClient<Message>,
rpc_host: SocketAddr,
) -> Result<(), Error> {
let status = match rpc_cli
.call(&rpc_host, &Message::PullStatus, ADMIN_RPC_TIMEOUT)
.await??
{
Message::AdvertiseNodesUp(nodes) => nodes,
resp => return Err(Error::Message(format!("Invalid RPC response: {:?}", resp))),
};
let config = match rpc_cli
.call(&rpc_host, &Message::PullConfig, ADMIN_RPC_TIMEOUT)
.await??
{
Message::AdvertiseConfig(cfg) => cfg,
resp => return Err(Error::Message(format!("Invalid RPC response: {:?}", resp))),
};
println!("Healthy nodes:");
for adv in status.iter().filter(|x| x.is_up) {
if let Some(cfg) = config.members.get(&adv.id) {
println!(
"{:?}\t{}\t{}\t[{}]\t{}\t{}",
adv.id, adv.state_info.hostname, adv.addr, cfg.tag, cfg.datacenter, cfg.capacity
);
} else {
println!(
"{:?}\t{}\t{}\tUNCONFIGURED/REMOVED",
adv.id, adv.state_info.hostname, adv.addr
);
}
}
let status_keys = status.iter().map(|x| x.id).collect::<HashSet<_>>();
let failure_case_1 = status.iter().any(|x| !x.is_up);
let failure_case_2 = config
.members
.iter()
.any(|(id, _)| !status_keys.contains(id));
if failure_case_1 || failure_case_2 {
println!("\nFailed nodes:");
for adv in status.iter().filter(|x| !x.is_up) {
if let Some(cfg) = config.members.get(&adv.id) {
println!(
"{:?}\t{}\t{}\t[{}]\t{}\t{}\tlast seen: {}s ago",
adv.id,
adv.state_info.hostname,
adv.addr,
cfg.tag,
cfg.datacenter,
cfg.capacity,
(now_msec() - adv.last_seen) / 1000,
);
}
}
for (id, cfg) in config.members.iter() {
if !status.iter().any(|x| x.id == *id) {
println!(
"{:?}\t{}\t{}\t{}\tnever seen",
id, cfg.tag, cfg.datacenter, cfg.capacity
);
}
}
}
Ok(())
}
pub async fn cmd_configure(
rpc_cli: RpcAddrClient<Message>,
rpc_host: SocketAddr,
args: ConfigureNodeOpt,
) -> Result<(), Error> {
let status = match rpc_cli
.call(&rpc_host, &Message::PullStatus, ADMIN_RPC_TIMEOUT)
.await??
{
Message::AdvertiseNodesUp(nodes) => nodes,
resp => return Err(Error::Message(format!("Invalid RPC response: {:?}", resp))),
};
let mut candidates = vec![];
for adv in status.iter() {
if hex::encode(&adv.id).starts_with(&args.node_id) {
candidates.push(adv.id);
}
}
if candidates.len() != 1 {
return Err(Error::Message(format!(
"{} matching nodes",
candidates.len()
)));
}
let mut config = match rpc_cli
.call(&rpc_host, &Message::PullConfig, ADMIN_RPC_TIMEOUT)
.await??
{
Message::AdvertiseConfig(cfg) => cfg,
resp => return Err(Error::Message(format!("Invalid RPC response: {:?}", resp))),
};
let new_entry = match config.members.get(&candidates[0]) {
None => NetworkConfigEntry {
datacenter: args
.datacenter
.expect("Please specifiy a datacenter with the -d flag"),
capacity: args
.capacity
.expect("Please specifiy a capacity with the -c flag"),
tag: args.tag.unwrap_or("".to_string()),
},
Some(old) => NetworkConfigEntry {
datacenter: args.datacenter.unwrap_or(old.datacenter.to_string()),
capacity: args.capacity.unwrap_or(old.capacity),
tag: args.tag.unwrap_or(old.tag.to_string()),
},
};
config.members.insert(candidates[0].clone(), new_entry);
config.version += 1;
rpc_cli
.call(
&rpc_host,
&Message::AdvertiseConfig(config),
ADMIN_RPC_TIMEOUT,
)
.await??;
Ok(())
}
pub async fn cmd_remove(
rpc_cli: RpcAddrClient<Message>,
rpc_host: SocketAddr,
args: RemoveNodeOpt,
) -> Result<(), Error> {
let mut config = match rpc_cli
.call(&rpc_host, &Message::PullConfig, ADMIN_RPC_TIMEOUT)
.await??
{
Message::AdvertiseConfig(cfg) => cfg,
resp => return Err(Error::Message(format!("Invalid RPC response: {:?}", resp))),
};
let mut candidates = vec![];
for (key, _) in config.members.iter() {
if hex::encode(key).starts_with(&args.node_id) {
candidates.push(*key);
}
}
if candidates.len() != 1 {
return Err(Error::Message(format!(
"{} matching nodes",
candidates.len()
)));
}
if !args.yes {
return Err(Error::Message(format!(
"Add the flag --yes to really remove {:?} from the cluster",
candidates[0]
)));
}
config.members.remove(&candidates[0]);
config.version += 1;
rpc_cli
.call(
&rpc_host,
&Message::AdvertiseConfig(config),
ADMIN_RPC_TIMEOUT,
)
.await??;
Ok(())
}
pub async fn cmd_admin(
rpc_cli: RpcAddrClient<AdminRPC>,
rpc_host: SocketAddr,
args: AdminRPC,
) -> Result<(), Error> {
match rpc_cli.call(&rpc_host, args, ADMIN_RPC_TIMEOUT).await?? {
AdminRPC::Ok(msg) => {
println!("{}", msg);
}
AdminRPC::BucketList(bl) => {
println!("List of buckets:");
for bucket in bl {
println!("{}", bucket);
}
}
AdminRPC::BucketInfo(bucket) => {
print_bucket_info(&bucket);
}
AdminRPC::KeyList(kl) => {
println!("List of keys:");
for key in kl {
println!("{}\t{}", key.0, key.1);
}
}
AdminRPC::KeyInfo(key) => {
print_key_info(&key);
}
r => {
error!("Unexpected response: {:?}", r);
}
}
Ok(())
}
fn print_key_info(key: &Key) {
println!("Key name: {}", key.name.get());
println!("Key ID: {}", key.key_id);
println!("Secret key: {}", key.secret_key);
if key.deleted.get() {
println!("Key is deleted.");
} else {
println!("Authorized buckets:");
for (b, _, perm) in key.authorized_buckets.items().iter() {
println!("- {} R:{} W:{}", b, perm.allow_read, perm.allow_write);
}
}
}
fn print_bucket_info(bucket: &Bucket) {
println!("Bucket name: {}", bucket.name);
match bucket.state.get() {
BucketState::Deleted => println!("Bucket is deleted."),
BucketState::Present(p) => {
println!("Authorized keys:");
for (k, _, perm) in p.authorized_keys.items().iter() {
println!("- {} R:{} W:{}", k, perm.allow_read, perm.allow_write);
}
println!("Website access: {}", p.website.get());
}
};
}

View file

@ -4,289 +4,67 @@
extern crate log; extern crate log;
mod admin_rpc; mod admin_rpc;
mod cli;
mod repair; mod repair;
mod server; mod server;
use std::collections::HashSet;
use std::net::SocketAddr; use std::net::SocketAddr;
use std::path::PathBuf;
use std::sync::Arc; use std::sync::Arc;
use std::time::Duration; use std::time::Duration;
use serde::{Deserialize, Serialize};
use structopt::StructOpt; use structopt::StructOpt;
use garage_util::config::TlsConfig; use garage_util::config::TlsConfig;
use garage_util::data::*;
use garage_util::error::Error; use garage_util::error::Error;
use garage_rpc::membership::*; use garage_rpc::membership::*;
use garage_rpc::ring::*;
use garage_rpc::rpc_client::*; use garage_rpc::rpc_client::*;
use admin_rpc::*; use admin_rpc::*;
use cli::*;
#[derive(StructOpt, Debug)] #[derive(StructOpt, Debug)]
#[structopt(name = "garage")] #[structopt(name = "garage")]
pub struct Opt { pub struct Opt {
/// RPC connect to this host to execute client operations /// RPC connect to this host to execute client operations
#[structopt(short = "h", long = "rpc-host", default_value = "127.0.0.1:3901")] #[structopt(short = "h", long = "rpc-host", default_value = "127.0.0.1:3901")]
rpc_host: SocketAddr, pub rpc_host: SocketAddr,
#[structopt(long = "ca-cert")] #[structopt(long = "ca-cert")]
ca_cert: Option<String>, pub ca_cert: Option<String>,
#[structopt(long = "client-cert")] #[structopt(long = "client-cert")]
client_cert: Option<String>, pub client_cert: Option<String>,
#[structopt(long = "client-key")] #[structopt(long = "client-key")]
client_key: Option<String>, pub client_key: Option<String>,
#[structopt(subcommand)] #[structopt(subcommand)]
cmd: Command, cmd: Command,
} }
#[derive(StructOpt, Debug)]
pub enum Command {
/// Run Garage server
#[structopt(name = "server")]
Server(ServerOpt),
/// Get network status
#[structopt(name = "status")]
Status,
/// Garage node operations
#[structopt(name = "node")]
Node(NodeOperation),
/// Bucket operations
#[structopt(name = "bucket")]
Bucket(BucketOperation),
/// Key operations
#[structopt(name = "key")]
Key(KeyOperation),
/// Start repair of node data
#[structopt(name = "repair")]
Repair(RepairOpt),
}
#[derive(StructOpt, Debug)]
pub struct ServerOpt {
/// Configuration file
#[structopt(short = "c", long = "config", default_value = "./config.toml")]
config_file: PathBuf,
}
#[derive(StructOpt, Debug)]
pub enum NodeOperation {
/// Configure Garage node
#[structopt(name = "configure")]
Configure(ConfigureNodeOpt),
/// Remove Garage node from cluster
#[structopt(name = "remove")]
Remove(RemoveNodeOpt),
}
#[derive(StructOpt, Debug)]
pub struct ConfigureNodeOpt {
/// Node to configure (prefix of hexadecimal node id)
node_id: String,
/// Location (datacenter) of the node
#[structopt(short = "d", long = "datacenter")]
datacenter: Option<String>,
/// Capacity (in relative terms, use 1 to represent your smallest server)
#[structopt(short = "c", long = "capacity")]
capacity: Option<u32>,
/// Optionnal node tag
#[structopt(short = "t", long = "tag")]
tag: Option<String>,
}
#[derive(StructOpt, Debug)]
pub struct RemoveNodeOpt {
/// Node to configure (prefix of hexadecimal node id)
node_id: String,
/// If this flag is not given, the node won't be removed
#[structopt(long = "yes")]
yes: bool,
}
#[derive(Serialize, Deserialize, StructOpt, Debug)]
pub enum BucketOperation {
/// List buckets
#[structopt(name = "list")]
List,
/// Get bucket info
#[structopt(name = "info")]
Info(BucketOpt),
/// Create bucket
#[structopt(name = "create")]
Create(BucketOpt),
/// Delete bucket
#[structopt(name = "delete")]
Delete(DeleteBucketOpt),
/// Allow key to read or write to bucket
#[structopt(name = "allow")]
Allow(PermBucketOpt),
/// Allow key to read or write to bucket
#[structopt(name = "deny")]
Deny(PermBucketOpt),
/// Expose as website or not
#[structopt(name = "website")]
Website(WebsiteOpt),
}
#[derive(Serialize, Deserialize, StructOpt, Debug)]
pub struct WebsiteOpt {
/// Create
#[structopt(long = "allow")]
pub allow: bool,
/// Delete
#[structopt(long = "deny")]
pub deny: bool,
/// Bucket name
pub bucket: String,
}
#[derive(Serialize, Deserialize, StructOpt, Debug)]
pub struct BucketOpt {
/// Bucket name
pub name: String,
}
#[derive(Serialize, Deserialize, StructOpt, Debug)]
pub struct DeleteBucketOpt {
/// Bucket name
pub name: String,
/// If this flag is not given, the bucket won't be deleted
#[structopt(long = "yes")]
pub yes: bool,
}
#[derive(Serialize, Deserialize, StructOpt, Debug)]
pub struct PermBucketOpt {
/// Access key ID
#[structopt(long = "key")]
pub key_id: String,
/// Allow/deny read operations
#[structopt(long = "read")]
pub read: bool,
/// Allow/deny write operations
#[structopt(long = "write")]
pub write: bool,
/// Bucket name
pub bucket: String,
}
#[derive(Serialize, Deserialize, StructOpt, Debug)]
pub enum KeyOperation {
/// List keys
#[structopt(name = "list")]
List,
/// Get key info
#[structopt(name = "info")]
Info(KeyOpt),
/// Create new key
#[structopt(name = "new")]
New(KeyNewOpt),
/// Rename key
#[structopt(name = "rename")]
Rename(KeyRenameOpt),
/// Delete key
#[structopt(name = "delete")]
Delete(KeyDeleteOpt),
}
#[derive(Serialize, Deserialize, StructOpt, Debug)]
pub struct KeyOpt {
/// ID of the key
key_id: String,
}
#[derive(Serialize, Deserialize, StructOpt, Debug)]
pub struct KeyNewOpt {
/// Name of the key
#[structopt(long = "name", default_value = "Unnamed key")]
name: String,
}
#[derive(Serialize, Deserialize, StructOpt, Debug)]
pub struct KeyRenameOpt {
/// ID of the key
key_id: String,
/// New name of the key
new_name: String,
}
#[derive(Serialize, Deserialize, StructOpt, Debug)]
pub struct KeyDeleteOpt {
/// ID of the key
key_id: String,
/// Confirm deletion
#[structopt(long = "yes")]
yes: bool,
}
#[derive(Serialize, Deserialize, StructOpt, Debug, Clone)]
pub struct RepairOpt {
/// Launch repair operation on all nodes
#[structopt(short = "a", long = "all-nodes")]
pub all_nodes: bool,
/// Confirm the launch of the repair operation
#[structopt(long = "yes")]
pub yes: bool,
#[structopt(subcommand)]
pub what: Option<RepairWhat>,
}
#[derive(Serialize, Deserialize, StructOpt, Debug, Eq, PartialEq, Clone)]
pub enum RepairWhat {
/// Only do a full sync of metadata tables
#[structopt(name = "tables")]
Tables,
/// Only repair (resync/rebalance) the set of stored blocks
#[structopt(name = "blocks")]
Blocks,
/// Only redo the propagation of object deletions to the version table (slow)
#[structopt(name = "versions")]
Versions,
/// Only redo the propagation of version deletions to the block ref table (extremely slow)
#[structopt(name = "block_refs")]
BlockRefs,
}
#[tokio::main] #[tokio::main]
async fn main() { async fn main() {
pretty_env_logger::init(); pretty_env_logger::init();
let opt = Opt::from_args(); let opt = Opt::from_args();
let res = if let Command::Server(server_opt) = opt.cmd {
// Abort on panic (same behavior as in Go)
std::panic::set_hook(Box::new(|panic_info| {
error!("{}", panic_info.to_string());
std::process::abort();
}));
server::run_server(server_opt.config_file).await
} else {
cli_command(opt).await
};
if let Err(e) = res {
error!("{}", e);
}
}
async fn cli_command(opt: Opt) -> Result<(), Error> {
let tls_config = match (opt.ca_cert, opt.client_cert, opt.client_key) { let tls_config = match (opt.ca_cert, opt.client_cert, opt.client_key) {
(Some(ca_cert), Some(client_cert), Some(client_key)) => Some(TlsConfig { (Some(ca_cert), Some(client_cert), Some(client_key)) => Some(TlsConfig {
ca_cert, ca_cert,
@ -306,245 +84,5 @@ async fn main() {
RpcAddrClient::new(rpc_http_cli.clone(), MEMBERSHIP_RPC_PATH.to_string()); RpcAddrClient::new(rpc_http_cli.clone(), MEMBERSHIP_RPC_PATH.to_string());
let admin_rpc_cli = RpcAddrClient::new(rpc_http_cli.clone(), ADMIN_RPC_PATH.to_string()); let admin_rpc_cli = RpcAddrClient::new(rpc_http_cli.clone(), ADMIN_RPC_PATH.to_string());
let resp = match opt.cmd { cli_cmd(opt.cmd, membership_rpc_cli, admin_rpc_cli, opt.rpc_host).await
Command::Server(server_opt) => {
// Abort on panic (same behavior as in Go)
std::panic::set_hook(Box::new(|panic_info| {
error!("{}", panic_info.to_string());
std::process::abort();
}));
server::run_server(server_opt.config_file).await
}
Command::Status => cmd_status(membership_rpc_cli, opt.rpc_host).await,
Command::Node(NodeOperation::Configure(configure_opt)) => {
cmd_configure(membership_rpc_cli, opt.rpc_host, configure_opt).await
}
Command::Node(NodeOperation::Remove(remove_opt)) => {
cmd_remove(membership_rpc_cli, opt.rpc_host, remove_opt).await
}
Command::Bucket(bo) => {
cmd_admin(admin_rpc_cli, opt.rpc_host, AdminRPC::BucketOperation(bo)).await
}
Command::Key(bo) => {
cmd_admin(admin_rpc_cli, opt.rpc_host, AdminRPC::KeyOperation(bo)).await
}
Command::Repair(ro) => {
cmd_admin(admin_rpc_cli, opt.rpc_host, AdminRPC::LaunchRepair(ro)).await
}
};
if let Err(e) = resp {
error!("Error: {}", e);
}
}
async fn cmd_status(rpc_cli: RpcAddrClient<Message>, rpc_host: SocketAddr) -> Result<(), Error> {
let status = match rpc_cli
.call(&rpc_host, &Message::PullStatus, ADMIN_RPC_TIMEOUT)
.await??
{
Message::AdvertiseNodesUp(nodes) => nodes,
resp => return Err(Error::Message(format!("Invalid RPC response: {:?}", resp))),
};
let config = match rpc_cli
.call(&rpc_host, &Message::PullConfig, ADMIN_RPC_TIMEOUT)
.await??
{
Message::AdvertiseConfig(cfg) => cfg,
resp => return Err(Error::Message(format!("Invalid RPC response: {:?}", resp))),
};
println!("Healthy nodes:");
for adv in status.iter().filter(|x| x.is_up) {
if let Some(cfg) = config.members.get(&adv.id) {
println!(
"{:?}\t{}\t{}\t[{}]\t{}\t{}",
adv.id, adv.state_info.hostname, adv.addr, cfg.tag, cfg.datacenter, cfg.capacity
);
} else {
println!(
"{:?}\t{}\t{}\tUNCONFIGURED/REMOVED",
adv.id, adv.state_info.hostname, adv.addr
);
}
}
let status_keys = status.iter().map(|x| x.id).collect::<HashSet<_>>();
let failure_case_1 = status.iter().any(|x| !x.is_up);
let failure_case_2 = config
.members
.iter()
.any(|(id, _)| !status_keys.contains(id));
if failure_case_1 || failure_case_2 {
println!("\nFailed nodes:");
for adv in status.iter().filter(|x| !x.is_up) {
if let Some(cfg) = config.members.get(&adv.id) {
println!(
"{:?}\t{}\t{}\t[{}]\t{}\t{}\tlast seen: {}s ago",
adv.id,
adv.state_info.hostname,
adv.addr,
cfg.tag,
cfg.datacenter,
cfg.capacity,
(now_msec() - adv.last_seen) / 1000,
);
}
}
for (id, cfg) in config.members.iter() {
if !status.iter().any(|x| x.id == *id) {
println!(
"{:?}\t{}\t{}\t{}\tnever seen",
id, cfg.tag, cfg.datacenter, cfg.capacity
);
}
}
}
Ok(())
}
async fn cmd_configure(
rpc_cli: RpcAddrClient<Message>,
rpc_host: SocketAddr,
args: ConfigureNodeOpt,
) -> Result<(), Error> {
let status = match rpc_cli
.call(&rpc_host, &Message::PullStatus, ADMIN_RPC_TIMEOUT)
.await??
{
Message::AdvertiseNodesUp(nodes) => nodes,
resp => return Err(Error::Message(format!("Invalid RPC response: {:?}", resp))),
};
let mut candidates = vec![];
for adv in status.iter() {
if hex::encode(&adv.id).starts_with(&args.node_id) {
candidates.push(adv.id);
}
}
if candidates.len() != 1 {
return Err(Error::Message(format!(
"{} matching nodes",
candidates.len()
)));
}
let mut config = match rpc_cli
.call(&rpc_host, &Message::PullConfig, ADMIN_RPC_TIMEOUT)
.await??
{
Message::AdvertiseConfig(cfg) => cfg,
resp => return Err(Error::Message(format!("Invalid RPC response: {:?}", resp))),
};
let new_entry = match config.members.get(&candidates[0]) {
None => NetworkConfigEntry {
datacenter: args
.datacenter
.expect("Please specifiy a datacenter with the -d flag"),
capacity: args
.capacity
.expect("Please specifiy a capacity with the -c flag"),
tag: args.tag.unwrap_or("".to_string()),
},
Some(old) => NetworkConfigEntry {
datacenter: args.datacenter.unwrap_or(old.datacenter.to_string()),
capacity: args.capacity.unwrap_or(old.capacity),
tag: args.tag.unwrap_or(old.tag.to_string()),
},
};
config.members.insert(candidates[0].clone(), new_entry);
config.version += 1;
rpc_cli
.call(
&rpc_host,
&Message::AdvertiseConfig(config),
ADMIN_RPC_TIMEOUT,
)
.await??;
Ok(())
}
async fn cmd_remove(
rpc_cli: RpcAddrClient<Message>,
rpc_host: SocketAddr,
args: RemoveNodeOpt,
) -> Result<(), Error> {
let mut config = match rpc_cli
.call(&rpc_host, &Message::PullConfig, ADMIN_RPC_TIMEOUT)
.await??
{
Message::AdvertiseConfig(cfg) => cfg,
resp => return Err(Error::Message(format!("Invalid RPC response: {:?}", resp))),
};
let mut candidates = vec![];
for (key, _) in config.members.iter() {
if hex::encode(key).starts_with(&args.node_id) {
candidates.push(*key);
}
}
if candidates.len() != 1 {
return Err(Error::Message(format!(
"{} matching nodes",
candidates.len()
)));
}
if !args.yes {
return Err(Error::Message(format!(
"Add the flag --yes to really remove {:?} from the cluster",
candidates[0]
)));
}
config.members.remove(&candidates[0]);
config.version += 1;
rpc_cli
.call(
&rpc_host,
&Message::AdvertiseConfig(config),
ADMIN_RPC_TIMEOUT,
)
.await??;
Ok(())
}
async fn cmd_admin(
rpc_cli: RpcAddrClient<AdminRPC>,
rpc_host: SocketAddr,
args: AdminRPC,
) -> Result<(), Error> {
match rpc_cli.call(&rpc_host, args, ADMIN_RPC_TIMEOUT).await?? {
AdminRPC::Ok(msg) => {
println!("{}", msg);
}
AdminRPC::BucketList(bl) => {
println!("List of buckets:");
for bucket in bl {
println!("{}", bucket);
}
}
AdminRPC::BucketInfo(bucket) => {
println!("{:?}", bucket);
}
AdminRPC::KeyList(kl) => {
println!("List of keys:");
for key in kl {
println!("{}\t{}", key.0, key.1);
}
}
AdminRPC::KeyInfo(key) => {
println!("{:?}", key);
}
r => {
error!("Unexpected response: {:?}", r);
}
}
Ok(())
} }

View file

@ -16,7 +16,13 @@ pub struct Repair {
} }
impl Repair { impl Repair {
pub async fn repair_worker( pub async fn repair_worker(&self, opt: RepairOpt, must_exit: watch::Receiver<bool>) {
if let Err(e) = self.repair_worker_aux(opt, must_exit).await {
warn!("Repair worker failed with error: {}", e);
}
}
async fn repair_worker_aux(
&self, &self,
opt: RepairOpt, opt: RepairOpt,
must_exit: watch::Receiver<bool>, must_exit: watch::Receiver<bool>,
@ -25,41 +31,11 @@ impl Repair {
if todo(RepairWhat::Tables) { if todo(RepairWhat::Tables) {
info!("Launching a full sync of tables"); info!("Launching a full sync of tables");
self.garage self.garage.bucket_table.syncer.add_full_sync();
.bucket_table self.garage.object_table.syncer.add_full_sync();
.syncer self.garage.version_table.syncer.add_full_sync();
.load_full() self.garage.block_ref_table.syncer.add_full_sync();
.unwrap() self.garage.key_table.syncer.add_full_sync();
.add_full_scan()
.await;
self.garage
.object_table
.syncer
.load_full()
.unwrap()
.add_full_scan()
.await;
self.garage
.version_table
.syncer
.load_full()
.unwrap()
.add_full_scan()
.await;
self.garage
.block_ref_table
.syncer
.load_full()
.unwrap()
.add_full_scan()
.await;
self.garage
.key_table
.syncer
.load_full()
.unwrap()
.add_full_scan()
.await;
} }
// TODO: wait for full sync to finish before proceeding to the rest? // TODO: wait for full sync to finish before proceeding to the rest?
@ -93,11 +69,13 @@ impl Repair {
async fn repair_versions(&self, must_exit: &watch::Receiver<bool>) -> Result<(), Error> { async fn repair_versions(&self, must_exit: &watch::Receiver<bool>) -> Result<(), Error> {
let mut pos = vec![]; let mut pos = vec![];
while let Some((item_key, item_bytes)) = self.garage.version_table.store.get_gt(&pos)? { while let Some((item_key, item_bytes)) =
self.garage.version_table.data.store.get_gt(&pos)?
{
pos = item_key.to_vec(); pos = item_key.to_vec();
let version = rmp_serde::decode::from_read_ref::<_, Version>(item_bytes.as_ref())?; let version = rmp_serde::decode::from_read_ref::<_, Version>(item_bytes.as_ref())?;
if version.deleted { if version.deleted.get() {
continue; continue;
} }
let object = self let object = self
@ -110,13 +88,7 @@ impl Repair {
.versions() .versions()
.iter() .iter()
.any(|x| x.uuid == version.uuid && x.state != ObjectVersionState::Aborted), .any(|x| x.uuid == version.uuid && x.state != ObjectVersionState::Aborted),
None => { None => false,
warn!(
"Repair versions: object for version {:?} not found, skipping.",
version
);
continue;
}
}; };
if !version_exists { if !version_exists {
info!("Repair versions: marking version as deleted: {:?}", version); info!("Repair versions: marking version as deleted: {:?}", version);
@ -127,7 +99,6 @@ impl Repair {
version.bucket, version.bucket,
version.key, version.key,
true, true,
vec![],
)) ))
.await?; .await?;
} }
@ -142,11 +113,13 @@ impl Repair {
async fn repair_block_ref(&self, must_exit: &watch::Receiver<bool>) -> Result<(), Error> { async fn repair_block_ref(&self, must_exit: &watch::Receiver<bool>) -> Result<(), Error> {
let mut pos = vec![]; let mut pos = vec![];
while let Some((item_key, item_bytes)) = self.garage.block_ref_table.store.get_gt(&pos)? { while let Some((item_key, item_bytes)) =
self.garage.block_ref_table.data.store.get_gt(&pos)?
{
pos = item_key.to_vec(); pos = item_key.to_vec();
let block_ref = rmp_serde::decode::from_read_ref::<_, BlockRef>(item_bytes.as_ref())?; let block_ref = rmp_serde::decode::from_read_ref::<_, BlockRef>(item_bytes.as_ref())?;
if block_ref.deleted { if block_ref.deleted.get() {
continue; continue;
} }
let version = self let version = self
@ -154,16 +127,8 @@ impl Repair {
.version_table .version_table
.get(&block_ref.version, &EmptyKey) .get(&block_ref.version, &EmptyKey)
.await?; .await?;
let ref_exists = match version { // The version might not exist if it has been GC'ed
Some(v) => !v.deleted, let ref_exists = version.map(|v| !v.deleted.get()).unwrap_or(false);
None => {
warn!(
"Block ref repair: version for block ref {:?} not found, skipping.",
block_ref
);
continue;
}
};
if !ref_exists { if !ref_exists {
info!( info!(
"Repair block ref: marking block_ref as deleted: {:?}", "Repair block ref: marking block_ref as deleted: {:?}",
@ -174,7 +139,7 @@ impl Repair {
.insert(&BlockRef { .insert(&BlockRef {
block: block_ref.block, block: block_ref.block,
version: block_ref.version, version: block_ref.version,
deleted: true, deleted: true.into(),
}) })
.await?; .await?;
} }

View file

@ -21,13 +21,13 @@ async fn shutdown_signal(send_cancel: watch::Sender<bool>) -> Result<(), Error>
.await .await
.expect("failed to install CTRL+C signal handler"); .expect("failed to install CTRL+C signal handler");
info!("Received CTRL+C, shutting down."); info!("Received CTRL+C, shutting down.");
send_cancel.broadcast(true)?; send_cancel.send(true)?;
Ok(()) Ok(())
} }
async fn wait_from(mut chan: watch::Receiver<bool>) -> () { async fn wait_from(mut chan: watch::Receiver<bool>) -> () {
while let Some(exit_now) = chan.recv().await { while !*chan.borrow() {
if exit_now { if chan.changed().await.is_err() {
return; return;
} }
} }
@ -40,37 +40,22 @@ pub async fn run_server(config_file: PathBuf) -> Result<(), Error> {
info!("Opening database..."); info!("Opening database...");
let mut db_path = config.metadata_dir.clone(); let mut db_path = config.metadata_dir.clone();
db_path.push("db"); db_path.push("db");
let db = match sled::open(&db_path) { let db = sled::open(&db_path).expect("Unable to open sled DB");
Ok(db) => db,
Err(e) => {
warn!("Old DB could not be openned ({}), attempting migration.", e);
let old = old_sled::open(&db_path).expect("Unable to open old DB for migration");
let mut new_path = config.metadata_dir.clone();
new_path.push("db2");
let new = sled::open(&new_path).expect("Unable to open new DB for migration");
new.import(old.export());
if old.checksum().expect("unable to compute old db checksum")
!= new.checksum().expect("unable to compute new db checksum")
{
panic!("db checksums don't match after migration");
}
drop(new);
drop(old);
std::fs::remove_dir_all(&db_path).expect("Cannot remove old DB folder");
std::fs::rename(new_path, &db_path)
.expect("Cannot move new DB folder to correct place");
sled::open(db_path).expect("Unable to open new DB after migration")
}
};
info!("Initialize RPC server..."); info!("Initialize RPC server...");
let mut rpc_server = RpcServer::new(config.rpc_bind_addr.clone(), config.rpc_tls.clone()); let mut rpc_server = RpcServer::new(config.rpc_bind_addr.clone(), config.rpc_tls.clone());
info!("Initializing background runner..."); info!("Initializing background runner...");
let (send_cancel, watch_cancel) = watch::channel(false); let (send_cancel, watch_cancel) = watch::channel(false);
let background = BackgroundRunner::new(16, watch_cancel.clone()); let (background, await_background_done) = BackgroundRunner::new(16, watch_cancel.clone());
let garage = Garage::new(config, db, background.clone(), &mut rpc_server).await; info!("Initializing Garage main data store...");
let garage = Garage::new(config.clone(), db, background, &mut rpc_server);
let bootstrap = garage.system.clone().bootstrap(
&config.bootstrap_peers[..],
config.consul_host,
config.consul_service_name,
);
info!("Crate admin RPC handler..."); info!("Crate admin RPC handler...");
AdminRpcHandler::new(garage.clone()).register_handler(&mut rpc_server); AdminRpcHandler::new(garage.clone()).register_handler(&mut rpc_server);
@ -78,18 +63,10 @@ pub async fn run_server(config_file: PathBuf) -> Result<(), Error> {
info!("Initializing RPC and API servers..."); info!("Initializing RPC and API servers...");
let run_rpc_server = Arc::new(rpc_server).run(wait_from(watch_cancel.clone())); let run_rpc_server = Arc::new(rpc_server).run(wait_from(watch_cancel.clone()));
let api_server = api_server::run_api_server(garage.clone(), wait_from(watch_cancel.clone())); let api_server = api_server::run_api_server(garage.clone(), wait_from(watch_cancel.clone()));
let web_server = web_server::run_web_server(garage.clone(), wait_from(watch_cancel.clone())); let web_server = web_server::run_web_server(garage, wait_from(watch_cancel.clone()));
futures::try_join!( futures::try_join!(
garage bootstrap.map(|rv| {
.system
.clone()
.bootstrap(
&garage.config.bootstrap_peers[..],
garage.config.consul_host.clone(),
garage.config.consul_service_name.clone()
)
.map(|rv| {
info!("Bootstrap done"); info!("Bootstrap done");
Ok(rv) Ok(rv)
}), }),
@ -105,9 +82,9 @@ pub async fn run_server(config_file: PathBuf) -> Result<(), Error> {
info!("Web server exited"); info!("Web server exited");
rv rv
}), }),
background.run().map(|rv| { await_background_done.map(|rv| {
info!("Background runner exited"); info!("Background runner exited: {:?}", rv);
Ok(rv) Ok(())
}), }),
shutdown_signal(send_cancel), shutdown_signal(send_cancel),
)?; )?;

View file

@ -16,23 +16,18 @@ path = "lib.rs"
garage_util = { version = "0.1.1", path = "../util" } garage_util = { version = "0.1.1", path = "../util" }
garage_rpc = { version = "0.1.1", path = "../rpc" } garage_rpc = { version = "0.1.1", path = "../rpc" }
garage_table = { version = "0.1.1", path = "../table" } garage_table = { version = "0.1.1", path = "../table" }
model010 = { package = "garage_model_010b", version = "0.0.1" }
bytes = "0.4" rand = "0.8"
rand = "0.7" hex = "0.4"
hex = "0.3" arc-swap = "1.0"
sha2 = "0.8"
arc-swap = "0.4"
log = "0.4" log = "0.4"
sled = "0.34" sled = "0.34"
rmp-serde = "0.14.3" rmp-serde = "0.15"
serde = { version = "1.0", default-features = false, features = ["derive", "rc"] } serde = { version = "1.0", default-features = false, features = ["derive", "rc"] }
serde_bytes = "0.11" serde_bytes = "0.11"
async-trait = "0.1.30"
futures = "0.3" futures = "0.3"
futures-util = "0.3" futures-util = "0.3"
tokio = { version = "0.2", default-features = false, features = ["rt-core", "rt-threaded", "io-driver", "net", "tcp", "time", "macros", "sync", "signal", "fs"] } tokio = { version = "1.0", default-features = false, features = ["rt", "rt-multi-thread", "io-util", "net", "time", "macros", "sync", "signal", "fs"] }

View file

@ -5,22 +5,20 @@ use std::time::Duration;
use arc_swap::ArcSwapOption; use arc_swap::ArcSwapOption;
use futures::future::*; use futures::future::*;
use futures::select; use futures::select;
use futures::stream::*;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use tokio::fs; use tokio::fs;
use tokio::prelude::*; use tokio::io::{AsyncReadExt, AsyncWriteExt};
use tokio::sync::{watch, Mutex, Notify}; use tokio::sync::{watch, Mutex, Notify};
use garage_util::data;
use garage_util::data::*; use garage_util::data::*;
use garage_util::error::Error; use garage_util::error::Error;
use garage_util::time::*;
use garage_rpc::membership::System; use garage_rpc::membership::System;
use garage_rpc::rpc_client::*; use garage_rpc::rpc_client::*;
use garage_rpc::rpc_server::*; use garage_rpc::rpc_server::*;
use garage_table::table_sharded::TableShardedReplication; use garage_table::replication::{sharded::TableShardedReplication, TableReplication};
use garage_table::TableReplication;
use crate::block_ref_table::*; use crate::block_ref_table::*;
@ -28,7 +26,10 @@ use crate::garage::Garage;
pub const INLINE_THRESHOLD: usize = 3072; pub const INLINE_THRESHOLD: usize = 3072;
pub const BACKGROUND_WORKERS: u64 = 1;
const BLOCK_RW_TIMEOUT: Duration = Duration::from_secs(42); const BLOCK_RW_TIMEOUT: Duration = Duration::from_secs(42);
const BLOCK_GC_TIMEOUT: Duration = Duration::from_secs(60);
const NEED_BLOCK_QUERY_TIMEOUT: Duration = Duration::from_secs(5); const NEED_BLOCK_QUERY_TIMEOUT: Duration = Duration::from_secs(5);
const RESYNC_RETRY_TIMEOUT: Duration = Duration::from_secs(10); const RESYNC_RETRY_TIMEOUT: Duration = Duration::from_secs(10);
@ -56,14 +57,14 @@ pub struct BlockManager {
pub data_dir: PathBuf, pub data_dir: PathBuf,
pub data_dir_lock: Mutex<()>, pub data_dir_lock: Mutex<()>,
pub rc: sled::Tree, rc: sled::Tree,
pub resync_queue: sled::Tree, resync_queue: sled::Tree,
pub resync_notify: Notify, resync_notify: Notify,
pub system: Arc<System>, system: Arc<System>,
rpc_client: Arc<RpcClient<Message>>, rpc_client: Arc<RpcClient<Message>>,
pub garage: ArcSwapOption<Garage>, pub(crate) garage: ArcSwapOption<Garage>,
} }
impl BlockManager { impl BlockManager {
@ -77,7 +78,6 @@ impl BlockManager {
let rc = db let rc = db
.open_tree("block_local_rc") .open_tree("block_local_rc")
.expect("Unable to open block_local_rc tree"); .expect("Unable to open block_local_rc tree");
rc.set_merge_operator(rc_merge);
let resync_queue = db let resync_queue = db
.open_tree("block_local_resync_queue") .open_tree("block_local_resync_queue")
@ -127,18 +127,16 @@ impl BlockManager {
} }
} }
pub async fn spawn_background_worker(self: Arc<Self>) { pub fn spawn_background_worker(self: Arc<Self>) {
// Launch 2 simultaneous workers for background resync loop preprocessing // Launch 2 simultaneous workers for background resync loop preprocessing
for i in 0..2usize { for i in 0..BACKGROUND_WORKERS {
let bm2 = self.clone(); let bm2 = self.clone();
let background = self.system.background.clone(); let background = self.system.background.clone();
tokio::spawn(async move { tokio::spawn(async move {
tokio::time::delay_for(Duration::from_secs(10)).await; tokio::time::sleep(Duration::from_secs(10 * (i + 1))).await;
background background.spawn_worker(format!("block resync worker {}", i), move |must_exit| {
.spawn_worker(format!("block resync worker {}", i), move |must_exit| {
bm2.resync_loop(must_exit) bm2.resync_loop(must_exit)
}) });
.await;
}); });
} }
} }
@ -168,7 +166,7 @@ impl BlockManager {
Ok(f) => f, Ok(f) => f,
Err(e) => { Err(e) => {
// Not found but maybe we should have had it ?? // Not found but maybe we should have had it ??
self.put_to_resync(hash, 0)?; self.put_to_resync(hash, Duration::from_millis(0))?;
return Err(Into::into(e)); return Err(Into::into(e));
} }
}; };
@ -176,11 +174,16 @@ impl BlockManager {
f.read_to_end(&mut data).await?; f.read_to_end(&mut data).await?;
drop(f); drop(f);
if data::sha256sum(&data[..]) != *hash { if blake2sum(&data[..]) != *hash {
let _lock = self.data_dir_lock.lock().await; let _lock = self.data_dir_lock.lock().await;
warn!("Block {:?} is corrupted. Deleting and resyncing.", hash); warn!(
fs::remove_file(path).await?; "Block {:?} is corrupted. Renaming to .corrupted and resyncing.",
self.put_to_resync(&hash, 0)?; hash
);
let mut path2 = path.clone();
path2.set_extension(".corrupted");
fs::rename(path, path2).await?;
self.put_to_resync(&hash, Duration::from_millis(0))?;
return Err(Error::CorruptData(*hash)); return Err(Error::CorruptData(*hash));
} }
@ -191,7 +194,7 @@ impl BlockManager {
let needed = self let needed = self
.rc .rc
.get(hash.as_ref())? .get(hash.as_ref())?
.map(|x| u64_from_bytes(x.as_ref()) > 0) .map(|x| u64_from_be_bytes(x) > 0)
.unwrap_or(false); .unwrap_or(false);
if needed { if needed {
let path = self.block_path(hash); let path = self.block_path(hash);
@ -215,84 +218,95 @@ impl BlockManager {
} }
pub fn block_incref(&self, hash: &Hash) -> Result<(), Error> { pub fn block_incref(&self, hash: &Hash) -> Result<(), Error> {
let old_rc = self.rc.get(&hash)?; let old_rc = self.rc.fetch_and_update(&hash, |old| {
self.rc.merge(&hash, vec![1])?; let old_v = old.map(u64_from_be_bytes).unwrap_or(0);
if old_rc.map(|x| u64_from_bytes(&x[..]) == 0).unwrap_or(true) { Some(u64::to_be_bytes(old_v + 1).to_vec())
self.put_to_resync(&hash, BLOCK_RW_TIMEOUT.as_millis() as u64)?; })?;
let old_rc = old_rc.map(u64_from_be_bytes).unwrap_or(0);
if old_rc == 0 {
self.put_to_resync(&hash, BLOCK_RW_TIMEOUT)?;
} }
Ok(()) Ok(())
} }
pub fn block_decref(&self, hash: &Hash) -> Result<(), Error> { pub fn block_decref(&self, hash: &Hash) -> Result<(), Error> {
let new_rc = self.rc.merge(&hash, vec![0])?; let new_rc = self.rc.update_and_fetch(&hash, |old| {
if new_rc.map(|x| u64_from_bytes(&x[..]) == 0).unwrap_or(true) { let old_v = old.map(u64_from_be_bytes).unwrap_or(0);
self.put_to_resync(&hash, 0)?; if old_v > 1 {
Some(u64::to_be_bytes(old_v - 1).to_vec())
} else {
None
}
})?;
if new_rc.is_none() {
self.put_to_resync(&hash, BLOCK_GC_TIMEOUT)?;
} }
Ok(()) Ok(())
} }
fn put_to_resync(&self, hash: &Hash, delay_millis: u64) -> Result<(), Error> { fn put_to_resync(&self, hash: &Hash, delay: Duration) -> Result<(), Error> {
let when = now_msec() + delay_millis; let when = now_msec() + delay.as_millis() as u64;
trace!("Put resync_queue: {} {:?}", when, hash); trace!("Put resync_queue: {} {:?}", when, hash);
let mut key = u64::to_be_bytes(when).to_vec(); let mut key = u64::to_be_bytes(when).to_vec();
key.extend(hash.as_ref()); key.extend(hash.as_ref());
self.resync_queue.insert(key, hash.as_ref())?; self.resync_queue.insert(key, hash.as_ref())?;
self.resync_notify.notify(); self.resync_notify.notify_waiters();
Ok(()) Ok(())
} }
async fn resync_loop( async fn resync_loop(self: Arc<Self>, mut must_exit: watch::Receiver<bool>) {
self: Arc<Self>,
mut must_exit: watch::Receiver<bool>,
) -> Result<(), Error> {
let mut n_failures = 0usize;
while !*must_exit.borrow() { while !*must_exit.borrow() {
if let Some((time_bytes, hash_bytes)) = self.resync_queue.pop_min()? { if let Err(e) = self.resync_iter(&mut must_exit).await {
let time_msec = u64_from_bytes(&time_bytes[0..8]); warn!("Error in block resync loop: {}", e);
select! {
_ = tokio::time::sleep(Duration::from_secs(1)).fuse() => (),
_ = must_exit.changed().fuse() => (),
}
}
}
}
async fn resync_iter(&self, must_exit: &mut watch::Receiver<bool>) -> Result<(), Error> {
if let Some(first_item) = self.resync_queue.iter().next() {
let (time_bytes, hash_bytes) = first_item?;
let time_msec = u64_from_be_bytes(&time_bytes[0..8]);
let now = now_msec(); let now = now_msec();
if now >= time_msec { if now >= time_msec {
let mut hash = [0u8; 32]; let hash = Hash::try_from(&hash_bytes[..]).unwrap();
hash.copy_from_slice(hash_bytes.as_ref()); let res = self.resync_block(&hash).await;
let hash = Hash::from(hash); if let Err(e) = &res {
warn!("Error when resyncing {:?}: {}", hash, e);
if let Err(e) = self.resync_iter(&hash).await { self.put_to_resync(&hash, RESYNC_RETRY_TIMEOUT)?;
warn!("Failed to resync block {:?}, retrying later: {}", hash, e);
self.put_to_resync(&hash, RESYNC_RETRY_TIMEOUT.as_millis() as u64)?;
n_failures += 1;
if n_failures >= 10 {
warn!("Too many resync failures, throttling.");
tokio::time::delay_for(Duration::from_secs(1)).await;
} }
self.resync_queue.remove(&time_bytes)?;
res?; // propagate error to delay main loop
} else { } else {
n_failures = 0; let delay = tokio::time::sleep(Duration::from_millis(time_msec - now));
}
} else {
self.resync_queue.insert(time_bytes, hash_bytes)?;
let delay = tokio::time::delay_for(Duration::from_millis(time_msec - now));
select! { select! {
_ = delay.fuse() => (), _ = delay.fuse() => (),
_ = self.resync_notify.notified().fuse() => (), _ = self.resync_notify.notified().fuse() => (),
_ = must_exit.recv().fuse() => (), _ = must_exit.changed().fuse() => (),
} }
} }
} else { } else {
select! { select! {
_ = self.resync_notify.notified().fuse() => (), _ = self.resync_notify.notified().fuse() => (),
_ = must_exit.recv().fuse() => (), _ = must_exit.changed().fuse() => (),
}
} }
} }
Ok(()) Ok(())
} }
async fn resync_iter(&self, hash: &Hash) -> Result<(), Error> { async fn resync_block(&self, hash: &Hash) -> Result<(), Error> {
let lock = self.data_dir_lock.lock().await;
let path = self.block_path(hash); let path = self.block_path(hash);
let exists = fs::metadata(&path).await.is_ok(); let exists = fs::metadata(&path).await.is_ok();
let needed = self let needed = self
.rc .rc
.get(hash.as_ref())? .get(hash.as_ref())?
.map(|x| u64_from_bytes(x.as_ref()) > 0) .map(|x| u64_from_be_bytes(x) > 0)
.unwrap_or(false); .unwrap_or(false);
if exists != needed { if exists != needed {
@ -305,9 +319,10 @@ impl BlockManager {
if exists && !needed { if exists && !needed {
trace!("Offloading block {:?}", hash); trace!("Offloading block {:?}", hash);
let ring = self.system.ring.borrow().clone(); let mut who = self.replication.write_nodes(&hash);
if who.len() < self.replication.write_quorum() {
let mut who = self.replication.replication_nodes(&hash, &ring); return Err(Error::Message(format!("Not trying to offload block because we don't have a quorum of nodes to write to")));
}
who.retain(|id| *id != self.system.id); who.retain(|id| *id != self.system.id);
let msg = Arc::new(Message::NeedBlockQuery(*hash)); let msg = Arc::new(Message::NeedBlockQuery(*hash));
@ -340,17 +355,17 @@ impl BlockManager {
need_nodes.len() need_nodes.len()
); );
let put_block_message = Arc::new(self.read_block(hash).await?); let put_block_message = self.read_block(hash).await?;
let put_resps = join_all(need_nodes.iter().map(|to| {
self.rpc_client self.rpc_client
.call_arc(*to, put_block_message.clone(), BLOCK_RW_TIMEOUT) .try_call_many(
})) &need_nodes[..],
.await; put_block_message,
for resp in put_resps { RequestStrategy::with_quorum(need_nodes.len())
resp?; .with_timeout(BLOCK_RW_TIMEOUT),
)
.await?;
} }
} info!(
trace!(
"Deleting block {:?}, offload finished ({} / {})", "Deleting block {:?}, offload finished ({} / {})",
hash, hash,
need_nodes.len(), need_nodes.len(),
@ -358,10 +373,11 @@ impl BlockManager {
); );
fs::remove_file(path).await?; fs::remove_file(path).await?;
self.resync_queue.remove(&hash)?;
} }
if needed && !exists { if needed && !exists {
drop(lock);
// TODO find a way to not do this if they are sending it to us // TODO find a way to not do this if they are sending it to us
// Let's suppose this isn't an issue for now with the BLOCK_RW_TIMEOUT delay // Let's suppose this isn't an issue for now with the BLOCK_RW_TIMEOUT delay
// between the RC being incremented and this part being called. // between the RC being incremented and this part being called.
@ -373,7 +389,7 @@ impl BlockManager {
} }
pub async fn rpc_get_block(&self, hash: &Hash) -> Result<Vec<u8>, Error> { pub async fn rpc_get_block(&self, hash: &Hash) -> Result<Vec<u8>, Error> {
let who = self.replication.read_nodes(&hash, &self.system); let who = self.replication.read_nodes(&hash);
let resps = self let resps = self
.rpc_client .rpc_client
.try_call_many( .try_call_many(
@ -397,12 +413,12 @@ impl BlockManager {
} }
pub async fn rpc_put_block(&self, hash: Hash, data: Vec<u8>) -> Result<(), Error> { pub async fn rpc_put_block(&self, hash: Hash, data: Vec<u8>) -> Result<(), Error> {
let who = self.replication.write_nodes(&hash, &self.system); let who = self.replication.write_nodes(&hash);
self.rpc_client self.rpc_client
.try_call_many( .try_call_many(
&who[..], &who[..],
Message::PutBlock(PutBlockMessage { hash, data }), Message::PutBlock(PutBlockMessage { hash, data }),
RequestStrategy::with_quorum(self.replication.write_quorum(&self.system)) RequestStrategy::with_quorum(self.replication.write_quorum())
.with_timeout(BLOCK_RW_TIMEOUT), .with_timeout(BLOCK_RW_TIMEOUT),
) )
.await?; .await?;
@ -414,15 +430,15 @@ impl BlockManager {
let garage = self.garage.load_full().unwrap(); let garage = self.garage.load_full().unwrap();
let mut last_hash = None; let mut last_hash = None;
let mut i = 0usize; let mut i = 0usize;
for entry in garage.block_ref_table.store.iter() { for entry in garage.block_ref_table.data.store.iter() {
let (_k, v_bytes) = entry?; let (_k, v_bytes) = entry?;
let block_ref = rmp_serde::decode::from_read_ref::<_, BlockRef>(v_bytes.as_ref())?; let block_ref = rmp_serde::decode::from_read_ref::<_, BlockRef>(v_bytes.as_ref())?;
if Some(&block_ref.block) == last_hash.as_ref() { if Some(&block_ref.block) == last_hash.as_ref() {
continue; continue;
} }
if !block_ref.deleted { if !block_ref.deleted.get() {
last_hash = Some(block_ref.block); last_hash = Some(block_ref.block);
self.put_to_resync(&block_ref.block, 0)?; self.put_to_resync(&block_ref.block, Duration::from_secs(0))?;
} }
i += 1; i += 1;
if i & 0xFF == 0 && *must_exit.borrow() { if i & 0xFF == 0 && *must_exit.borrow() {
@ -447,8 +463,12 @@ impl BlockManager {
// so that we can offload them if necessary and then delete them locally. // so that we can offload them if necessary and then delete them locally.
async move { async move {
let mut ls_data_dir = fs::read_dir(path).await?; let mut ls_data_dir = fs::read_dir(path).await?;
while let Some(data_dir_ent) = ls_data_dir.next().await { loop {
let data_dir_ent = data_dir_ent?; let data_dir_ent = ls_data_dir.next_entry().await?;
let data_dir_ent = match data_dir_ent {
Some(x) => x,
None => break,
};
let name = data_dir_ent.file_name(); let name = data_dir_ent.file_name();
let name = match name.into_string() { let name = match name.into_string() {
Ok(x) => x, Ok(x) => x,
@ -466,7 +486,7 @@ impl BlockManager {
}; };
let mut hash = [0u8; 32]; let mut hash = [0u8; 32];
hash.copy_from_slice(&hash_bytes[..]); hash.copy_from_slice(&hash_bytes[..]);
self.put_to_resync(&hash.into(), 0)?; self.put_to_resync(&hash.into(), Duration::from_secs(0))?;
} }
if *must_exit.borrow() { if *must_exit.borrow() {
@ -477,32 +497,19 @@ impl BlockManager {
} }
.boxed() .boxed()
} }
pub fn resync_queue_len(&self) -> usize {
self.resync_queue.len()
}
pub fn rc_len(&self) -> usize {
self.rc.len()
}
} }
fn u64_from_bytes(bytes: &[u8]) -> u64 { fn u64_from_be_bytes<T: AsRef<[u8]>>(bytes: T) -> u64 {
assert!(bytes.len() == 8); assert!(bytes.as_ref().len() == 8);
let mut x8 = [0u8; 8]; let mut x8 = [0u8; 8];
x8.copy_from_slice(bytes); x8.copy_from_slice(bytes.as_ref());
u64::from_be_bytes(x8) u64::from_be_bytes(x8)
} }
fn rc_merge(_key: &[u8], old: Option<&[u8]>, new: &[u8]) -> Option<Vec<u8>> {
let old = old.map(u64_from_bytes).unwrap_or(0);
assert!(new.len() == 1);
let new = match new[0] {
0 => {
if old > 0 {
old - 1
} else {
0
}
}
1 => old + 1,
_ => unreachable!(),
};
if new == 0 {
None
} else {
Some(u64::to_be_bytes(new).to_vec())
}
}

View file

@ -1,9 +1,9 @@
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use std::sync::Arc; use std::sync::Arc;
use garage_util::background::*;
use garage_util::data::*; use garage_util::data::*;
use garage_table::crdt::CRDT;
use garage_table::*; use garage_table::*;
use crate::block::*; use crate::block::*;
@ -17,7 +17,7 @@ pub struct BlockRef {
pub version: UUID, pub version: UUID,
// Keep track of deleted status // Keep track of deleted status
pub deleted: bool, pub deleted: crdt::Bool,
} }
impl Entry<Hash, UUID> for BlockRef { impl Entry<Hash, UUID> for BlockRef {
@ -27,16 +27,18 @@ impl Entry<Hash, UUID> for BlockRef {
fn sort_key(&self) -> &UUID { fn sort_key(&self) -> &UUID {
&self.version &self.version
} }
fn is_tombstone(&self) -> bool {
fn merge(&mut self, other: &Self) { self.deleted.get()
if other.deleted {
self.deleted = true;
} }
}
impl CRDT for BlockRef {
fn merge(&mut self, other: &Self) {
self.deleted.merge(&other.deleted);
} }
} }
pub struct BlockRefTable { pub struct BlockRefTable {
pub background: Arc<BackgroundRunner>,
pub block_manager: Arc<BlockManager>, pub block_manager: Arc<BlockManager>,
} }
@ -48,8 +50,8 @@ impl TableSchema for BlockRefTable {
fn updated(&self, old: Option<Self::E>, new: Option<Self::E>) { fn updated(&self, old: Option<Self::E>, new: Option<Self::E>) {
let block = &old.as_ref().or(new.as_ref()).unwrap().block; let block = &old.as_ref().or(new.as_ref()).unwrap().block;
let was_before = old.as_ref().map(|x| !x.deleted).unwrap_or(false); let was_before = old.as_ref().map(|x| !x.deleted.get()).unwrap_or(false);
let is_after = new.as_ref().map(|x| !x.deleted).unwrap_or(false); let is_after = new.as_ref().map(|x| !x.deleted.get()).unwrap_or(false);
if is_after && !was_before { if is_after && !was_before {
if let Err(e) = self.block_manager.block_incref(block) { if let Err(e) = self.block_manager.block_incref(block) {
warn!("block_incref failed for block {:?}: {}", block, e); warn!("block_incref failed for block {:?}: {}", block, e);
@ -63,6 +65,6 @@ impl TableSchema for BlockRefTable {
} }
fn matches_filter(entry: &Self::E, filter: &Self::Filter) -> bool { fn matches_filter(entry: &Self::E, filter: &Self::Filter) -> bool {
filter.apply(entry.deleted) filter.apply(entry.deleted.get())
} }
} }

View file

@ -5,11 +5,6 @@ use garage_table::*;
use crate::key_table::PermissionSet; use crate::key_table::PermissionSet;
// We import the same file but in its version 0.1.0.
// We can then access v0.1.0 data structures.
// We use them to perform migrations.
use model010::bucket_table as prev;
/// A bucket is a collection of objects /// A bucket is a collection of objects
/// ///
/// Its parameters are not directly accessible as: /// Its parameters are not directly accessible as:
@ -89,7 +84,9 @@ impl Entry<EmptyKey, String> for Bucket {
fn sort_key(&self) -> &String { fn sort_key(&self) -> &String {
&self.name &self.name
} }
}
impl CRDT for Bucket {
fn merge(&mut self, other: &Self) { fn merge(&mut self, other: &Self) {
self.state.merge(&other.state); self.state.merge(&other.state);
} }
@ -106,39 +103,4 @@ impl TableSchema for BucketTable {
fn matches_filter(entry: &Self::E, filter: &Self::Filter) -> bool { fn matches_filter(entry: &Self::E, filter: &Self::Filter) -> bool {
filter.apply(entry.is_deleted()) filter.apply(entry.is_deleted())
} }
fn try_migrate(bytes: &[u8]) -> Option<Self::E> {
let old = match rmp_serde::decode::from_read_ref::<_, prev::Bucket>(bytes) {
Ok(x) => x,
Err(_) => return None,
};
if old.deleted {
Some(Bucket {
name: old.name,
state: crdt::LWW::migrate_from_raw(old.timestamp, BucketState::Deleted),
})
} else {
let mut keys = crdt::LWWMap::new();
for ak in old.authorized_keys() {
keys.merge(&crdt::LWWMap::migrate_from_raw_item(
ak.key_id.clone(),
ak.timestamp,
PermissionSet {
allow_read: ak.allow_read,
allow_write: ak.allow_write,
},
));
}
let params = BucketParams {
authorized_keys: keys,
website: crdt::LWW::new(false),
};
Some(Bucket {
name: old.name,
state: crdt::LWW::migrate_from_raw(old.timestamp, BucketState::Present(params)),
})
}
}
} }

View file

@ -7,8 +7,8 @@ use garage_rpc::membership::System;
use garage_rpc::rpc_client::RpcHttpClient; use garage_rpc::rpc_client::RpcHttpClient;
use garage_rpc::rpc_server::RpcServer; use garage_rpc::rpc_server::RpcServer;
use garage_table::table_fullcopy::*; use garage_table::replication::fullcopy::*;
use garage_table::table_sharded::*; use garage_table::replication::sharded::*;
use garage_table::*; use garage_table::*;
use crate::block::*; use crate::block::*;
@ -35,7 +35,7 @@ pub struct Garage {
} }
impl Garage { impl Garage {
pub async fn new( pub fn new(
config: Config, config: Config,
db: sled::Db, db: sled::Db,
background: Arc<BackgroundRunner>, background: Arc<BackgroundRunner>,
@ -54,18 +54,23 @@ impl Garage {
); );
let data_rep_param = TableShardedReplication { let data_rep_param = TableShardedReplication {
system: system.clone(),
replication_factor: config.data_replication_factor, replication_factor: config.data_replication_factor,
write_quorum: (config.data_replication_factor + 1) / 2, write_quorum: (config.data_replication_factor + 1) / 2,
read_quorum: 1, read_quorum: 1,
}; };
let meta_rep_param = TableShardedReplication { let meta_rep_param = TableShardedReplication {
system: system.clone(),
replication_factor: config.meta_replication_factor, replication_factor: config.meta_replication_factor,
write_quorum: (config.meta_replication_factor + 1) / 2, write_quorum: (config.meta_replication_factor + 1) / 2,
read_quorum: (config.meta_replication_factor + 1) / 2, read_quorum: (config.meta_replication_factor + 1) / 2,
}; };
let control_rep_param = TableFullReplication::new(config.control_write_max_faults); let control_rep_param = TableFullReplication {
system: system.clone(),
max_faults: config.control_write_max_faults,
};
info!("Initialize block manager..."); info!("Initialize block manager...");
let block_manager = BlockManager::new( let block_manager = BlockManager::new(
@ -79,7 +84,6 @@ impl Garage {
info!("Initialize block_ref_table..."); info!("Initialize block_ref_table...");
let block_ref_table = Table::new( let block_ref_table = Table::new(
BlockRefTable { BlockRefTable {
background: background.clone(),
block_manager: block_manager.clone(), block_manager: block_manager.clone(),
}, },
data_rep_param.clone(), data_rep_param.clone(),
@ -87,8 +91,7 @@ impl Garage {
&db, &db,
"block_ref".to_string(), "block_ref".to_string(),
rpc_server, rpc_server,
) );
.await;
info!("Initialize version_table..."); info!("Initialize version_table...");
let version_table = Table::new( let version_table = Table::new(
@ -101,8 +104,7 @@ impl Garage {
&db, &db,
"version".to_string(), "version".to_string(),
rpc_server, rpc_server,
) );
.await;
info!("Initialize object_table..."); info!("Initialize object_table...");
let object_table = Table::new( let object_table = Table::new(
@ -115,8 +117,7 @@ impl Garage {
&db, &db,
"object".to_string(), "object".to_string(),
rpc_server, rpc_server,
) );
.await;
info!("Initialize bucket_table..."); info!("Initialize bucket_table...");
let bucket_table = Table::new( let bucket_table = Table::new(
@ -126,8 +127,7 @@ impl Garage {
&db, &db,
"bucket".to_string(), "bucket".to_string(),
rpc_server, rpc_server,
) );
.await;
info!("Initialize key_table_table..."); info!("Initialize key_table_table...");
let key_table = Table::new( let key_table = Table::new(
@ -137,8 +137,7 @@ impl Garage {
&db, &db,
"key".to_string(), "key".to_string(),
rpc_server, rpc_server,
) );
.await;
info!("Initialize Garage..."); info!("Initialize Garage...");
let garage = Arc::new(Self { let garage = Arc::new(Self {
@ -156,7 +155,7 @@ impl Garage {
info!("Start block manager background thread..."); info!("Start block manager background thread...");
garage.block_manager.garage.swap(Some(garage.clone())); garage.block_manager.garage.swap(Some(garage.clone()));
garage.block_manager.clone().spawn_background_worker().await; garage.block_manager.clone().spawn_background_worker();
garage garage
} }

View file

@ -1,10 +1,8 @@
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use garage_table::crdt::CRDT; use garage_table::crdt::*;
use garage_table::*; use garage_table::*;
use model010::key_table as prev;
#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] #[derive(PartialEq, Clone, Debug, Serialize, Deserialize)]
pub struct Key { pub struct Key {
// Primary key // Primary key
@ -36,6 +34,15 @@ impl Key {
authorized_buckets: crdt::LWWMap::new(), authorized_buckets: crdt::LWWMap::new(),
} }
} }
pub fn import(key_id: &str, secret_key: &str, name: &str) -> Self {
Self {
key_id: key_id.to_string(),
secret_key: secret_key.to_string(),
name: crdt::LWW::new(name.to_string()),
deleted: crdt::Bool::new(false),
authorized_buckets: crdt::LWWMap::new(),
}
}
pub fn delete(key_id: String) -> Self { pub fn delete(key_id: String) -> Self {
Self { Self {
key_id, key_id,
@ -66,6 +73,10 @@ pub struct PermissionSet {
pub allow_write: bool, pub allow_write: bool,
} }
impl AutoCRDT for PermissionSet {
const WARN_IF_DIFFERENT: bool = true;
}
impl Entry<EmptyKey, String> for Key { impl Entry<EmptyKey, String> for Key {
fn partition_key(&self) -> &EmptyKey { fn partition_key(&self) -> &EmptyKey {
&EmptyKey &EmptyKey
@ -73,55 +84,43 @@ impl Entry<EmptyKey, String> for Key {
fn sort_key(&self) -> &String { fn sort_key(&self) -> &String {
&self.key_id &self.key_id
} }
}
impl CRDT for Key {
fn merge(&mut self, other: &Self) { fn merge(&mut self, other: &Self) {
self.name.merge(&other.name); self.name.merge(&other.name);
self.deleted.merge(&other.deleted); self.deleted.merge(&other.deleted);
if self.deleted.get() { if self.deleted.get() {
self.authorized_buckets.clear(); self.authorized_buckets.clear();
return; } else {
}
self.authorized_buckets.merge(&other.authorized_buckets); self.authorized_buckets.merge(&other.authorized_buckets);
} }
}
} }
pub struct KeyTable; pub struct KeyTable;
#[derive(Clone, Debug, Serialize, Deserialize)]
pub enum KeyFilter {
Deleted(DeletedFilter),
Matches(String),
}
impl TableSchema for KeyTable { impl TableSchema for KeyTable {
type P = EmptyKey; type P = EmptyKey;
type S = String; type S = String;
type E = Key; type E = Key;
type Filter = DeletedFilter; type Filter = KeyFilter;
fn matches_filter(entry: &Self::E, filter: &Self::Filter) -> bool { fn matches_filter(entry: &Self::E, filter: &Self::Filter) -> bool {
filter.apply(entry.deleted.get()) match filter {
KeyFilter::Deleted(df) => df.apply(entry.deleted.get()),
KeyFilter::Matches(pat) => {
let pat = pat.to_lowercase();
entry.key_id.to_lowercase().starts_with(&pat)
|| entry.name.get().to_lowercase() == pat
} }
fn try_migrate(bytes: &[u8]) -> Option<Self::E> {
let old = match rmp_serde::decode::from_read_ref::<_, prev::Key>(bytes) {
Ok(x) => x,
Err(_) => return None,
};
let mut new = Self::E {
key_id: old.key_id.clone(),
secret_key: old.secret_key.clone(),
name: crdt::LWW::migrate_from_raw(old.name_timestamp, old.name.clone()),
deleted: crdt::Bool::new(old.deleted),
authorized_buckets: crdt::LWWMap::new(),
};
for ab in old.authorized_buckets() {
let it = crdt::LWWMap::migrate_from_raw_item(
ab.bucket.clone(),
ab.timestamp,
PermissionSet {
allow_read: ab.allow_read,
allow_write: ab.allow_write,
},
);
new.authorized_buckets.merge(&it);
} }
Some(new)
} }
} }

View file

@ -5,13 +5,12 @@ use std::sync::Arc;
use garage_util::background::BackgroundRunner; use garage_util::background::BackgroundRunner;
use garage_util::data::*; use garage_util::data::*;
use garage_table::table_sharded::*; use garage_table::crdt::*;
use garage_table::replication::sharded::*;
use garage_table::*; use garage_table::*;
use crate::version_table::*; use crate::version_table::*;
use model010::object_table as prev;
#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] #[derive(PartialEq, Clone, Debug, Serialize, Deserialize)]
pub struct Object { pub struct Object {
// Primary key // Primary key
@ -70,7 +69,7 @@ pub enum ObjectVersionState {
Aborted, Aborted,
} }
impl ObjectVersionState { impl CRDT for ObjectVersionState {
fn merge(&mut self, other: &Self) { fn merge(&mut self, other: &Self) {
use ObjectVersionState::*; use ObjectVersionState::*;
match other { match other {
@ -91,37 +90,30 @@ impl ObjectVersionState {
} }
} }
#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)]
pub enum ObjectVersionData { pub enum ObjectVersionData {
DeleteMarker, DeleteMarker,
Inline(ObjectVersionMeta, #[serde(with = "serde_bytes")] Vec<u8>), Inline(ObjectVersionMeta, #[serde(with = "serde_bytes")] Vec<u8>),
FirstBlock(ObjectVersionMeta, Hash), FirstBlock(ObjectVersionMeta, Hash),
} }
#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] impl AutoCRDT for ObjectVersionData {
const WARN_IF_DIFFERENT: bool = true;
}
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)]
pub struct ObjectVersionMeta { pub struct ObjectVersionMeta {
pub headers: ObjectVersionHeaders, pub headers: ObjectVersionHeaders,
pub size: u64, pub size: u64,
pub etag: String, pub etag: String,
} }
#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)]
pub struct ObjectVersionHeaders { pub struct ObjectVersionHeaders {
pub content_type: String, pub content_type: String,
pub other: BTreeMap<String, String>, pub other: BTreeMap<String, String>,
} }
impl ObjectVersionData {
fn merge(&mut self, b: &Self) {
if *self != *b {
warn!(
"Inconsistent object version data: {:?} (local) vs {:?} (remote)",
self, b
);
}
}
}
impl ObjectVersion { impl ObjectVersion {
fn cmp_key(&self) -> (u64, UUID) { fn cmp_key(&self) -> (u64, UUID) {
(self.timestamp, self.uuid) (self.timestamp, self.uuid)
@ -154,8 +146,14 @@ impl Entry<String, String> for Object {
fn sort_key(&self) -> &String { fn sort_key(&self) -> &String {
&self.key &self.key
} }
fn is_tombstone(&self) -> bool {
self.versions.len() == 1 && self.versions[0].state == ObjectVersionState::Complete(ObjectVersionData::DeleteMarker)
}
}
impl CRDT for Object {
fn merge(&mut self, other: &Self) { fn merge(&mut self, other: &Self) {
// Merge versions from other into here
for other_v in other.versions.iter() { for other_v in other.versions.iter() {
match self match self
.versions .versions
@ -169,6 +167,9 @@ impl Entry<String, String> for Object {
} }
} }
} }
// Remove versions which are obsolete, i.e. those that come
// before the last version which .is_complete().
let last_complete = self let last_complete = self
.versions .versions
.iter() .iter()
@ -212,13 +213,8 @@ impl TableSchema for ObjectTable {
} }
}; };
if newly_deleted { if newly_deleted {
let deleted_version = Version::new( let deleted_version =
v.uuid, Version::new(v.uuid, old_v.bucket.clone(), old_v.key.clone(), true);
old_v.bucket.clone(),
old_v.key.clone(),
true,
vec![],
);
version_table.insert(&deleted_version).await?; version_table.insert(&deleted_version).await?;
} }
} }
@ -231,55 +227,4 @@ impl TableSchema for ObjectTable {
let deleted = !entry.versions.iter().any(|v| v.is_data()); let deleted = !entry.versions.iter().any(|v| v.is_data());
filter.apply(deleted) filter.apply(deleted)
} }
fn try_migrate(bytes: &[u8]) -> Option<Self::E> {
let old = match rmp_serde::decode::from_read_ref::<_, prev::Object>(bytes) {
Ok(x) => x,
Err(_) => return None,
};
let new_v = old
.versions()
.iter()
.map(migrate_version)
.collect::<Vec<_>>();
let new = Object::new(old.bucket.clone(), old.key.clone(), new_v);
Some(new)
}
}
fn migrate_version(old: &prev::ObjectVersion) -> ObjectVersion {
let headers = ObjectVersionHeaders {
content_type: old.mime_type.clone(),
other: BTreeMap::new(),
};
let meta = ObjectVersionMeta {
headers: headers.clone(),
size: old.size,
etag: "".to_string(),
};
let state = match old.state {
prev::ObjectVersionState::Uploading => ObjectVersionState::Uploading(headers),
prev::ObjectVersionState::Aborted => ObjectVersionState::Aborted,
prev::ObjectVersionState::Complete => match &old.data {
prev::ObjectVersionData::Uploading => ObjectVersionState::Uploading(headers),
prev::ObjectVersionData::DeleteMarker => {
ObjectVersionState::Complete(ObjectVersionData::DeleteMarker)
}
prev::ObjectVersionData::Inline(x) => {
ObjectVersionState::Complete(ObjectVersionData::Inline(meta, x.clone()))
}
prev::ObjectVersionData::FirstBlock(h) => {
let mut hash = [0u8; 32];
hash.copy_from_slice(h.as_ref());
ObjectVersionState::Complete(ObjectVersionData::FirstBlock(meta, Hash::from(hash)))
}
},
};
let mut uuid = [0u8; 32];
uuid.copy_from_slice(old.uuid.as_ref());
ObjectVersion {
uuid: UUID::from(uuid),
timestamp: old.timestamp,
state,
}
} }

View file

@ -4,7 +4,8 @@ use std::sync::Arc;
use garage_util::background::BackgroundRunner; use garage_util::background::BackgroundRunner;
use garage_util::data::*; use garage_util::data::*;
use garage_table::table_sharded::*; use garage_table::crdt::*;
use garage_table::replication::sharded::*;
use garage_table::*; use garage_table::*;
use crate::block_ref_table::*; use crate::block_ref_table::*;
@ -15,8 +16,11 @@ pub struct Version {
pub uuid: UUID, pub uuid: UUID,
// Actual data: the blocks for this version // Actual data: the blocks for this version
pub deleted: bool, // In the case of a multipart upload, also store the etags
blocks: Vec<VersionBlock>, // of individual parts and check them when doing CompleteMultipartUpload
pub deleted: crdt::Bool,
pub blocks: crdt::Map<VersionBlockKey, VersionBlock>,
pub parts_etags: crdt::Map<u64, String>,
// Back link to bucket+key so that we can figure if // Back link to bucket+key so that we can figure if
// this was deleted later on // this was deleted later on
@ -25,56 +29,46 @@ pub struct Version {
} }
impl Version { impl Version {
pub fn new( pub fn new(uuid: UUID, bucket: String, key: String, deleted: bool) -> Self {
uuid: UUID, Self {
bucket: String,
key: String,
deleted: bool,
blocks: Vec<VersionBlock>,
) -> Self {
let mut ret = Self {
uuid, uuid,
deleted, deleted: deleted.into(),
blocks: vec![], blocks: crdt::Map::new(),
parts_etags: crdt::Map::new(),
bucket, bucket,
key, key,
};
for b in blocks {
ret.add_block(b)
.expect("Twice the same VersionBlock in Version constructor");
} }
ret
}
/// Adds a block if it wasn't already present
pub fn add_block(&mut self, new: VersionBlock) -> Result<(), ()> {
match self
.blocks
.binary_search_by(|b| b.cmp_key().cmp(&new.cmp_key()))
{
Err(i) => {
self.blocks.insert(i, new);
Ok(())
}
Ok(_) => Err(()),
}
}
pub fn blocks(&self) -> &[VersionBlock] {
&self.blocks[..]
} }
} }
#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] #[derive(PartialEq, Eq, Clone, Copy, Debug, Serialize, Deserialize)]
pub struct VersionBlock { pub struct VersionBlockKey {
pub part_number: u64, pub part_number: u64,
pub offset: u64, pub offset: u64,
}
impl Ord for VersionBlockKey {
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
self.part_number
.cmp(&other.part_number)
.then(self.offset.cmp(&other.offset))
}
}
impl PartialOrd for VersionBlockKey {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
}
}
#[derive(PartialEq, Eq, Ord, PartialOrd, Clone, Copy, Debug, Serialize, Deserialize)]
pub struct VersionBlock {
pub hash: Hash, pub hash: Hash,
pub size: u64, pub size: u64,
} }
impl VersionBlock { impl AutoCRDT for VersionBlock {
fn cmp_key(&self) -> (u64, u64) { const WARN_IF_DIFFERENT: bool = true;
(self.part_number, self.offset)
}
} }
impl Entry<Hash, EmptyKey> for Version { impl Entry<Hash, EmptyKey> for Version {
@ -84,23 +78,21 @@ impl Entry<Hash, EmptyKey> for Version {
fn sort_key(&self) -> &EmptyKey { fn sort_key(&self) -> &EmptyKey {
&EmptyKey &EmptyKey
} }
fn is_tombstone(&self) -> bool {
self.deleted.get()
}
}
impl CRDT for Version {
fn merge(&mut self, other: &Self) { fn merge(&mut self, other: &Self) {
if other.deleted { self.deleted.merge(&other.deleted);
self.deleted = true;
if self.deleted.get() {
self.blocks.clear(); self.blocks.clear();
} else if !self.deleted { self.parts_etags.clear();
for bi in other.blocks.iter() { } else {
match self self.blocks.merge(&other.blocks);
.blocks self.parts_etags.merge(&other.parts_etags);
.binary_search_by(|x| x.cmp_key().cmp(&bi.cmp_key()))
{
Ok(_) => (),
Err(pos) => {
self.blocks.insert(pos, bi.clone());
}
}
}
} }
} }
} }
@ -121,14 +113,15 @@ impl TableSchema for VersionTable {
self.background.spawn(async move { self.background.spawn(async move {
if let (Some(old_v), Some(new_v)) = (old, new) { if let (Some(old_v), Some(new_v)) = (old, new) {
// Propagate deletion of version blocks // Propagate deletion of version blocks
if new_v.deleted && !old_v.deleted { if new_v.deleted.get() && !old_v.deleted.get() {
let deleted_block_refs = old_v let deleted_block_refs = old_v
.blocks .blocks
.items()
.iter() .iter()
.map(|vb| BlockRef { .map(|(_k, vb)| BlockRef {
block: vb.hash, block: vb.hash,
version: old_v.uuid, version: old_v.uuid,
deleted: true, deleted: true.into(),
}) })
.collect::<Vec<_>>(); .collect::<Vec<_>>();
block_ref_table.insert_many(&deleted_block_refs[..]).await?; block_ref_table.insert_many(&deleted_block_refs[..]).await?;
@ -139,6 +132,6 @@ impl TableSchema for VersionTable {
} }
fn matches_filter(entry: &Self::E, filter: &Self::Filter) -> bool { fn matches_filter(entry: &Self::E, filter: &Self::Filter) -> bool {
filter.apply(entry.deleted) filter.apply(entry.deleted.get())
} }
} }

View file

@ -15,27 +15,26 @@ path = "lib.rs"
[dependencies] [dependencies]
garage_util = { version = "0.1.1", path = "../util" } garage_util = { version = "0.1.1", path = "../util" }
bytes = "0.4" bytes = "1.0"
rand = "0.7" hex = "0.4"
hex = "0.3" arc-swap = "1.0"
sha2 = "0.8"
arc-swap = "0.4"
gethostname = "0.2" gethostname = "0.2"
log = "0.4" log = "0.4"
rmp-serde = "0.14.3" rmp-serde = "0.15"
serde = { version = "1.0", default-features = false, features = ["derive", "rc"] } serde = { version = "1.0", default-features = false, features = ["derive", "rc"] }
serde_json = "1.0" serde_json = "1.0"
futures = "0.3" futures = "0.3"
futures-util = "0.3" futures-util = "0.3"
tokio = { version = "0.2", default-features = false, features = ["rt-core", "rt-threaded", "io-driver", "net", "tcp", "time", "macros", "sync", "signal", "fs"] } tokio = { version = "1.0", default-features = false, features = ["rt", "rt-multi-thread", "io-util", "net", "time", "macros", "sync", "signal", "fs"] }
tokio-stream = { version = "0.1", features = ["net"] }
http = "0.2" http = "0.2"
hyper = "0.13" hyper = { version = "0.14", features = ["full"] }
rustls = "0.17" rustls = "0.19"
tokio-rustls = "0.13" tokio-rustls = "0.22"
hyper-rustls = { version = "0.20", default-features = false } hyper-rustls = { version = "0.22", default-features = false }
webpki = "0.21" webpki = "0.21"

View file

@ -11,13 +11,14 @@ use futures::future::join_all;
use futures::select; use futures::select;
use futures_util::future::*; use futures_util::future::*;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use tokio::prelude::*; use tokio::io::AsyncWriteExt;
use tokio::sync::watch; use tokio::sync::watch;
use tokio::sync::Mutex; use tokio::sync::Mutex;
use garage_util::background::BackgroundRunner; use garage_util::background::BackgroundRunner;
use garage_util::data::*; use garage_util::data::*;
use garage_util::error::Error; use garage_util::error::Error;
use garage_util::time::*;
use crate::consul::get_consul_nodes; use crate::consul::get_consul_nodes;
use crate::ring::*; use crate::ring::*;
@ -315,23 +316,17 @@ impl System {
self.clone().ping_nodes(bootstrap_peers).await; self.clone().ping_nodes(bootstrap_peers).await;
let self2 = self.clone(); let self2 = self.clone();
self.clone() self.background
.background
.spawn_worker(format!("ping loop"), |stop_signal| { .spawn_worker(format!("ping loop"), |stop_signal| {
self2.ping_loop(stop_signal).map(Ok) self2.ping_loop(stop_signal)
}) });
.await;
if let (Some(consul_host), Some(consul_service_name)) = (consul_host, consul_service_name) { if let (Some(consul_host), Some(consul_service_name)) = (consul_host, consul_service_name) {
let self2 = self.clone(); let self2 = self.clone();
self.clone() self.background
.background
.spawn_worker(format!("Consul loop"), |stop_signal| { .spawn_worker(format!("Consul loop"), |stop_signal| {
self2 self2.consul_loop(stop_signal, consul_host, consul_service_name)
.consul_loop(stop_signal, consul_host, consul_service_name) });
.map(Ok)
})
.await;
} }
} }
@ -399,7 +394,7 @@ impl System {
if has_changes { if has_changes {
status.recalculate_hash(); status.recalculate_hash();
} }
if let Err(e) = update_locked.0.broadcast(Arc::new(status)) { if let Err(e) = update_locked.0.send(Arc::new(status)) {
error!("In ping_nodes: could not save status update ({})", e); error!("In ping_nodes: could not save status update ({})", e);
} }
drop(update_locked); drop(update_locked);
@ -425,7 +420,7 @@ impl System {
let status_hash = status.hash; let status_hash = status.hash;
let config_version = self.ring.borrow().config.version; let config_version = self.ring.borrow().config.version;
update_locked.0.broadcast(Arc::new(status))?; update_locked.0.send(Arc::new(status))?;
drop(update_locked); drop(update_locked);
if is_new || status_hash != ping.status_hash { if is_new || status_hash != ping.status_hash {
@ -507,7 +502,7 @@ impl System {
if has_changed { if has_changed {
status.recalculate_hash(); status.recalculate_hash();
} }
update_lock.0.broadcast(Arc::new(status))?; update_lock.0.send(Arc::new(status))?;
drop(update_lock); drop(update_lock);
if to_ping.len() > 0 { if to_ping.len() > 0 {
@ -527,7 +522,7 @@ impl System {
if adv.version > ring.config.version { if adv.version > ring.config.version {
let ring = Ring::new(adv.clone()); let ring = Ring::new(adv.clone());
update_lock.1.broadcast(Arc::new(ring))?; update_lock.1.send(Arc::new(ring))?;
drop(update_lock); drop(update_lock);
self.background.spawn_cancellable( self.background.spawn_cancellable(
@ -543,7 +538,7 @@ impl System {
async fn ping_loop(self: Arc<Self>, mut stop_signal: watch::Receiver<bool>) { async fn ping_loop(self: Arc<Self>, mut stop_signal: watch::Receiver<bool>) {
loop { loop {
let restart_at = tokio::time::delay_for(PING_INTERVAL); let restart_at = tokio::time::sleep(PING_INTERVAL);
let status = self.status.borrow().clone(); let status = self.status.borrow().clone();
let ping_addrs = status let ping_addrs = status
@ -557,10 +552,9 @@ impl System {
select! { select! {
_ = restart_at.fuse() => (), _ = restart_at.fuse() => (),
must_exit = stop_signal.recv().fuse() => { _ = stop_signal.changed().fuse() => {
match must_exit { if *stop_signal.borrow() {
None | Some(true) => return, return;
_ => (),
} }
} }
} }
@ -573,8 +567,8 @@ impl System {
consul_host: String, consul_host: String,
consul_service_name: String, consul_service_name: String,
) { ) {
loop { while !*stop_signal.borrow() {
let restart_at = tokio::time::delay_for(CONSUL_INTERVAL); let restart_at = tokio::time::sleep(CONSUL_INTERVAL);
match get_consul_nodes(&consul_host, &consul_service_name).await { match get_consul_nodes(&consul_host, &consul_service_name).await {
Ok(mut node_list) => { Ok(mut node_list) => {
@ -588,12 +582,7 @@ impl System {
select! { select! {
_ = restart_at.fuse() => (), _ = restart_at.fuse() => (),
must_exit = stop_signal.recv().fuse() => { _ = stop_signal.changed().fuse() => (),
match must_exit {
None | Some(true) => return,
_ => (),
}
}
} }
} }
} }

View file

@ -5,6 +5,11 @@ use serde::{Deserialize, Serialize};
use garage_util::data::*; use garage_util::data::*;
// A partition number is encoded on 16 bits,
// i.e. we have up to 2**16 partitions.
// (in practice we have exactly 2**PARTITION_BITS partitions)
pub type Partition = u16;
// TODO: make this constant parametrizable in the config file // TODO: make this constant parametrizable in the config file
// For deployments with many nodes it might make sense to bump // For deployments with many nodes it might make sense to bump
// it up to 10. // it up to 10.
@ -161,29 +166,48 @@ impl Ring {
}) })
.collect::<Vec<_>>(); .collect::<Vec<_>>();
eprintln!("RING: --"); // eprintln!("RING: --");
for e in ring.iter() { // for e in ring.iter() {
eprintln!("{:?}", e); // eprintln!("{:?}", e);
} // }
eprintln!("END --"); // eprintln!("END --");
Self { config, ring } Self { config, ring }
} }
pub fn partition_of(&self, from: &Hash) -> Partition {
let top = u16::from_be_bytes(from.as_slice()[0..2].try_into().unwrap());
top >> (16 - PARTITION_BITS)
}
pub fn partitions(&self) -> Vec<(Partition, Hash)> {
let mut ret = vec![];
for (i, entry) in self.ring.iter().enumerate() {
ret.push((i as u16, entry.location));
}
if ret.len() > 0 {
assert_eq!(ret[0].1, [0u8; 32].into());
}
ret
}
pub fn walk_ring(&self, from: &Hash, n: usize) -> Vec<UUID> { pub fn walk_ring(&self, from: &Hash, n: usize) -> Vec<UUID> {
if self.ring.len() != 1 << PARTITION_BITS { if self.ring.len() != 1 << PARTITION_BITS {
warn!("Ring not yet ready, read/writes will be lost"); warn!("Ring not yet ready, read/writes will be lost!");
return vec![]; return vec![];
} }
let top = u16::from_be_bytes(from.as_slice()[0..2].try_into().unwrap()); let top = u16::from_be_bytes(from.as_slice()[0..2].try_into().unwrap());
let partition_idx = (top >> (16 - PARTITION_BITS)) as usize; let partition_idx = (top >> (16 - PARTITION_BITS)) as usize;
assert_eq!(partition_idx, self.partition_of(from) as usize);
let partition = &self.ring[partition_idx]; let partition = &self.ring[partition_idx];
let partition_top = let partition_top =
u16::from_be_bytes(partition.location.as_slice()[0..2].try_into().unwrap()); u16::from_be_bytes(partition.location.as_slice()[0..2].try_into().unwrap());
assert!(partition_top & PARTITION_MASK_U16 == top & PARTITION_MASK_U16); assert_eq!(partition_top & PARTITION_MASK_U16, top & PARTITION_MASK_U16);
assert!(n <= partition.nodes.len()); assert!(n <= partition.nodes.len());
partition.nodes[..n].iter().cloned().collect::<Vec<_>>() partition.nodes[..n].iter().cloned().collect::<Vec<_>>()

View file

@ -7,7 +7,6 @@ use std::sync::Arc;
use std::time::Duration; use std::time::Duration;
use arc_swap::ArcSwapOption; use arc_swap::ArcSwapOption;
use bytes::IntoBuf;
use futures::future::Future; use futures::future::Future;
use futures::stream::futures_unordered::FuturesUnordered; use futures::stream::futures_unordered::FuturesUnordered;
use futures::stream::StreamExt; use futures::stream::StreamExt;
@ -197,11 +196,8 @@ impl<M: RpcMessage + 'static> RpcClient<M> {
if !strategy.rs_interrupt_after_quorum { if !strategy.rs_interrupt_after_quorum {
let wait_finished_fut = tokio::spawn(async move { let wait_finished_fut = tokio::spawn(async move {
resp_stream.collect::<Vec<_>>().await; resp_stream.collect::<Vec<_>>().await;
Ok(())
}); });
self.background.spawn(wait_finished_fut.map(|x| { self.background.spawn(wait_finished_fut.map(|_| Ok(())));
x.unwrap_or_else(|e| Err(Error::Message(format!("Await failed: {}", e))))
}));
} }
Ok(results) Ok(results)
@ -336,7 +332,7 @@ impl RpcHttpClient {
let body = hyper::body::to_bytes(resp.into_body()).await?; let body = hyper::body::to_bytes(resp.into_body()).await?;
drop(slot); drop(slot);
match rmp_serde::decode::from_read::<_, Result<M, String>>(body.into_buf())? { match rmp_serde::decode::from_read::<_, Result<M, String>>(&body[..])? {
Err(e) => Ok(Err(Error::RemoteError(e, status))), Err(e) => Ok(Err(Error::RemoteError(e, status))),
Ok(x) => Ok(Ok(x)), Ok(x) => Ok(Ok(x)),
} }

View file

@ -4,7 +4,6 @@ use std::pin::Pin;
use std::sync::Arc; use std::sync::Arc;
use std::time::Instant; use std::time::Instant;
use bytes::IntoBuf;
use futures::future::Future; use futures::future::Future;
use futures_util::future::*; use futures_util::future::*;
use futures_util::stream::*; use futures_util::stream::*;
@ -15,6 +14,7 @@ use serde::{Deserialize, Serialize};
use tokio::net::{TcpListener, TcpStream}; use tokio::net::{TcpListener, TcpStream};
use tokio_rustls::server::TlsStream; use tokio_rustls::server::TlsStream;
use tokio_rustls::TlsAcceptor; use tokio_rustls::TlsAcceptor;
use tokio_stream::wrappers::TcpListenerStream;
use garage_util::config::TlsConfig; use garage_util::config::TlsConfig;
use garage_util::data::*; use garage_util::data::*;
@ -47,11 +47,15 @@ where
{ {
let begin_time = Instant::now(); let begin_time = Instant::now();
let whole_body = hyper::body::to_bytes(req.into_body()).await?; let whole_body = hyper::body::to_bytes(req.into_body()).await?;
let msg = rmp_serde::decode::from_read::<_, M>(whole_body.into_buf())?; let msg = rmp_serde::decode::from_read::<_, M>(&whole_body[..])?;
trace!( trace!(
"Request message: {}", "Request message: {}",
serde_json::to_string(&msg).unwrap_or("<json error>".into()) serde_json::to_string(&msg)
.unwrap_or("<json error>".into())
.chars()
.take(100)
.collect::<String>()
); );
match handler(msg, sockaddr).await { match handler(msg, sockaddr).await {
@ -171,8 +175,8 @@ impl RpcServer {
config.set_single_cert([&node_certs[..], &ca_certs[..]].concat(), node_key)?; config.set_single_cert([&node_certs[..], &ca_certs[..]].concat(), node_key)?;
let tls_acceptor = Arc::new(TlsAcceptor::from(Arc::new(config))); let tls_acceptor = Arc::new(TlsAcceptor::from(Arc::new(config)));
let mut listener = TcpListener::bind(&self.bind_addr).await?; let listener = TcpListener::bind(&self.bind_addr).await?;
let incoming = listener.incoming().filter_map(|socket| async { let incoming = TcpListenerStream::new(listener).filter_map(|socket| async {
match socket { match socket {
Ok(stream) => match tls_acceptor.clone().accept(stream).await { Ok(stream) => match tls_acceptor.clone().accept(stream).await {
Ok(x) => Some(Ok::<_, hyper::Error>(x)), Ok(x) => Some(Ok::<_, hyper::Error>(x)),

View file

@ -16,21 +16,18 @@ path = "lib.rs"
garage_util = { version = "0.1.1", path = "../util" } garage_util = { version = "0.1.1", path = "../util" }
garage_rpc = { version = "0.1.1", path = "../rpc" } garage_rpc = { version = "0.1.1", path = "../rpc" }
bytes = "0.4" bytes = "1.0"
rand = "0.7" rand = "0.8"
hex = "0.3"
arc-swap = "0.4"
log = "0.4" log = "0.4"
hexdump = "0.1" hexdump = "0.1"
sled = "0.34" sled = "0.34"
rmp-serde = "0.14.3" rmp-serde = "0.15"
serde = { version = "1.0", default-features = false, features = ["derive", "rc"] } serde = { version = "1.0", default-features = false, features = ["derive", "rc"] }
serde_bytes = "0.11" serde_bytes = "0.11"
async-trait = "0.1.30"
futures = "0.3" futures = "0.3"
futures-util = "0.3" futures-util = "0.3"
tokio = { version = "0.2", default-features = false, features = ["rt-core", "rt-threaded", "io-driver", "net", "tcp", "time", "macros", "sync", "signal", "fs"] } tokio = { version = "1.0", default-features = false, features = ["rt", "rt-multi-thread", "io-util", "net", "time", "macros", "sync", "signal", "fs"] }

View file

@ -1,327 +0,0 @@
//! This package provides a simple implementation of conflict-free replicated data types (CRDTs)
//!
//! CRDTs are a type of data structures that do not require coordination. In other words, we can
//! edit them in parallel, we will always find a way to merge it.
//!
//! A general example is a counter. Its initial value is 0. Alice and Bob get a copy of the
//! counter. Alice does +1 on her copy, she reads 1. Bob does +3 on his copy, he reads 3. Now,
//! it is easy to merge their counters, order does not count: we always get 4.
//!
//! Learn more about CRDT [on Wikipedia](https://en.wikipedia.org/wiki/Conflict-free_replicated_data_type)
use serde::{Deserialize, Serialize};
use garage_util::data::*;
/// Definition of a CRDT - all CRDT Rust types implement this.
///
/// A CRDT is defined as a merge operator that respects a certain set of axioms.
///
/// In particular, the merge operator must be commutative, associative,
/// idempotent, and monotonic.
/// In other words, if `a`, `b` and `c` are CRDTs, and `⊔` denotes the merge operator,
/// the following axioms must apply:
///
/// ```text
/// a ⊔ b = b ⊔ a (commutativity)
/// (a ⊔ b) ⊔ c = a ⊔ (b ⊔ c) (associativity)
/// (a ⊔ b) ⊔ b = a ⊔ b (idempotence)
/// ```
///
/// Moreover, the relationship `≥` defined by `a ≥ b ⇔ ∃c. a = b ⊔ c` must be a partial order.
/// This implies a few properties such as: if `a ⊔ b ≠ a`, then there is no `c` such that `(a ⊔ b) ⊔ c = a`,
/// as this would imply a cycle in the partial order.
pub trait CRDT {
/// Merge the two datastructures according to the CRDT rules.
/// `self` is modified to contain the merged CRDT value. `other` is not modified.
///
/// # Arguments
///
/// * `other` - the other CRDT we wish to merge with
fn merge(&mut self, other: &Self);
}
/// All types that implement `Ord` (a total order) also implement a trivial CRDT
/// defined by the merge rule: `a ⊔ b = max(a, b)`.
impl<T> CRDT for T
where
T: Ord + Clone,
{
fn merge(&mut self, other: &Self) {
if other > self {
*self = other.clone();
}
}
}
// ---- LWW Register ----
/// Last Write Win (LWW)
///
/// An LWW CRDT associates a timestamp with a value, in order to implement a
/// time-based reconciliation rule: the most recent write wins.
/// For completeness, the LWW reconciliation rule must also be defined for two LWW CRDTs
/// with the same timestamp but different values.
///
/// In our case, we add the constraint that the value that is wrapped inside the LWW CRDT must
/// itself be a CRDT: in the case when the timestamp does not allow us to decide on which value to
/// keep, the merge rule of the inner CRDT is applied on the wrapped values. (Note that all types
/// that implement the `Ord` trait get a default CRDT implemetnation that keeps the maximum value.
/// This enables us to use LWW directly with primitive data types such as numbers or strings. It is
/// generally desirable in this case to never explicitly produce LWW values with the same timestamp
/// but different inner values, as the rule to keep the maximum value isn't generally the desired
/// semantics.)
///
/// As multiple computers clocks are always desynchronized,
/// when operations are close enough, it is equivalent to
/// take one copy and drop the other one.
///
/// Given that clocks are not too desynchronized, this assumption
/// is enough for most cases, as there is few chance that two humans
/// coordonate themself faster than the time difference between two NTP servers.
///
/// As a more concret example, let's suppose you want to upload a file
/// with the same key (path) in the same bucket at the very same time.
/// For each request, the file will be timestamped by the receiving server
/// and may differ from what you observed with your atomic clock!
///
/// This scheme is used by AWS S3 or Soundcloud and often without knowing
/// in entreprise when reconciliating databases with ad-hoc scripts.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
pub struct LWW<T> {
ts: u64,
v: T,
}
impl<T> LWW<T>
where
T: CRDT,
{
/// Creates a new CRDT
///
/// CRDT's internal timestamp is set with current node's clock.
pub fn new(value: T) -> Self {
Self {
ts: now_msec(),
v: value,
}
}
/// Build a new CRDT from a previous non-compatible one
///
/// Compared to new, the CRDT's timestamp is not set to now
/// but must be set to the previous, non-compatible, CRDT's timestamp.
pub fn migrate_from_raw(ts: u64, value: T) -> Self {
Self { ts, v: value }
}
/// Update the LWW CRDT while keeping some causal ordering.
///
/// The timestamp of the LWW CRDT is updated to be the current node's clock
/// at time of update, or the previous timestamp + 1 if that's bigger,
/// so that the new timestamp is always strictly larger than the previous one.
/// This ensures that merging the update with the old value will result in keeping
/// the updated value.
pub fn update(&mut self, new_value: T) {
self.ts = std::cmp::max(self.ts + 1, now_msec());
self.v = new_value;
}
/// Get the CRDT value
pub fn get(&self) -> &T {
&self.v
}
/// Get a mutable reference to the CRDT's value
///
/// This is usefull to mutate the inside value without changing the LWW timestamp.
/// When such mutation is done, the merge between two LWW values is done using the inner
/// CRDT's merge operation. This is usefull in the case where the inner CRDT is a large
/// data type, such as a map, and we only want to change a single item in the map.
/// To do this, we can produce a "CRDT delta", i.e. a LWW that contains only the modification.
/// This delta consists in a LWW with the same timestamp, and the map
/// inside only contains the updated value.
/// The advantage of such a delta is that it is much smaller than the whole map.
///
/// Avoid using this if the inner data type is a primitive type such as a number or a string,
/// as you will then rely on the merge function defined on `Ord` types by keeping the maximum
/// of both values.
pub fn get_mut(&mut self) -> &mut T {
&mut self.v
}
}
impl<T> CRDT for LWW<T>
where
T: Clone + CRDT,
{
fn merge(&mut self, other: &Self) {
if other.ts > self.ts {
self.ts = other.ts;
self.v = other.v.clone();
} else if other.ts == self.ts {
self.v.merge(&other.v);
}
}
}
/// Boolean, where `true` is an absorbing state
#[derive(Clone, Copy, Debug, Serialize, Deserialize, PartialEq)]
pub struct Bool(bool);
impl Bool {
/// Create a new boolean with the specified value
pub fn new(b: bool) -> Self {
Self(b)
}
/// Set the boolean to true
pub fn set(&mut self) {
self.0 = true;
}
/// Get the boolean value
pub fn get(&self) -> bool {
self.0
}
}
impl CRDT for Bool {
fn merge(&mut self, other: &Self) {
self.0 = self.0 || other.0;
}
}
/// Last Write Win Map
///
/// This types defines a CRDT for a map from keys to values.
/// The values have an associated timestamp, such that the last written value
/// takes precedence over previous ones. As for the simpler `LWW` type, the value
/// type `V` is also required to implement the CRDT trait.
/// We do not encourage mutating the values associated with a given key
/// without updating the timestamp, in fact at the moment we do not provide a `.get_mut()`
/// method that would allow that.
///
/// Internally, the map is stored as a vector of keys and values, sorted by ascending key order.
/// This is why the key type `K` must implement `Ord` (and also to ensure a unique serialization,
/// such that two values can be compared for equality based on their hashes). As a consequence,
/// insertions take `O(n)` time. This means that LWWMap should be used for reasonably small maps.
/// However, note that even if we were using a more efficient data structure such as a `BTreeMap`,
/// the serialization cost `O(n)` would still have to be paid at each modification, so we are
/// actually not losing anything here.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
pub struct LWWMap<K, V> {
vals: Vec<(K, u64, V)>,
}
impl<K, V> LWWMap<K, V>
where
K: Ord,
V: CRDT,
{
/// Create a new empty map CRDT
pub fn new() -> Self {
Self { vals: vec![] }
}
/// Used to migrate from a map defined in an incompatible format. This produces
/// a map that contains a single item with the specified timestamp (copied from
/// the incompatible format). Do this as many times as you have items to migrate,
/// and put them all together using the CRDT merge operator.
pub fn migrate_from_raw_item(k: K, ts: u64, v: V) -> Self {
Self {
vals: vec![(k, ts, v)],
}
}
/// Returns a map that contains a single mapping from the specified key to the specified value.
/// This map is a mutator, or a delta-CRDT, such that when it is merged with the original map,
/// the previous value will be replaced with the one specified here.
/// The timestamp in the provided mutator is set to the maximum of the current system's clock
/// and 1 + the previous value's timestamp (if there is one), so that the new value will always
/// take precedence (LWW rule).
///
/// Typically, to update the value associated to a key in the map, you would do the following:
///
/// ```ignore
/// let my_update = my_crdt.update_mutator(key_to_modify, new_value);
/// my_crdt.merge(&my_update);
/// ```
///
/// However extracting the mutator on its own and only sending that on the network is very
/// interesting as it is much smaller than the whole map.
pub fn update_mutator(&self, k: K, new_v: V) -> Self {
let new_vals = match self.vals.binary_search_by(|(k2, _, _)| k2.cmp(&k)) {
Ok(i) => {
let (_, old_ts, _) = self.vals[i];
let new_ts = std::cmp::max(old_ts + 1, now_msec());
vec![(k, new_ts, new_v)]
}
Err(_) => vec![(k, now_msec(), new_v)],
};
Self { vals: new_vals }
}
/// Takes all of the values of the map and returns them. The current map is reset to the
/// empty map. This is very usefull to produce in-place a new map that contains only a delta
/// that modifies a certain value:
///
/// ```ignore
/// let mut a = get_my_crdt_value();
/// let old_a = a.take_and_clear();
/// a.merge(&old_a.update_mutator(key_to_modify, new_value));
/// put_my_crdt_value(a);
/// ```
///
/// Of course in this simple example we could have written simply
/// `pyt_my_crdt_value(a.update_mutator(key_to_modify, new_value))`,
/// but in the case where the map is a field in a struct for instance (as is always the case),
/// this becomes very handy:
///
/// ```ignore
/// let mut a = get_my_crdt_value();
/// let old_a_map = a.map_field.take_and_clear();
/// a.map_field.merge(&old_a_map.update_mutator(key_to_modify, new_value));
/// put_my_crdt_value(a);
/// ```
pub fn take_and_clear(&mut self) -> Self {
let vals = std::mem::replace(&mut self.vals, vec![]);
Self { vals }
}
/// Removes all values from the map
pub fn clear(&mut self) {
self.vals.clear();
}
/// Get a reference to the value assigned to a key
pub fn get(&self, k: &K) -> Option<&V> {
match self.vals.binary_search_by(|(k2, _, _)| k2.cmp(&k)) {
Ok(i) => Some(&self.vals[i].2),
Err(_) => None,
}
}
/// Gets a reference to all of the items, as a slice. Usefull to iterate on all map values.
/// In most case you will want to ignore the timestamp (second item of the tuple).
pub fn items(&self) -> &[(K, u64, V)] {
&self.vals[..]
}
}
impl<K, V> CRDT for LWWMap<K, V>
where
K: Clone + Ord,
V: Clone + CRDT,
{
fn merge(&mut self, other: &Self) {
for (k, ts2, v2) in other.vals.iter() {
match self.vals.binary_search_by(|(k2, _, _)| k2.cmp(&k)) {
Ok(i) => {
let (_, ts1, _v1) = &self.vals[i];
if ts2 > ts1 {
self.vals[i].1 = *ts2;
self.vals[i].2 = v2.clone();
} else if ts1 == ts2 {
self.vals[i].2.merge(&v2);
}
}
Err(i) => {
self.vals.insert(i, (k.clone(), *ts2, v2.clone()));
}
}
}
}
}

34
src/table/crdt/bool.rs Normal file
View file

@ -0,0 +1,34 @@
use serde::{Deserialize, Serialize};
use crate::crdt::crdt::*;
/// Boolean, where `true` is an absorbing state
#[derive(Clone, Copy, Debug, Serialize, Deserialize, PartialEq)]
pub struct Bool(bool);
impl Bool {
/// Create a new boolean with the specified value
pub fn new(b: bool) -> Self {
Self(b)
}
/// Set the boolean to true
pub fn set(&mut self) {
self.0 = true;
}
/// Get the boolean value
pub fn get(&self) -> bool {
self.0
}
}
impl From<bool> for Bool {
fn from(b: bool) -> Bool {
Bool::new(b)
}
}
impl CRDT for Bool {
fn merge(&mut self, other: &Self) {
self.0 = self.0 || other.0;
}
}

73
src/table/crdt/crdt.rs Normal file
View file

@ -0,0 +1,73 @@
use garage_util::data::*;
/// Definition of a CRDT - all CRDT Rust types implement this.
///
/// A CRDT is defined as a merge operator that respects a certain set of axioms.
///
/// In particular, the merge operator must be commutative, associative,
/// idempotent, and monotonic.
/// In other words, if `a`, `b` and `c` are CRDTs, and `⊔` denotes the merge operator,
/// the following axioms must apply:
///
/// ```text
/// a ⊔ b = b ⊔ a (commutativity)
/// (a ⊔ b) ⊔ c = a ⊔ (b ⊔ c) (associativity)
/// (a ⊔ b) ⊔ b = a ⊔ b (idempotence)
/// ```
///
/// Moreover, the relationship `≥` defined by `a ≥ b ⇔ ∃c. a = b ⊔ c` must be a partial order.
/// This implies a few properties such as: if `a ⊔ b ≠ a`, then there is no `c` such that `(a ⊔ b) ⊔ c = a`,
/// as this would imply a cycle in the partial order.
pub trait CRDT {
/// Merge the two datastructures according to the CRDT rules.
/// `self` is modified to contain the merged CRDT value. `other` is not modified.
///
/// # Arguments
///
/// * `other` - the other CRDT we wish to merge with
fn merge(&mut self, other: &Self);
}
/// All types that implement `Ord` (a total order) can also implement a trivial CRDT
/// defined by the merge rule: `a ⊔ b = max(a, b)`. Implement this trait for your type
/// to enable this behavior.
pub trait AutoCRDT: Ord + Clone + std::fmt::Debug {
/// WARN_IF_DIFFERENT: emit a warning when values differ. Set this to true if
/// different values in your application should never happen. Set this to false
/// if you are actually relying on the semantics of `a ⊔ b = max(a, b)`.
const WARN_IF_DIFFERENT: bool;
}
impl<T> CRDT for T
where
T: AutoCRDT,
{
fn merge(&mut self, other: &Self) {
if Self::WARN_IF_DIFFERENT && self != other {
warn!(
"Different CRDT values should be the same (logic error!): {:?} vs {:?}",
self, other
);
if other > self {
*self = other.clone();
}
warn!("Making an arbitrary choice: {:?}", self);
} else {
if other > self {
*self = other.clone();
}
}
}
}
impl AutoCRDT for String {
const WARN_IF_DIFFERENT: bool = true;
}
impl AutoCRDT for bool {
const WARN_IF_DIFFERENT: bool = true;
}
impl AutoCRDT for FixedBytes32 {
const WARN_IF_DIFFERENT: bool = true;
}

114
src/table/crdt/lww.rs Normal file
View file

@ -0,0 +1,114 @@
use serde::{Deserialize, Serialize};
use garage_util::time::now_msec;
use crate::crdt::crdt::*;
/// Last Write Win (LWW)
///
/// An LWW CRDT associates a timestamp with a value, in order to implement a
/// time-based reconciliation rule: the most recent write wins.
/// For completeness, the LWW reconciliation rule must also be defined for two LWW CRDTs
/// with the same timestamp but different values.
///
/// In our case, we add the constraint that the value that is wrapped inside the LWW CRDT must
/// itself be a CRDT: in the case when the timestamp does not allow us to decide on which value to
/// keep, the merge rule of the inner CRDT is applied on the wrapped values. (Note that all types
/// that implement the `Ord` trait get a default CRDT implemetnation that keeps the maximum value.
/// This enables us to use LWW directly with primitive data types such as numbers or strings. It is
/// generally desirable in this case to never explicitly produce LWW values with the same timestamp
/// but different inner values, as the rule to keep the maximum value isn't generally the desired
/// semantics.)
///
/// As multiple computers clocks are always desynchronized,
/// when operations are close enough, it is equivalent to
/// take one copy and drop the other one.
///
/// Given that clocks are not too desynchronized, this assumption
/// is enough for most cases, as there is few chance that two humans
/// coordonate themself faster than the time difference between two NTP servers.
///
/// As a more concret example, let's suppose you want to upload a file
/// with the same key (path) in the same bucket at the very same time.
/// For each request, the file will be timestamped by the receiving server
/// and may differ from what you observed with your atomic clock!
///
/// This scheme is used by AWS S3 or Soundcloud and often without knowing
/// in entreprise when reconciliating databases with ad-hoc scripts.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
pub struct LWW<T> {
ts: u64,
v: T,
}
impl<T> LWW<T>
where
T: CRDT,
{
/// Creates a new CRDT
///
/// CRDT's internal timestamp is set with current node's clock.
pub fn new(value: T) -> Self {
Self {
ts: now_msec(),
v: value,
}
}
/// Build a new CRDT from a previous non-compatible one
///
/// Compared to new, the CRDT's timestamp is not set to now
/// but must be set to the previous, non-compatible, CRDT's timestamp.
pub fn migrate_from_raw(ts: u64, value: T) -> Self {
Self { ts, v: value }
}
/// Update the LWW CRDT while keeping some causal ordering.
///
/// The timestamp of the LWW CRDT is updated to be the current node's clock
/// at time of update, or the previous timestamp + 1 if that's bigger,
/// so that the new timestamp is always strictly larger than the previous one.
/// This ensures that merging the update with the old value will result in keeping
/// the updated value.
pub fn update(&mut self, new_value: T) {
self.ts = std::cmp::max(self.ts + 1, now_msec());
self.v = new_value;
}
/// Get the CRDT value
pub fn get(&self) -> &T {
&self.v
}
/// Get a mutable reference to the CRDT's value
///
/// This is usefull to mutate the inside value without changing the LWW timestamp.
/// When such mutation is done, the merge between two LWW values is done using the inner
/// CRDT's merge operation. This is usefull in the case where the inner CRDT is a large
/// data type, such as a map, and we only want to change a single item in the map.
/// To do this, we can produce a "CRDT delta", i.e. a LWW that contains only the modification.
/// This delta consists in a LWW with the same timestamp, and the map
/// inside only contains the updated value.
/// The advantage of such a delta is that it is much smaller than the whole map.
///
/// Avoid using this if the inner data type is a primitive type such as a number or a string,
/// as you will then rely on the merge function defined on `Ord` types by keeping the maximum
/// of both values.
pub fn get_mut(&mut self) -> &mut T {
&mut self.v
}
}
impl<T> CRDT for LWW<T>
where
T: Clone + CRDT,
{
fn merge(&mut self, other: &Self) {
if other.ts > self.ts {
self.ts = other.ts;
self.v = other.v.clone();
} else if other.ts == self.ts {
self.v.merge(&other.v);
}
}
}

145
src/table/crdt/lww_map.rs Normal file
View file

@ -0,0 +1,145 @@
use serde::{Deserialize, Serialize};
use garage_util::time::now_msec;
use crate::crdt::crdt::*;
/// Last Write Win Map
///
/// This types defines a CRDT for a map from keys to values.
/// The values have an associated timestamp, such that the last written value
/// takes precedence over previous ones. As for the simpler `LWW` type, the value
/// type `V` is also required to implement the CRDT trait.
/// We do not encourage mutating the values associated with a given key
/// without updating the timestamp, in fact at the moment we do not provide a `.get_mut()`
/// method that would allow that.
///
/// Internally, the map is stored as a vector of keys and values, sorted by ascending key order.
/// This is why the key type `K` must implement `Ord` (and also to ensure a unique serialization,
/// such that two values can be compared for equality based on their hashes). As a consequence,
/// insertions take `O(n)` time. This means that LWWMap should be used for reasonably small maps.
/// However, note that even if we were using a more efficient data structure such as a `BTreeMap`,
/// the serialization cost `O(n)` would still have to be paid at each modification, so we are
/// actually not losing anything here.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
pub struct LWWMap<K, V> {
vals: Vec<(K, u64, V)>,
}
impl<K, V> LWWMap<K, V>
where
K: Ord,
V: CRDT,
{
/// Create a new empty map CRDT
pub fn new() -> Self {
Self { vals: vec![] }
}
/// Used to migrate from a map defined in an incompatible format. This produces
/// a map that contains a single item with the specified timestamp (copied from
/// the incompatible format). Do this as many times as you have items to migrate,
/// and put them all together using the CRDT merge operator.
pub fn migrate_from_raw_item(k: K, ts: u64, v: V) -> Self {
Self {
vals: vec![(k, ts, v)],
}
}
/// Returns a map that contains a single mapping from the specified key to the specified value.
/// This map is a mutator, or a delta-CRDT, such that when it is merged with the original map,
/// the previous value will be replaced with the one specified here.
/// The timestamp in the provided mutator is set to the maximum of the current system's clock
/// and 1 + the previous value's timestamp (if there is one), so that the new value will always
/// take precedence (LWW rule).
///
/// Typically, to update the value associated to a key in the map, you would do the following:
///
/// ```ignore
/// let my_update = my_crdt.update_mutator(key_to_modify, new_value);
/// my_crdt.merge(&my_update);
/// ```
///
/// However extracting the mutator on its own and only sending that on the network is very
/// interesting as it is much smaller than the whole map.
pub fn update_mutator(&self, k: K, new_v: V) -> Self {
let new_vals = match self.vals.binary_search_by(|(k2, _, _)| k2.cmp(&k)) {
Ok(i) => {
let (_, old_ts, _) = self.vals[i];
let new_ts = std::cmp::max(old_ts + 1, now_msec());
vec![(k, new_ts, new_v)]
}
Err(_) => vec![(k, now_msec(), new_v)],
};
Self { vals: new_vals }
}
/// Takes all of the values of the map and returns them. The current map is reset to the
/// empty map. This is very usefull to produce in-place a new map that contains only a delta
/// that modifies a certain value:
///
/// ```ignore
/// let mut a = get_my_crdt_value();
/// let old_a = a.take_and_clear();
/// a.merge(&old_a.update_mutator(key_to_modify, new_value));
/// put_my_crdt_value(a);
/// ```
///
/// Of course in this simple example we could have written simply
/// `pyt_my_crdt_value(a.update_mutator(key_to_modify, new_value))`,
/// but in the case where the map is a field in a struct for instance (as is always the case),
/// this becomes very handy:
///
/// ```ignore
/// let mut a = get_my_crdt_value();
/// let old_a_map = a.map_field.take_and_clear();
/// a.map_field.merge(&old_a_map.update_mutator(key_to_modify, new_value));
/// put_my_crdt_value(a);
/// ```
pub fn take_and_clear(&mut self) -> Self {
let vals = std::mem::replace(&mut self.vals, vec![]);
Self { vals }
}
/// Removes all values from the map
pub fn clear(&mut self) {
self.vals.clear();
}
/// Get a reference to the value assigned to a key
pub fn get(&self, k: &K) -> Option<&V> {
match self.vals.binary_search_by(|(k2, _, _)| k2.cmp(&k)) {
Ok(i) => Some(&self.vals[i].2),
Err(_) => None,
}
}
/// Gets a reference to all of the items, as a slice. Usefull to iterate on all map values.
/// In most case you will want to ignore the timestamp (second item of the tuple).
pub fn items(&self) -> &[(K, u64, V)] {
&self.vals[..]
}
/// Returns the number of items in the map
pub fn len(&self) -> usize {
self.vals.len()
}
}
impl<K, V> CRDT for LWWMap<K, V>
where
K: Clone + Ord,
V: Clone + CRDT,
{
fn merge(&mut self, other: &Self) {
for (k, ts2, v2) in other.vals.iter() {
match self.vals.binary_search_by(|(k2, _, _)| k2.cmp(&k)) {
Ok(i) => {
let (_, ts1, _v1) = &self.vals[i];
if ts2 > ts1 {
self.vals[i].1 = *ts2;
self.vals[i].2 = v2.clone();
} else if ts1 == ts2 {
self.vals[i].2.merge(&v2);
}
}
Err(i) => {
self.vals.insert(i, (k.clone(), *ts2, v2.clone()));
}
}
}
}
}

83
src/table/crdt/map.rs Normal file
View file

@ -0,0 +1,83 @@
use serde::{Deserialize, Serialize};
use crate::crdt::crdt::*;
/// Simple CRDT Map
///
/// This types defines a CRDT for a map from keys to values. Values are CRDT types which
/// can have their own updating logic.
///
/// Internally, the map is stored as a vector of keys and values, sorted by ascending key order.
/// This is why the key type `K` must implement `Ord` (and also to ensure a unique serialization,
/// such that two values can be compared for equality based on their hashes). As a consequence,
/// insertions take `O(n)` time. This means that Map should be used for reasonably small maps.
/// However, note that even if we were using a more efficient data structure such as a `BTreeMap`,
/// the serialization cost `O(n)` would still have to be paid at each modification, so we are
/// actually not losing anything here.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
pub struct Map<K, V> {
vals: Vec<(K, V)>,
}
impl<K, V> Map<K, V>
where
K: Clone + Ord,
V: Clone + CRDT,
{
/// Create a new empty map CRDT
pub fn new() -> Self {
Self { vals: vec![] }
}
/// Returns a map that contains a single mapping from the specified key to the specified value.
/// This can be used to build a delta-mutator:
/// when merged with another map, the value will be added or CRDT-merged if a previous
/// value already exists.
pub fn put_mutator(k: K, v: V) -> Self {
Self { vals: vec![(k, v)] }
}
pub fn put(&mut self, k: K, v: V) {
self.merge(&Self::put_mutator(k, v));
}
/// Removes all values from the map
pub fn clear(&mut self) {
self.vals.clear();
}
/// Get a reference to the value assigned to a key
pub fn get(&self, k: &K) -> Option<&V> {
match self.vals.binary_search_by(|(k2, _)| k2.cmp(&k)) {
Ok(i) => Some(&self.vals[i].1),
Err(_) => None,
}
}
/// Gets a reference to all of the items, as a slice. Usefull to iterate on all map values.
pub fn items(&self) -> &[(K, V)] {
&self.vals[..]
}
/// Returns the number of items in the map
pub fn len(&self) -> usize {
self.vals.len()
}
}
impl<K, V> CRDT for Map<K, V>
where
K: Clone + Ord,
V: Clone + CRDT,
{
fn merge(&mut self, other: &Self) {
for (k, v2) in other.vals.iter() {
match self.vals.binary_search_by(|(k2, _)| k2.cmp(&k)) {
Ok(i) => {
self.vals[i].1.merge(&v2);
}
Err(i) => {
self.vals.insert(i, (k.clone(), v2.clone()));
}
}
}
}
}

22
src/table/crdt/mod.rs Normal file
View file

@ -0,0 +1,22 @@
//! This package provides a simple implementation of conflict-free replicated data types (CRDTs)
//!
//! CRDTs are a type of data structures that do not require coordination. In other words, we can
//! edit them in parallel, we will always find a way to merge it.
//!
//! A general example is a counter. Its initial value is 0. Alice and Bob get a copy of the
//! counter. Alice does +1 on her copy, she reads 1. Bob does +3 on his copy, he reads 3. Now,
//! it is easy to merge their counters, order does not count: we always get 4.
//!
//! Learn more about CRDT [on Wikipedia](https://en.wikipedia.org/wiki/Conflict-free_replicated_data_type)
mod bool;
mod crdt;
mod lww;
mod lww_map;
mod map;
pub use self::bool::*;
pub use crdt::*;
pub use lww::*;
pub use lww_map::*;
pub use map::*;

254
src/table/data.rs Normal file
View file

@ -0,0 +1,254 @@
use core::borrow::Borrow;
use std::sync::Arc;
use log::warn;
use serde_bytes::ByteBuf;
use sled::Transactional;
use tokio::sync::Notify;
use garage_util::data::*;
use garage_util::error::*;
use garage_rpc::membership::System;
use crate::crdt::CRDT;
use crate::replication::*;
use crate::schema::*;
pub struct TableData<F: TableSchema, R: TableReplication> {
system: Arc<System>,
pub name: String,
pub(crate) instance: F,
pub(crate) replication: R,
pub store: sled::Tree,
pub(crate) merkle_tree: sled::Tree,
pub(crate) merkle_todo: sled::Tree,
pub(crate) merkle_todo_notify: Notify,
pub(crate) gc_todo: sled::Tree,
}
impl<F, R> TableData<F, R>
where
F: TableSchema,
R: TableReplication,
{
pub fn new(system: Arc<System>, name: String, instance: F, replication: R, db: &sled::Db) -> Arc<Self> {
let store = db
.open_tree(&format!("{}:table", name))
.expect("Unable to open DB tree");
let merkle_tree = db
.open_tree(&format!("{}:merkle_tree", name))
.expect("Unable to open DB Merkle tree tree");
let merkle_todo = db
.open_tree(&format!("{}:merkle_todo", name))
.expect("Unable to open DB Merkle TODO tree");
let gc_todo = db
.open_tree(&format!("{}:gc_todo", name))
.expect("Unable to open DB tree");
Arc::new(Self {
system,
name,
instance,
replication,
store,
merkle_tree,
merkle_todo,
merkle_todo_notify: Notify::new(),
gc_todo,
})
}
// Read functions
pub fn read_entry(&self, p: &F::P, s: &F::S) -> Result<Option<ByteBuf>, Error> {
let tree_key = self.tree_key(p, s);
if let Some(bytes) = self.store.get(&tree_key)? {
Ok(Some(ByteBuf::from(bytes.to_vec())))
} else {
Ok(None)
}
}
pub fn read_range(
&self,
p: &F::P,
s: &Option<F::S>,
filter: &Option<F::Filter>,
limit: usize,
) -> Result<Vec<Arc<ByteBuf>>, Error> {
let partition_hash = p.hash();
let first_key = match s {
None => partition_hash.to_vec(),
Some(sk) => self.tree_key(p, sk),
};
let mut ret = vec![];
for item in self.store.range(first_key..) {
let (key, value) = item?;
if &key[..32] != partition_hash.as_slice() {
break;
}
let keep = match filter {
None => true,
Some(f) => {
let entry = self.decode_entry(value.as_ref())?;
F::matches_filter(&entry, f)
}
};
if keep {
ret.push(Arc::new(ByteBuf::from(value.as_ref())));
}
if ret.len() >= limit {
break;
}
}
Ok(ret)
}
// Mutation functions
// When changing this code, take care of propagating modifications correctly:
// - When an entry is modified or deleted, call the updated() function
// on the table instance
// - When an entry is modified or deleted, add it to the merkle updater's todo list.
// This has to be done atomically with the modification for the merkle updater
// to maintain consistency. The merkle updater must then be notified with todo_notify.
// - When an entry is updated to be a tombstone, add it to the gc_todo tree
pub(crate) fn update_many<T: Borrow<ByteBuf>>(&self, entries: &[T]) -> Result<(), Error> {
for update_bytes in entries.iter() {
self.update_entry(update_bytes.borrow().as_slice())?;
}
Ok(())
}
pub(crate) fn update_entry(&self, update_bytes: &[u8]) -> Result<(), Error> {
let update = self.decode_entry(update_bytes)?;
let tree_key = self.tree_key(update.partition_key(), update.sort_key());
let changed = (&self.store, &self.merkle_todo).transaction(|(store, mkl_todo)| {
let (old_entry, new_entry) = match store.get(&tree_key)? {
Some(prev_bytes) => {
let old_entry = self
.decode_entry(&prev_bytes)
.map_err(sled::transaction::ConflictableTransactionError::Abort)?;
let mut new_entry = old_entry.clone();
new_entry.merge(&update);
(Some(old_entry), new_entry)
}
None => (None, update.clone()),
};
if Some(&new_entry) != old_entry.as_ref() {
let new_bytes = rmp_to_vec_all_named(&new_entry)
.map_err(Error::RMPEncode)
.map_err(sled::transaction::ConflictableTransactionError::Abort)?;
let new_bytes_hash = blake2sum(&new_bytes[..]);
mkl_todo.insert(tree_key.clone(), new_bytes_hash.as_slice())?;
store.insert(tree_key.clone(), new_bytes)?;
Ok(Some((old_entry, new_entry, new_bytes_hash)))
} else {
Ok(None)
}
})?;
if let Some((old_entry, new_entry, new_bytes_hash)) = changed {
let is_tombstone = new_entry.is_tombstone();
self.instance.updated(old_entry, Some(new_entry));
self.merkle_todo_notify.notify_one();
if is_tombstone {
// We are only responsible for GC'ing this item if we are the
// "leader" of the partition, i.e. the first node in the
// set of nodes that replicates this partition.
// This avoids GC loops and does not change the termination properties
// of the GC algorithm, as in all cases GC is suspended if
// any node of the partition is unavailable.
let pk_hash = Hash::try_from(&tree_key[..32]).unwrap();
let nodes = self.replication.write_nodes(&pk_hash);
if nodes.first() == Some(&self.system.id) {
self.gc_todo.insert(&tree_key, new_bytes_hash.as_slice())?;
}
}
}
Ok(())
}
pub(crate) fn delete_if_equal(self: &Arc<Self>, k: &[u8], v: &[u8]) -> Result<bool, Error> {
let removed = (&self.store, &self.merkle_todo).transaction(|(store, mkl_todo)| {
if let Some(cur_v) = store.get(k)? {
if cur_v == v {
store.remove(k)?;
mkl_todo.insert(k, vec![])?;
return Ok(true);
}
}
Ok(false)
})?;
if removed {
let old_entry = self.decode_entry(v)?;
self.instance.updated(Some(old_entry), None);
self.merkle_todo_notify.notify_one();
}
Ok(removed)
}
pub(crate) fn delete_if_equal_hash(
self: &Arc<Self>,
k: &[u8],
vhash: Hash,
) -> Result<bool, Error> {
let removed = (&self.store, &self.merkle_todo).transaction(|(store, mkl_todo)| {
if let Some(cur_v) = store.get(k)? {
if blake2sum(&cur_v[..]) == vhash {
store.remove(k)?;
mkl_todo.insert(k, vec![])?;
return Ok(Some(cur_v));
}
}
Ok(None)
})?;
if let Some(old_v) = removed {
let old_entry = self.decode_entry(&old_v[..])?;
self.instance.updated(Some(old_entry), None);
self.merkle_todo_notify.notify_one();
Ok(true)
} else {
Ok(false)
}
}
// ---- Utility functions ----
pub(crate) fn tree_key(&self, p: &F::P, s: &F::S) -> Vec<u8> {
let mut ret = p.hash().to_vec();
ret.extend(s.sort_key());
ret
}
pub(crate) fn decode_entry(&self, bytes: &[u8]) -> Result<F::E, Error> {
match rmp_serde::decode::from_read_ref::<_, F::E>(bytes) {
Ok(x) => Ok(x),
Err(e) => match F::try_migrate(bytes) {
Some(x) => Ok(x),
None => {
warn!("Unable to decode entry of {}: {}", self.name, e);
for line in hexdump::hexdump_iter(bytes) {
debug!("{}", line);
}
Err(e.into())
}
},
}
}
pub fn gc_todo_len(&self) -> usize {
self.gc_todo.len()
}
}

248
src/table/gc.rs Normal file
View file

@ -0,0 +1,248 @@
use std::collections::HashMap;
use std::sync::Arc;
use std::time::Duration;
use serde::{Deserialize, Serialize};
use serde_bytes::ByteBuf;
use futures::future::join_all;
use futures::select;
use futures_util::future::*;
use tokio::sync::watch;
use garage_util::data::*;
use garage_util::error::Error;
use garage_rpc::membership::System;
use garage_rpc::rpc_client::*;
use garage_rpc::rpc_server::*;
use crate::data::*;
use crate::replication::*;
use crate::schema::*;
const TABLE_GC_BATCH_SIZE: usize = 1024;
const TABLE_GC_RPC_TIMEOUT: Duration = Duration::from_secs(30);
pub struct TableGC<F: TableSchema, R: TableReplication> {
system: Arc<System>,
data: Arc<TableData<F, R>>,
rpc_client: Arc<RpcClient<GcRPC>>,
}
#[derive(Serialize, Deserialize)]
enum GcRPC {
Update(Vec<ByteBuf>),
DeleteIfEqualHash(Vec<(ByteBuf, Hash)>),
Ok,
}
impl RpcMessage for GcRPC {}
impl<F, R> TableGC<F, R>
where
F: TableSchema + 'static,
R: TableReplication + 'static,
{
pub(crate) fn launch(
system: Arc<System>,
data: Arc<TableData<F, R>>,
rpc_server: &mut RpcServer,
) -> Arc<Self> {
let rpc_path = format!("table_{}/gc", data.name);
let rpc_client = system.rpc_client::<GcRPC>(&rpc_path);
let gc = Arc::new(Self {
system: system.clone(),
data: data.clone(),
rpc_client,
});
gc.register_handler(rpc_server, rpc_path);
let gc1 = gc.clone();
system.background.spawn_worker(
format!("GC loop for {}", data.name),
move |must_exit: watch::Receiver<bool>| gc1.gc_loop(must_exit),
);
gc
}
async fn gc_loop(self: Arc<Self>, mut must_exit: watch::Receiver<bool>) {
while !*must_exit.borrow() {
match self.gc_loop_iter().await {
Ok(true) => {
// Stuff was done, loop imediately
continue;
}
Ok(false) => {
// Nothing was done, sleep for some time (below)
}
Err(e) => {
warn!("({}) Error doing GC: {}", self.data.name, e);
}
}
select! {
_ = tokio::time::sleep(Duration::from_secs(10)).fuse() => (),
_ = must_exit.changed().fuse() => (),
}
}
}
async fn gc_loop_iter(&self) -> Result<bool, Error> {
let mut entries = vec![];
let mut excluded = vec![];
for item in self.data.gc_todo.iter() {
let (k, vhash) = item?;
let vhash = Hash::try_from(&vhash[..]).unwrap();
let v_opt = self
.data
.store
.get(&k[..])?
.filter(|v| blake2sum(&v[..]) == vhash);
if let Some(v) = v_opt {
entries.push((ByteBuf::from(k.to_vec()), vhash, ByteBuf::from(v.to_vec())));
if entries.len() >= TABLE_GC_BATCH_SIZE {
break;
}
} else {
excluded.push((k, vhash));
}
}
for (k, vhash) in excluded {
self.todo_remove_if_equal(&k[..], vhash)?;
}
if entries.len() == 0 {
// Nothing to do in this iteration
return Ok(false);
}
debug!("({}) GC: doing {} items", self.data.name, entries.len());
let mut partitions = HashMap::new();
for (k, vhash, v) in entries {
let pkh = Hash::try_from(&k[..32]).unwrap();
let mut nodes = self.data.replication.write_nodes(&pkh);
nodes.retain(|x| *x != self.system.id);
nodes.sort();
if !partitions.contains_key(&nodes) {
partitions.insert(nodes.clone(), vec![]);
}
partitions.get_mut(&nodes).unwrap().push((k, vhash, v));
}
let resps = join_all(
partitions
.into_iter()
.map(|(nodes, items)| self.try_send_and_delete(nodes, items)),
)
.await;
let mut errs = vec![];
for resp in resps {
if let Err(e) = resp {
errs.push(e);
}
}
if errs.is_empty() {
Ok(true)
} else {
Err(Error::Message(errs.into_iter().map(|x| format!("{}", x)).collect::<Vec<_>>().join(", ")))
}
}
async fn try_send_and_delete(
&self,
nodes: Vec<UUID>,
items: Vec<(ByteBuf, Hash, ByteBuf)>,
) -> Result<(), Error> {
let n_items = items.len();
let mut updates = vec![];
let mut deletes = vec![];
for (k, vhash, v) in items {
updates.push(v);
deletes.push((k, vhash));
}
self.rpc_client
.try_call_many(
&nodes[..],
GcRPC::Update(updates),
RequestStrategy::with_quorum(nodes.len()).with_timeout(TABLE_GC_RPC_TIMEOUT),
)
.await?;
info!(
"({}) GC: {} items successfully pushed, will try to delete.",
self.data.name, n_items
);
self.rpc_client
.try_call_many(
&nodes[..],
GcRPC::DeleteIfEqualHash(deletes.clone()),
RequestStrategy::with_quorum(nodes.len()).with_timeout(TABLE_GC_RPC_TIMEOUT),
)
.await?;
for (k, vhash) in deletes {
self.data.delete_if_equal_hash(&k[..], vhash)?;
self.todo_remove_if_equal(&k[..], vhash)?;
}
Ok(())
}
fn todo_remove_if_equal(&self, key: &[u8], vhash: Hash) -> Result<(), Error> {
let _ = self
.data
.gc_todo
.compare_and_swap::<_, _, Vec<u8>>(key, Some(vhash), None)?;
Ok(())
}
// ---- RPC HANDLER ----
fn register_handler(self: &Arc<Self>, rpc_server: &mut RpcServer, path: String) {
let self2 = self.clone();
rpc_server.add_handler::<GcRPC, _, _>(path, move |msg, _addr| {
let self2 = self2.clone();
async move { self2.handle_rpc(&msg).await }
});
let self2 = self.clone();
self.rpc_client
.set_local_handler(self.system.id, move |msg| {
let self2 = self2.clone();
async move { self2.handle_rpc(&msg).await }
});
}
async fn handle_rpc(self: &Arc<Self>, message: &GcRPC) -> Result<GcRPC, Error> {
match message {
GcRPC::Update(items) => {
self.data.update_many(items)?;
Ok(GcRPC::Ok)
}
GcRPC::DeleteIfEqualHash(items) => {
for (key, vhash) in items.iter() {
self.data.delete_if_equal_hash(&key[..], *vhash)?;
self.todo_remove_if_equal(&key[..], *vhash)?;
}
Ok(GcRPC::Ok)
}
_ => Err(Error::Message(format!("Unexpected GC RPC"))),
}
}
}

View file

@ -7,10 +7,12 @@ pub mod crdt;
pub mod schema; pub mod schema;
pub mod util; pub mod util;
pub mod data;
pub mod gc;
pub mod merkle;
pub mod replication;
pub mod sync;
pub mod table; pub mod table;
pub mod table_fullcopy;
pub mod table_sharded;
pub mod table_sync;
pub use schema::*; pub use schema::*;
pub use table::*; pub use table::*;

454
src/table/merkle.rs Normal file
View file

@ -0,0 +1,454 @@
use std::sync::Arc;
use std::time::Duration;
use futures::select;
use futures_util::future::*;
use log::{debug, warn};
use serde::{Deserialize, Serialize};
use sled::transaction::{
ConflictableTransactionError, ConflictableTransactionResult, TransactionalTree,
};
use tokio::sync::watch;
use garage_util::background::BackgroundRunner;
use garage_util::data::*;
use garage_util::error::Error;
use garage_rpc::ring::*;
use crate::data::*;
use crate::replication::*;
use crate::schema::*;
// This modules partitions the data in 2**16 partitions, based on the top
// 16 bits (two bytes) of item's partition keys' hashes.
// It builds one Merkle tree for each of these 2**16 partitions.
pub struct MerkleUpdater<F: TableSchema, R: TableReplication> {
data: Arc<TableData<F, R>>,
// Content of the todo tree: items where
// - key = the key of an item in the main table, ie hash(partition_key)+sort_key
// - value = the hash of the full serialized item, if present,
// or an empty vec if item is absent (deleted)
// Fields in data:
// pub(crate) merkle_todo: sled::Tree,
// pub(crate) merkle_todo_notify: Notify,
// Content of the merkle tree: items where
// - key = .bytes() for MerkleNodeKey
// - value = serialization of a MerkleNode, assumed to be MerkleNode::empty if not found
// Field in data:
// pub(crate) merkle_tree: sled::Tree,
empty_node_hash: Hash,
}
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct MerkleNodeKey {
// partition number
pub partition: Partition,
// prefix: a prefix for the hash of full keys, i.e. hash(hash(partition_key)+sort_key)
#[serde(with = "serde_bytes")]
pub prefix: Vec<u8>,
}
#[derive(PartialEq, Eq, Debug, Serialize, Deserialize)]
pub enum MerkleNode {
// The empty Merkle node
Empty,
// An intermediate Merkle tree node for a prefix
// Contains the hashes of the 256 possible next prefixes
Intermediate(Vec<(u8, Hash)>),
// A final node for an item
// Contains the full key of the item and the hash of the value
Leaf(Vec<u8>, Hash),
}
impl<F, R> MerkleUpdater<F, R>
where
F: TableSchema + 'static,
R: TableReplication + 'static,
{
pub(crate) fn launch(background: &BackgroundRunner, data: Arc<TableData<F, R>>) -> Arc<Self> {
let empty_node_hash = blake2sum(&rmp_to_vec_all_named(&MerkleNode::Empty).unwrap()[..]);
let ret = Arc::new(Self {
data,
empty_node_hash,
});
let ret2 = ret.clone();
background.spawn_worker(
format!("Merkle tree updater for {}", ret.data.name),
|must_exit: watch::Receiver<bool>| ret2.updater_loop(must_exit),
);
ret
}
async fn updater_loop(self: Arc<Self>, mut must_exit: watch::Receiver<bool>) {
while !*must_exit.borrow() {
if let Some(x) = self.data.merkle_todo.iter().next() {
match x {
Ok((key, valhash)) => {
if let Err(e) = self.update_item(&key[..], &valhash[..]) {
warn!(
"({}) Error while updating Merkle tree item: {}",
self.data.name, e
);
}
}
Err(e) => {
warn!(
"({}) Error while iterating on Merkle todo tree: {}",
self.data.name, e
);
tokio::time::sleep(Duration::from_secs(10)).await;
}
}
} else {
select! {
_ = self.data.merkle_todo_notify.notified().fuse() => (),
_ = must_exit.changed().fuse() => (),
}
}
}
}
fn update_item(&self, k: &[u8], vhash_by: &[u8]) -> Result<(), Error> {
let khash = blake2sum(k);
let new_vhash = if vhash_by.len() == 0 {
None
} else {
Some(Hash::try_from(&vhash_by[..]).unwrap())
};
let key = MerkleNodeKey {
partition: self
.data
.replication
.partition_of(&Hash::try_from(&k[0..32]).unwrap()),
prefix: vec![],
};
self.data
.merkle_tree
.transaction(|tx| self.update_item_rec(tx, k, &khash, &key, new_vhash))?;
let deleted = self
.data
.merkle_todo
.compare_and_swap::<_, _, Vec<u8>>(k, Some(vhash_by), None)?
.is_ok();
if !deleted {
debug!(
"({}) Item not deleted from Merkle todo because it changed: {:?}",
self.data.name, k
);
}
Ok(())
}
fn update_item_rec(
&self,
tx: &TransactionalTree,
k: &[u8],
khash: &Hash,
key: &MerkleNodeKey,
new_vhash: Option<Hash>,
) -> ConflictableTransactionResult<Option<Hash>, Error> {
let i = key.prefix.len();
// Read node at current position (defined by the prefix stored in key)
// Calculate an update to apply to this node
// This update is an Option<_>, so that it is None if the update is a no-op
// and we can thus skip recalculating and re-storing everything
let mutate = match self.read_node_txn(tx, &key)? {
MerkleNode::Empty => {
if let Some(vhv) = new_vhash {
Some(MerkleNode::Leaf(k.to_vec(), vhv))
} else {
// Nothing to do, keep empty node
None
}
}
MerkleNode::Intermediate(mut children) => {
let key2 = key.next_key(khash);
if let Some(subhash) = self.update_item_rec(tx, k, khash, &key2, new_vhash)? {
// Subtree changed, update this node as well
if subhash == self.empty_node_hash {
intermediate_rm_child(&mut children, key2.prefix[i]);
} else {
intermediate_set_child(&mut children, key2.prefix[i], subhash);
}
if children.len() == 0 {
// should not happen
warn!(
"({}) Replacing intermediate node with empty node, should not happen.",
self.data.name
);
Some(MerkleNode::Empty)
} else if children.len() == 1 {
// We now have a single node (case when the update deleted one of only two
// children). If that node is a leaf, move it to this level.
let key_sub = key.add_byte(children[0].0);
let subnode = self.read_node_txn(tx, &key_sub)?;
match subnode {
MerkleNode::Empty => {
warn!("({}) Single subnode in tree is empty Merkle node", self.data.name);
Some(MerkleNode::Empty)
}
MerkleNode::Intermediate(_) => {
Some(MerkleNode::Intermediate(children))
}
x @ MerkleNode::Leaf(_, _) => {
tx.remove(key_sub.encode())?;
Some(x)
}
}
} else {
Some(MerkleNode::Intermediate(children))
}
} else {
// Subtree not changed, nothing to do
None
}
}
MerkleNode::Leaf(exlf_k, exlf_vhash) => {
if exlf_k == k {
// This leaf is for the same key that the one we are updating
match new_vhash {
Some(vhv) if vhv == exlf_vhash => None,
Some(vhv) => Some(MerkleNode::Leaf(k.to_vec(), vhv)),
None => Some(MerkleNode::Empty),
}
} else {
// This is an only leaf for another key
if new_vhash.is_some() {
// Move that other key to a subnode, create another subnode for our
// insertion and replace current node by an intermediary node
let mut int = vec![];
let exlf_khash = blake2sum(&exlf_k[..]);
assert_eq!(khash.as_slice()[..i], exlf_khash.as_slice()[..i]);
{
let exlf_subkey = key.next_key(&exlf_khash);
let exlf_sub_hash = self.update_item_rec(tx, &exlf_k[..], &exlf_khash, &exlf_subkey, Some(exlf_vhash))?.unwrap();
intermediate_set_child(&mut int, exlf_subkey.prefix[i], exlf_sub_hash);
assert_eq!(int.len(), 1);
}
{
let key2 = key.next_key(khash);
let subhash = self.update_item_rec(tx, k, khash, &key2, new_vhash)?.unwrap();
intermediate_set_child(&mut int, key2.prefix[i], subhash);
if exlf_khash.as_slice()[i] == khash.as_slice()[i] {
assert_eq!(int.len(), 1);
} else {
assert_eq!(int.len(), 2);
}
}
Some(MerkleNode::Intermediate(int))
} else {
// Nothing to do, we don't want to insert this value because it is None,
// and we don't want to change the other value because it's for something
// else
None
}
}
}
};
if let Some(new_node) = mutate {
let hash = self.put_node_txn(tx, &key, &new_node)?;
Ok(Some(hash))
} else {
Ok(None)
}
}
// Merkle tree node manipulation
fn read_node_txn(
&self,
tx: &TransactionalTree,
k: &MerkleNodeKey,
) -> ConflictableTransactionResult<MerkleNode, Error> {
let ent = tx.get(k.encode())?;
MerkleNode::decode_opt(ent).map_err(ConflictableTransactionError::Abort)
}
fn put_node_txn(
&self,
tx: &TransactionalTree,
k: &MerkleNodeKey,
v: &MerkleNode,
) -> ConflictableTransactionResult<Hash, Error> {
trace!("Put Merkle node: {:?} => {:?}", k, v);
if *v == MerkleNode::Empty {
tx.remove(k.encode())?;
Ok(self.empty_node_hash)
} else {
let vby = rmp_to_vec_all_named(v)
.map_err(|e| ConflictableTransactionError::Abort(e.into()))?;
let rethash = blake2sum(&vby[..]);
tx.insert(k.encode(), vby)?;
Ok(rethash)
}
}
// Access a node in the Merkle tree, used by the sync protocol
pub(crate) fn read_node(&self, k: &MerkleNodeKey) -> Result<MerkleNode, Error> {
let ent = self.data.merkle_tree.get(k.encode())?;
MerkleNode::decode_opt(ent)
}
pub fn merkle_tree_len(&self) -> usize {
self.data.merkle_tree.len()
}
pub fn todo_len(&self) -> usize {
self.data.merkle_todo.len()
}
}
impl MerkleNodeKey {
fn encode(&self) -> Vec<u8> {
let mut ret = Vec::with_capacity(2 + self.prefix.len());
ret.extend(&u16::to_be_bytes(self.partition)[..]);
ret.extend(&self.prefix[..]);
ret
}
pub fn next_key(&self, h: &Hash) -> Self {
assert_eq!(h.as_slice()[0..self.prefix.len()], self.prefix[..]);
let mut s2 = self.clone();
s2.prefix.push(h.as_slice()[self.prefix.len()]);
s2
}
pub fn add_byte(&self, b: u8) -> Self {
let mut s2 = self.clone();
s2.prefix.push(b);
s2
}
}
impl MerkleNode {
fn decode_opt(ent: Option<sled::IVec>) -> Result<Self, Error> {
match ent {
None => Ok(MerkleNode::Empty),
Some(v) => Ok(rmp_serde::decode::from_read_ref::<_, MerkleNode>(&v[..])?),
}
}
pub fn is_empty(&self) -> bool {
*self == MerkleNode::Empty
}
}
fn intermediate_set_child(ch: &mut Vec<(u8, Hash)>, pos: u8, v: Hash) {
for i in 0..ch.len() {
if ch[i].0 == pos {
ch[i].1 = v;
return;
} else if ch[i].0 > pos {
ch.insert(i, (pos, v));
return;
}
}
ch.push((pos, v));
}
fn intermediate_rm_child(ch: &mut Vec<(u8, Hash)>, pos: u8) {
for i in 0..ch.len() {
if ch[i].0 == pos {
ch.remove(i);
return;
}
}
}
#[test]
fn test_intermediate_aux() {
let mut v = vec![];
intermediate_set_child(&mut v, 12u8, [12u8; 32].into());
assert_eq!(v, vec![(12u8, [12u8; 32].into())]);
intermediate_set_child(&mut v, 42u8, [42u8; 32].into());
assert_eq!(
v,
vec![(12u8, [12u8; 32].into()), (42u8, [42u8; 32].into())]
);
intermediate_set_child(&mut v, 4u8, [4u8; 32].into());
assert_eq!(
v,
vec![
(4u8, [4u8; 32].into()),
(12u8, [12u8; 32].into()),
(42u8, [42u8; 32].into())
]
);
intermediate_set_child(&mut v, 12u8, [8u8; 32].into());
assert_eq!(
v,
vec![
(4u8, [4u8; 32].into()),
(12u8, [8u8; 32].into()),
(42u8, [42u8; 32].into())
]
);
intermediate_set_child(&mut v, 6u8, [6u8; 32].into());
assert_eq!(
v,
vec![
(4u8, [4u8; 32].into()),
(6u8, [6u8; 32].into()),
(12u8, [8u8; 32].into()),
(42u8, [42u8; 32].into())
]
);
intermediate_rm_child(&mut v, 42u8);
assert_eq!(
v,
vec![
(4u8, [4u8; 32].into()),
(6u8, [6u8; 32].into()),
(12u8, [8u8; 32].into())
]
);
intermediate_rm_child(&mut v, 11u8);
assert_eq!(
v,
vec![
(4u8, [4u8; 32].into()),
(6u8, [6u8; 32].into()),
(12u8, [8u8; 32].into())
]
);
intermediate_rm_child(&mut v, 6u8);
assert_eq!(v, vec![(4u8, [4u8; 32].into()), (12u8, [8u8; 32].into())]);
intermediate_set_child(&mut v, 6u8, [7u8; 32].into());
assert_eq!(
v,
vec![
(4u8, [4u8; 32].into()),
(6u8, [7u8; 32].into()),
(12u8, [8u8; 32].into())
]
);
}

View file

@ -0,0 +1,51 @@
use std::sync::Arc;
use garage_rpc::membership::System;
use garage_rpc::ring::*;
use garage_util::data::*;
use crate::replication::*;
#[derive(Clone)]
pub struct TableFullReplication {
pub system: Arc<System>,
pub max_faults: usize,
}
impl TableReplication for TableFullReplication {
// Full replication schema: all nodes store everything
// Writes are disseminated in an epidemic manner in the network
// Advantage: do all reads locally, extremely fast
// Inconvenient: only suitable to reasonably small tables
fn read_nodes(&self, _hash: &Hash) -> Vec<UUID> {
vec![self.system.id]
}
fn read_quorum(&self) -> usize {
1
}
fn write_nodes(&self, _hash: &Hash) -> Vec<UUID> {
let ring = self.system.ring.borrow();
ring.config.members.keys().cloned().collect::<Vec<_>>()
}
fn write_quorum(&self) -> usize {
let nmembers = self.system.ring.borrow().config.members.len();
if nmembers > self.max_faults {
nmembers - self.max_faults
} else {
1
}
}
fn max_write_errors(&self) -> usize {
self.max_faults
}
fn partition_of(&self, _hash: &Hash) -> Partition {
0u16
}
fn partitions(&self) -> Vec<(Partition, Hash)> {
vec![(0u16, [0u8; 32].into())]
}
}

View file

@ -0,0 +1,6 @@
mod parameters;
pub mod fullcopy;
pub mod sharded;
pub use parameters::*;

View file

@ -0,0 +1,21 @@
use garage_rpc::ring::*;
use garage_util::data::*;
pub trait TableReplication: Send + Sync {
// See examples in table_sharded.rs and table_fullcopy.rs
// To understand various replication methods
// Which nodes to send reads from
fn read_nodes(&self, hash: &Hash) -> Vec<UUID>;
fn read_quorum(&self) -> usize;
// Which nodes to send writes to
fn write_nodes(&self, hash: &Hash) -> Vec<UUID>;
fn write_quorum(&self) -> usize;
fn max_write_errors(&self) -> usize;
// Accessing partitions, for Merkle tree & sync
fn partition_of(&self, hash: &Hash) -> Partition;
fn partitions(&self) -> Vec<(Partition, Hash)>;
}

View file

@ -1,11 +1,14 @@
use std::sync::Arc;
use garage_rpc::membership::System; use garage_rpc::membership::System;
use garage_rpc::ring::Ring; use garage_rpc::ring::*;
use garage_util::data::*; use garage_util::data::*;
use crate::*; use crate::replication::*;
#[derive(Clone)] #[derive(Clone)]
pub struct TableShardedReplication { pub struct TableShardedReplication {
pub system: Arc<System>,
pub replication_factor: usize, pub replication_factor: usize,
pub read_quorum: usize, pub read_quorum: usize,
pub write_quorum: usize, pub write_quorum: usize,
@ -19,35 +22,29 @@ impl TableReplication for TableShardedReplication {
// - reads are done on all of the nodes that replicate the data // - reads are done on all of the nodes that replicate the data
// - writes as well // - writes as well
fn read_nodes(&self, hash: &Hash, system: &System) -> Vec<UUID> { fn read_nodes(&self, hash: &Hash) -> Vec<UUID> {
let ring = system.ring.borrow().clone(); let ring = self.system.ring.borrow().clone();
ring.walk_ring(&hash, self.replication_factor) ring.walk_ring(&hash, self.replication_factor)
} }
fn read_quorum(&self) -> usize { fn read_quorum(&self) -> usize {
self.read_quorum self.read_quorum
} }
fn write_nodes(&self, hash: &Hash, system: &System) -> Vec<UUID> { fn write_nodes(&self, hash: &Hash) -> Vec<UUID> {
let ring = system.ring.borrow().clone(); let ring = self.system.ring.borrow();
ring.walk_ring(&hash, self.replication_factor) ring.walk_ring(&hash, self.replication_factor)
} }
fn write_quorum(&self, _system: &System) -> usize { fn write_quorum(&self) -> usize {
self.write_quorum self.write_quorum
} }
fn max_write_errors(&self) -> usize { fn max_write_errors(&self) -> usize {
self.replication_factor - self.write_quorum self.replication_factor - self.write_quorum
} }
fn replication_nodes(&self, hash: &Hash, ring: &Ring) -> Vec<UUID> { fn partition_of(&self, hash: &Hash) -> Partition {
ring.walk_ring(&hash, self.replication_factor) self.system.ring.borrow().partition_of(hash)
} }
fn split_points(&self, ring: &Ring) -> Vec<Hash> { fn partitions(&self) -> Vec<(Partition, Hash)> {
let mut ret = vec![]; self.system.ring.borrow().partitions()
for entry in ring.ring.iter() {
ret.push(entry.location);
}
ret.push([0xFFu8; 32].into());
ret
} }
} }

View file

@ -2,13 +2,15 @@ use serde::{Deserialize, Serialize};
use garage_util::data::*; use garage_util::data::*;
use crate::crdt::CRDT;
pub trait PartitionKey { pub trait PartitionKey {
fn hash(&self) -> Hash; fn hash(&self) -> Hash;
} }
impl PartitionKey for String { impl PartitionKey for String {
fn hash(&self) -> Hash { fn hash(&self) -> Hash {
sha256sum(self.as_bytes()) blake2sum(self.as_bytes())
} }
} }
@ -35,12 +37,14 @@ impl SortKey for Hash {
} }
pub trait Entry<P: PartitionKey, S: SortKey>: pub trait Entry<P: PartitionKey, S: SortKey>:
PartialEq + Clone + Serialize + for<'de> Deserialize<'de> + Send + Sync CRDT + PartialEq + Clone + Serialize + for<'de> Deserialize<'de> + Send + Sync
{ {
fn partition_key(&self) -> &P; fn partition_key(&self) -> &P;
fn sort_key(&self) -> &S; fn sort_key(&self) -> &S;
fn merge(&mut self, other: &Self); fn is_tombstone(&self) -> bool {
false
}
} }
pub trait TableSchema: Send + Sync { pub trait TableSchema: Send + Sync {

614
src/table/sync.rs Normal file
View file

@ -0,0 +1,614 @@
use std::collections::VecDeque;
use std::sync::{Arc, Mutex};
use std::time::{Duration, Instant};
use futures::select;
use futures_util::future::*;
use futures_util::stream::*;
use rand::Rng;
use serde::{Deserialize, Serialize};
use serde_bytes::ByteBuf;
use tokio::sync::{mpsc, watch};
use garage_util::data::*;
use garage_util::error::Error;
use garage_rpc::membership::System;
use garage_rpc::ring::*;
use garage_rpc::rpc_client::*;
use garage_rpc::rpc_server::*;
use crate::data::*;
use crate::merkle::*;
use crate::replication::*;
use crate::*;
const TABLE_SYNC_RPC_TIMEOUT: Duration = Duration::from_secs(30);
// Do anti-entropy every 10 minutes
const ANTI_ENTROPY_INTERVAL: Duration = Duration::from_secs(10 * 60);
pub struct TableSyncer<F: TableSchema, R: TableReplication> {
system: Arc<System>,
data: Arc<TableData<F, R>>,
merkle: Arc<MerkleUpdater<F, R>>,
todo: Mutex<SyncTodo>,
rpc_client: Arc<RpcClient<SyncRPC>>,
}
#[derive(Serialize, Deserialize)]
pub(crate) enum SyncRPC {
RootCkHash(Partition, Hash),
RootCkDifferent(bool),
GetNode(MerkleNodeKey),
Node(MerkleNodeKey, MerkleNode),
Items(Vec<Arc<ByteBuf>>),
Ok,
}
impl RpcMessage for SyncRPC {}
struct SyncTodo {
todo: Vec<TodoPartition>,
}
#[derive(Debug, Clone)]
struct TodoPartition {
partition: Partition,
begin: Hash,
end: Hash,
// Are we a node that stores this partition or not?
retain: bool,
}
impl<F, R> TableSyncer<F, R>
where
F: TableSchema + 'static,
R: TableReplication + 'static,
{
pub(crate) fn launch(
system: Arc<System>,
data: Arc<TableData<F, R>>,
merkle: Arc<MerkleUpdater<F, R>>,
rpc_server: &mut RpcServer,
) -> Arc<Self> {
let rpc_path = format!("table_{}/sync", data.name);
let rpc_client = system.rpc_client::<SyncRPC>(&rpc_path);
let todo = SyncTodo { todo: vec![] };
let syncer = Arc::new(Self {
system: system.clone(),
data: data.clone(),
merkle,
todo: Mutex::new(todo),
rpc_client,
});
syncer.register_handler(rpc_server, rpc_path);
let (busy_tx, busy_rx) = mpsc::unbounded_channel();
let s1 = syncer.clone();
system.background.spawn_worker(
format!("table sync watcher for {}", data.name),
move |must_exit: watch::Receiver<bool>| s1.watcher_task(must_exit, busy_rx),
);
let s2 = syncer.clone();
system.background.spawn_worker(
format!("table syncer for {}", data.name),
move |must_exit: watch::Receiver<bool>| s2.syncer_task(must_exit, busy_tx),
);
let s3 = syncer.clone();
tokio::spawn(async move {
tokio::time::sleep(Duration::from_secs(20)).await;
s3.add_full_sync();
});
syncer
}
fn register_handler(self: &Arc<Self>, rpc_server: &mut RpcServer, path: String) {
let self2 = self.clone();
rpc_server.add_handler::<SyncRPC, _, _>(path, move |msg, _addr| {
let self2 = self2.clone();
async move { self2.handle_rpc(&msg).await }
});
let self2 = self.clone();
self.rpc_client
.set_local_handler(self.system.id, move |msg| {
let self2 = self2.clone();
async move { self2.handle_rpc(&msg).await }
});
}
async fn watcher_task(
self: Arc<Self>,
mut must_exit: watch::Receiver<bool>,
mut busy_rx: mpsc::UnboundedReceiver<bool>,
) {
let mut prev_ring: Arc<Ring> = self.system.ring.borrow().clone();
let mut ring_recv: watch::Receiver<Arc<Ring>> = self.system.ring.clone();
let mut nothing_to_do_since = Some(Instant::now());
while !*must_exit.borrow() {
select! {
_ = ring_recv.changed().fuse() => {
let new_ring = ring_recv.borrow();
if !Arc::ptr_eq(&new_ring, &prev_ring) {
debug!("({}) Ring changed, adding full sync to syncer todo list", self.data.name);
self.add_full_sync();
prev_ring = new_ring.clone();
}
}
busy_opt = busy_rx.recv().fuse() => {
if let Some(busy) = busy_opt {
if busy {
nothing_to_do_since = None;
} else {
if nothing_to_do_since.is_none() {
nothing_to_do_since = Some(Instant::now());
}
}
}
}
_ = must_exit.changed().fuse() => (),
_ = tokio::time::sleep(Duration::from_secs(1)).fuse() => {
if nothing_to_do_since.map(|t| Instant::now() - t >= ANTI_ENTROPY_INTERVAL).unwrap_or(false) {
nothing_to_do_since = None;
debug!("({}) Interval passed, adding full sync to syncer todo list", self.data.name);
self.add_full_sync();
}
}
}
}
}
pub fn add_full_sync(&self) {
self.todo
.lock()
.unwrap()
.add_full_sync(&self.data, &self.system);
}
async fn syncer_task(
self: Arc<Self>,
mut must_exit: watch::Receiver<bool>,
busy_tx: mpsc::UnboundedSender<bool>,
) {
while !*must_exit.borrow() {
let task = self.todo.lock().unwrap().pop_task();
if let Some(partition) = task {
busy_tx.send(true).unwrap();
let res = self
.clone()
.sync_partition(&partition, &mut must_exit)
.await;
if let Err(e) = res {
warn!(
"({}) Error while syncing {:?}: {}",
self.data.name, partition, e
);
}
} else {
busy_tx.send(false).unwrap();
tokio::time::sleep(Duration::from_secs(1)).await;
}
}
}
async fn sync_partition(
self: Arc<Self>,
partition: &TodoPartition,
must_exit: &mut watch::Receiver<bool>,
) -> Result<(), Error> {
if partition.retain {
let my_id = self.system.id;
let nodes = self
.data
.replication
.write_nodes(&partition.begin)
.into_iter()
.filter(|node| *node != my_id)
.collect::<Vec<_>>();
debug!(
"({}) Syncing {:?} with {:?}...",
self.data.name, partition, nodes
);
let mut sync_futures = nodes
.iter()
.map(|node| {
self.clone()
.do_sync_with(partition.clone(), *node, must_exit.clone())
})
.collect::<FuturesUnordered<_>>();
let mut n_errors = 0;
while let Some(r) = sync_futures.next().await {
if let Err(e) = r {
n_errors += 1;
warn!("({}) Sync error: {}", self.data.name, e);
}
}
if n_errors > self.data.replication.max_write_errors() {
return Err(Error::Message(format!(
"Sync failed with too many nodes (should have been: {:?}).",
nodes
)));
}
} else {
self.offload_partition(&partition.begin, &partition.end, must_exit)
.await?;
}
Ok(())
}
// Offload partition: this partition is not something we are storing,
// so send it out to all other nodes that store it and delete items locally.
// We don't bother checking if the remote nodes already have the items,
// we just batch-send everything. Offloading isn't supposed to happen very often.
// If any of the nodes that are supposed to store the items is unable to
// save them, we interrupt the process.
async fn offload_partition(
self: &Arc<Self>,
begin: &Hash,
end: &Hash,
must_exit: &mut watch::Receiver<bool>,
) -> Result<(), Error> {
let mut counter: usize = 0;
while !*must_exit.borrow() {
let mut items = Vec::new();
for item in self.data.store.range(begin.to_vec()..end.to_vec()) {
let (key, value) = item?;
items.push((key.to_vec(), Arc::new(ByteBuf::from(value.as_ref()))));
if items.len() >= 1024 {
break;
}
}
if items.len() > 0 {
let nodes = self
.data
.replication
.write_nodes(&begin)
.into_iter()
.collect::<Vec<_>>();
if nodes.contains(&self.system.id) {
warn!(
"({}) Interrupting offload as partitions seem to have changed",
self.data.name
);
break;
}
if nodes.len() < self.data.replication.write_quorum() {
return Err(Error::Message(format!(
"Not offloading as we don't have a quorum of nodes to write to."
)));
}
counter += 1;
info!(
"({}) Offloading {} items from {:?}..{:?} ({})",
self.data.name,
items.len(),
begin,
end,
counter
);
self.offload_items(&items, &nodes[..]).await?;
} else {
break;
}
}
Ok(())
}
async fn offload_items(
self: &Arc<Self>,
items: &Vec<(Vec<u8>, Arc<ByteBuf>)>,
nodes: &[UUID],
) -> Result<(), Error> {
let values = items.iter().map(|(_k, v)| v.clone()).collect::<Vec<_>>();
self.rpc_client
.try_call_many(
&nodes[..],
SyncRPC::Items(values),
RequestStrategy::with_quorum(nodes.len()).with_timeout(TABLE_SYNC_RPC_TIMEOUT),
)
.await?;
// All remote nodes have written those items, now we can delete them locally
let mut not_removed = 0;
for (k, v) in items.iter() {
if !self.data.delete_if_equal(&k[..], &v[..])? {
not_removed += 1;
}
}
if not_removed > 0 {
debug!("({}) {} items not removed during offload because they changed in between (trying again...)", self.data.name, not_removed);
}
Ok(())
}
// ======= SYNCHRONIZATION PROCEDURE -- DRIVER SIDE ======
// The driver side is only concerned with sending out the item it has
// and the other side might not have. Receiving items that differ from one
// side to the other will happen when the other side syncs with us,
// which they also do regularly.
fn get_root_ck(&self, partition: Partition) -> Result<(MerkleNodeKey, MerkleNode), Error> {
let key = MerkleNodeKey {
partition,
prefix: vec![],
};
let node = self.merkle.read_node(&key)?;
Ok((key, node))
}
async fn do_sync_with(
self: Arc<Self>,
partition: TodoPartition,
who: UUID,
must_exit: watch::Receiver<bool>,
) -> Result<(), Error> {
let (root_ck_key, root_ck) = self.get_root_ck(partition.partition)?;
if root_ck.is_empty() {
debug!(
"({}) Sync {:?} with {:?}: partition is empty.",
self.data.name, partition, who
);
return Ok(());
}
let root_ck_hash = hash_of::<MerkleNode>(&root_ck)?;
// Check if they have the same root checksum
// If so, do nothing.
let root_resp = self
.rpc_client
.call(
who,
SyncRPC::RootCkHash(partition.partition, root_ck_hash),
TABLE_SYNC_RPC_TIMEOUT,
)
.await?;
let mut todo = match root_resp {
SyncRPC::RootCkDifferent(false) => {
debug!(
"({}) Sync {:?} with {:?}: no difference",
self.data.name, partition, who
);
return Ok(());
}
SyncRPC::RootCkDifferent(true) => VecDeque::from(vec![root_ck_key]),
x => {
return Err(Error::Message(format!(
"Invalid respone to RootCkHash RPC: {}",
debug_serialize(x)
)));
}
};
let mut todo_items = vec![];
while !todo.is_empty() && !*must_exit.borrow() {
let key = todo.pop_front().unwrap();
let node = self.merkle.read_node(&key)?;
match node {
MerkleNode::Empty => {
// They have items we don't have.
// We don't request those items from them, they will send them.
// We only bother with pushing items that differ
}
MerkleNode::Leaf(ik, ivhash) => {
// Just send that item directly
if let Some(val) = self.data.store.get(&ik[..])? {
if blake2sum(&val[..]) != ivhash {
warn!("({}) Hashes differ between stored value and Merkle tree, key: {:?} (if your server is very busy, don't worry, this happens when the Merkle tree can't be updated fast enough)", self.data.name, ik);
}
todo_items.push(val.to_vec());
} else {
warn!("({}) Item from Merkle tree not found in store: {:?} (if your server is very busy, don't worry, this happens when the Merkle tree can't be updated fast enough)", self.data.name, ik);
}
}
MerkleNode::Intermediate(l) => {
// Get Merkle node for this tree position at remote node
// and compare it with local node
let remote_node = match self
.rpc_client
.call(who, SyncRPC::GetNode(key.clone()), TABLE_SYNC_RPC_TIMEOUT)
.await?
{
SyncRPC::Node(_, node) => node,
x => {
return Err(Error::Message(format!(
"Invalid respone to GetNode RPC: {}",
debug_serialize(x)
)));
}
};
let int_l2 = match remote_node {
// If they have an intermediate node at this tree position,
// we can compare them to find differences
MerkleNode::Intermediate(l2) => l2,
// Otherwise, treat it as if they have nothing for this subtree,
// which will have the consequence of sending them everything
_ => vec![],
};
let join = join_ordered(&l[..], &int_l2[..]);
for (p, v1, v2) in join.into_iter() {
let diff = match (v1, v2) {
(Some(_), None) | (None, Some(_)) => true,
(Some(a), Some(b)) => a != b,
_ => false,
};
if diff {
todo.push_back(key.add_byte(*p));
}
}
}
}
if todo_items.len() >= 256 {
self.send_items(who, std::mem::replace(&mut todo_items, vec![]))
.await?;
}
}
if !todo_items.is_empty() {
self.send_items(who, todo_items).await?;
}
Ok(())
}
async fn send_items(&self, who: UUID, item_value_list: Vec<Vec<u8>>) -> Result<(), Error> {
info!(
"({}) Sending {} items to {:?}",
self.data.name,
item_value_list.len(),
who
);
let values = item_value_list
.into_iter()
.map(|x| Arc::new(ByteBuf::from(x)))
.collect::<Vec<_>>();
let rpc_resp = self
.rpc_client
.call(who, SyncRPC::Items(values), TABLE_SYNC_RPC_TIMEOUT)
.await?;
if let SyncRPC::Ok = rpc_resp {
Ok(())
} else {
Err(Error::Message(format!(
"Unexpected response to RPC Update: {}",
debug_serialize(&rpc_resp)
)))
}
}
// ======= SYNCHRONIZATION PROCEDURE -- RECEIVER SIDE ======
async fn handle_rpc(self: &Arc<Self>, message: &SyncRPC) -> Result<SyncRPC, Error> {
match message {
SyncRPC::RootCkHash(range, h) => {
let (_root_ck_key, root_ck) = self.get_root_ck(*range)?;
let hash = hash_of::<MerkleNode>(&root_ck)?;
Ok(SyncRPC::RootCkDifferent(hash != *h))
}
SyncRPC::GetNode(k) => {
let node = self.merkle.read_node(&k)?;
Ok(SyncRPC::Node(k.clone(), node))
}
SyncRPC::Items(items) => {
self.data.update_many(items)?;
Ok(SyncRPC::Ok)
}
_ => Err(Error::Message(format!("Unexpected sync RPC"))),
}
}
}
impl SyncTodo {
fn add_full_sync<F: TableSchema, R: TableReplication>(
&mut self,
data: &TableData<F, R>,
system: &System,
) {
let my_id = system.id;
self.todo.clear();
let partitions = data.replication.partitions();
for i in 0..partitions.len() {
let begin = partitions[i].1;
let end = if i + 1 < partitions.len() {
partitions[i + 1].1
} else {
[0xFFu8; 32].into()
};
let nodes = data.replication.write_nodes(&begin);
let retain = nodes.contains(&my_id);
if !retain {
// Check if we have some data to send, otherwise skip
if data.store.range(begin..end).next().is_none() {
continue;
}
}
self.todo.push(TodoPartition {
partition: partitions[i].0,
begin,
end,
retain,
});
}
}
fn pop_task(&mut self) -> Option<TodoPartition> {
if self.todo.is_empty() {
return None;
}
let i = rand::thread_rng().gen_range(0..self.todo.len());
if i == self.todo.len() - 1 {
self.todo.pop()
} else {
let replacement = self.todo.pop().unwrap();
let ret = std::mem::replace(&mut self.todo[i], replacement);
Some(ret)
}
}
}
fn hash_of<T: Serialize>(x: &T) -> Result<Hash, Error> {
Ok(blake2sum(&rmp_to_vec_all_named(x)?[..]))
}
fn join_ordered<'a, K: Ord + Eq, V1, V2>(
x: &'a [(K, V1)],
y: &'a [(K, V2)],
) -> Vec<(&'a K, Option<&'a V1>, Option<&'a V2>)> {
let mut ret = vec![];
let mut i = 0;
let mut j = 0;
while i < x.len() || j < y.len() {
if i < x.len() && j < y.len() && x[i].0 == y[j].0 {
ret.push((&x[i].0, Some(&x[i].1), Some(&y[j].1)));
i += 1;
j += 1;
} else if i < x.len() && (j == y.len() || x[i].0 < y[j].0) {
ret.push((&x[i].0, Some(&x[i].1), None));
i += 1;
} else if j < y.len() && (i == x.len() || x[i].0 > y[j].0) {
ret.push((&y[j].0, None, Some(&y[j].1)));
j += 1;
} else {
unreachable!();
}
}
ret
}

View file

@ -2,9 +2,6 @@ use std::collections::{BTreeMap, HashMap};
use std::sync::Arc; use std::sync::Arc;
use std::time::Duration; use std::time::Duration;
use log::warn;
use arc_swap::ArcSwapOption;
use futures::stream::*; use futures::stream::*;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use serde_bytes::ByteBuf; use serde_bytes::ByteBuf;
@ -13,25 +10,25 @@ use garage_util::data::*;
use garage_util::error::Error; use garage_util::error::Error;
use garage_rpc::membership::System; use garage_rpc::membership::System;
use garage_rpc::ring::Ring;
use garage_rpc::rpc_client::*; use garage_rpc::rpc_client::*;
use garage_rpc::rpc_server::*; use garage_rpc::rpc_server::*;
use crate::crdt::CRDT;
use crate::data::*;
use crate::gc::*;
use crate::merkle::*;
use crate::replication::*;
use crate::schema::*; use crate::schema::*;
use crate::table_sync::*; use crate::sync::*;
const TABLE_RPC_TIMEOUT: Duration = Duration::from_secs(10); const TABLE_RPC_TIMEOUT: Duration = Duration::from_secs(10);
pub struct Table<F: TableSchema, R: TableReplication> { pub struct Table<F: TableSchema, R: TableReplication> {
pub instance: F,
pub replication: R,
pub name: String,
pub(crate) rpc_client: Arc<RpcClient<TableRPC<F>>>,
pub system: Arc<System>, pub system: Arc<System>,
pub store: sled::Tree, pub data: Arc<TableData<F, R>>,
pub syncer: ArcSwapOption<TableSyncer<F, R>>, pub merkle_updater: Arc<MerkleUpdater<F, R>>,
pub syncer: Arc<TableSyncer<F, R>>,
rpc_client: Arc<RpcClient<TableRPC<F>>>,
} }
#[derive(Serialize, Deserialize)] #[derive(Serialize, Deserialize)]
@ -45,30 +42,10 @@ pub(crate) enum TableRPC<F: TableSchema> {
ReadRange(F::P, Option<F::S>, Option<F::Filter>, usize), ReadRange(F::P, Option<F::S>, Option<F::Filter>, usize),
Update(Vec<Arc<ByteBuf>>), Update(Vec<Arc<ByteBuf>>),
SyncRPC(SyncRPC),
} }
impl<F: TableSchema> RpcMessage for TableRPC<F> {} impl<F: TableSchema> RpcMessage for TableRPC<F> {}
pub trait TableReplication: Send + Sync {
// See examples in table_sharded.rs and table_fullcopy.rs
// To understand various replication methods
// Which nodes to send reads from
fn read_nodes(&self, hash: &Hash, system: &System) -> Vec<UUID>;
fn read_quorum(&self) -> usize;
// Which nodes to send writes to
fn write_nodes(&self, hash: &Hash, system: &System) -> Vec<UUID>;
fn write_quorum(&self, system: &System) -> usize;
fn max_write_errors(&self) -> usize;
// Which are the nodes that do actually replicate the data
fn replication_nodes(&self, hash: &Hash, ring: &Ring) -> Vec<UUID>;
fn split_points(&self, ring: &Ring) -> Vec<Hash>;
}
impl<F, R> Table<F, R> impl<F, R> Table<F, R>
where where
F: TableSchema + 'static, F: TableSchema + 'static,
@ -76,7 +53,7 @@ where
{ {
// =============== PUBLIC INTERFACE FUNCTIONS (new, insert, get, etc) =============== // =============== PUBLIC INTERFACE FUNCTIONS (new, insert, get, etc) ===============
pub async fn new( pub fn new(
instance: F, instance: F,
replication: R, replication: R,
system: Arc<System>, system: Arc<System>,
@ -84,31 +61,37 @@ where
name: String, name: String,
rpc_server: &mut RpcServer, rpc_server: &mut RpcServer,
) -> Arc<Self> { ) -> Arc<Self> {
let store = db.open_tree(&name).expect("Unable to open DB tree");
let rpc_path = format!("table_{}", name); let rpc_path = format!("table_{}", name);
let rpc_client = system.rpc_client::<TableRPC<F>>(&rpc_path); let rpc_client = system.rpc_client::<TableRPC<F>>(&rpc_path);
let table = Arc::new(Self { let data = TableData::new(system.clone(), name, instance, replication, db);
instance,
replication,
name,
rpc_client,
system,
store,
syncer: ArcSwapOption::from(None),
});
table.clone().register_handler(rpc_server, rpc_path);
let syncer = TableSyncer::launch(table.clone()).await; let merkle_updater = MerkleUpdater::launch(&system.background, data.clone());
table.syncer.swap(Some(syncer));
let syncer = TableSyncer::launch(
system.clone(),
data.clone(),
merkle_updater.clone(),
rpc_server,
);
TableGC::launch(system.clone(), data.clone(), rpc_server);
let table = Arc::new(Self {
system,
data,
merkle_updater,
syncer,
rpc_client,
});
table.clone().register_handler(rpc_server, rpc_path);
table table
} }
pub async fn insert(&self, e: &F::E) -> Result<(), Error> { pub async fn insert(&self, e: &F::E) -> Result<(), Error> {
let hash = e.partition_key().hash(); let hash = e.partition_key().hash();
let who = self.replication.write_nodes(&hash, &self.system); let who = self.data.replication.write_nodes(&hash);
//eprintln!("insert who: {:?}", who); //eprintln!("insert who: {:?}", who);
let e_enc = Arc::new(ByteBuf::from(rmp_to_vec_all_named(e)?)); let e_enc = Arc::new(ByteBuf::from(rmp_to_vec_all_named(e)?));
@ -118,7 +101,7 @@ where
.try_call_many( .try_call_many(
&who[..], &who[..],
rpc, rpc,
RequestStrategy::with_quorum(self.replication.write_quorum(&self.system)) RequestStrategy::with_quorum(self.data.replication.write_quorum())
.with_timeout(TABLE_RPC_TIMEOUT), .with_timeout(TABLE_RPC_TIMEOUT),
) )
.await?; .await?;
@ -130,7 +113,7 @@ where
for entry in entries.iter() { for entry in entries.iter() {
let hash = entry.partition_key().hash(); let hash = entry.partition_key().hash();
let who = self.replication.write_nodes(&hash, &self.system); let who = self.data.replication.write_nodes(&hash);
let e_enc = Arc::new(ByteBuf::from(rmp_to_vec_all_named(entry)?)); let e_enc = Arc::new(ByteBuf::from(rmp_to_vec_all_named(entry)?));
for node in who { for node in who {
if !call_list.contains_key(&node) { if !call_list.contains_key(&node) {
@ -154,7 +137,7 @@ where
errors.push(e); errors.push(e);
} }
} }
if errors.len() > self.replication.max_write_errors() { if errors.len() > self.data.replication.max_write_errors() {
Err(Error::Message("Too many errors".into())) Err(Error::Message("Too many errors".into()))
} else { } else {
Ok(()) Ok(())
@ -167,7 +150,7 @@ where
sort_key: &F::S, sort_key: &F::S,
) -> Result<Option<F::E>, Error> { ) -> Result<Option<F::E>, Error> {
let hash = partition_key.hash(); let hash = partition_key.hash();
let who = self.replication.read_nodes(&hash, &self.system); let who = self.data.replication.read_nodes(&hash);
//eprintln!("get who: {:?}", who); //eprintln!("get who: {:?}", who);
let rpc = TableRPC::<F>::ReadEntry(partition_key.clone(), sort_key.clone()); let rpc = TableRPC::<F>::ReadEntry(partition_key.clone(), sort_key.clone());
@ -176,7 +159,7 @@ where
.try_call_many( .try_call_many(
&who[..], &who[..],
rpc, rpc,
RequestStrategy::with_quorum(self.replication.read_quorum()) RequestStrategy::with_quorum(self.data.replication.read_quorum())
.with_timeout(TABLE_RPC_TIMEOUT) .with_timeout(TABLE_RPC_TIMEOUT)
.interrupt_after_quorum(true), .interrupt_after_quorum(true),
) )
@ -187,7 +170,7 @@ where
for resp in resps { for resp in resps {
if let TableRPC::ReadEntryResponse(value) = resp { if let TableRPC::ReadEntryResponse(value) = resp {
if let Some(v_bytes) = value { if let Some(v_bytes) = value {
let v = self.decode_entry(v_bytes.as_slice())?; let v = self.data.decode_entry(v_bytes.as_slice())?;
ret = match ret { ret = match ret {
None => Some(v), None => Some(v),
Some(mut x) => { Some(mut x) => {
@ -223,7 +206,7 @@ where
limit: usize, limit: usize,
) -> Result<Vec<F::E>, Error> { ) -> Result<Vec<F::E>, Error> {
let hash = partition_key.hash(); let hash = partition_key.hash();
let who = self.replication.read_nodes(&hash, &self.system); let who = self.data.replication.read_nodes(&hash);
let rpc = TableRPC::<F>::ReadRange(partition_key.clone(), begin_sort_key, filter, limit); let rpc = TableRPC::<F>::ReadRange(partition_key.clone(), begin_sort_key, filter, limit);
@ -232,7 +215,7 @@ where
.try_call_many( .try_call_many(
&who[..], &who[..],
rpc, rpc,
RequestStrategy::with_quorum(self.replication.read_quorum()) RequestStrategy::with_quorum(self.data.replication.read_quorum())
.with_timeout(TABLE_RPC_TIMEOUT) .with_timeout(TABLE_RPC_TIMEOUT)
.interrupt_after_quorum(true), .interrupt_after_quorum(true),
) )
@ -243,8 +226,8 @@ where
for resp in resps { for resp in resps {
if let TableRPC::Update(entries) = resp { if let TableRPC::Update(entries) = resp {
for entry_bytes in entries.iter() { for entry_bytes in entries.iter() {
let entry = self.decode_entry(entry_bytes.as_slice())?; let entry = self.data.decode_entry(entry_bytes.as_slice())?;
let entry_key = self.tree_key(entry.partition_key(), entry.sort_key()); let entry_key = self.data.tree_key(entry.partition_key(), entry.sort_key());
match ret.remove(&entry_key) { match ret.remove(&entry_key) {
None => { None => {
ret.insert(entry_key, Some(entry)); ret.insert(entry_key, Some(entry));
@ -313,146 +296,18 @@ where
async fn handle(self: &Arc<Self>, msg: &TableRPC<F>) -> Result<TableRPC<F>, Error> { async fn handle(self: &Arc<Self>, msg: &TableRPC<F>) -> Result<TableRPC<F>, Error> {
match msg { match msg {
TableRPC::ReadEntry(key, sort_key) => { TableRPC::ReadEntry(key, sort_key) => {
let value = self.handle_read_entry(key, sort_key)?; let value = self.data.read_entry(key, sort_key)?;
Ok(TableRPC::ReadEntryResponse(value)) Ok(TableRPC::ReadEntryResponse(value))
} }
TableRPC::ReadRange(key, begin_sort_key, filter, limit) => { TableRPC::ReadRange(key, begin_sort_key, filter, limit) => {
let values = self.handle_read_range(key, begin_sort_key, filter, *limit)?; let values = self.data.read_range(key, begin_sort_key, filter, *limit)?;
Ok(TableRPC::Update(values)) Ok(TableRPC::Update(values))
} }
TableRPC::Update(pairs) => { TableRPC::Update(pairs) => {
self.handle_update(pairs).await?; self.data.update_many(pairs)?;
Ok(TableRPC::Ok) Ok(TableRPC::Ok)
} }
TableRPC::SyncRPC(rpc) => {
let syncer = self.syncer.load_full().unwrap();
let response = syncer
.handle_rpc(rpc, self.system.background.stop_signal.clone())
.await?;
Ok(TableRPC::SyncRPC(response))
}
_ => Err(Error::BadRPC(format!("Unexpected table RPC"))), _ => Err(Error::BadRPC(format!("Unexpected table RPC"))),
} }
} }
fn handle_read_entry(&self, p: &F::P, s: &F::S) -> Result<Option<ByteBuf>, Error> {
let tree_key = self.tree_key(p, s);
if let Some(bytes) = self.store.get(&tree_key)? {
Ok(Some(ByteBuf::from(bytes.to_vec())))
} else {
Ok(None)
}
}
fn handle_read_range(
&self,
p: &F::P,
s: &Option<F::S>,
filter: &Option<F::Filter>,
limit: usize,
) -> Result<Vec<Arc<ByteBuf>>, Error> {
let partition_hash = p.hash();
let first_key = match s {
None => partition_hash.to_vec(),
Some(sk) => self.tree_key(p, sk),
};
let mut ret = vec![];
for item in self.store.range(first_key..) {
let (key, value) = item?;
if &key[..32] != partition_hash.as_slice() {
break;
}
let keep = match filter {
None => true,
Some(f) => {
let entry = self.decode_entry(value.as_ref())?;
F::matches_filter(&entry, f)
}
};
if keep {
ret.push(Arc::new(ByteBuf::from(value.as_ref())));
}
if ret.len() >= limit {
break;
}
}
Ok(ret)
}
pub async fn handle_update(self: &Arc<Self>, entries: &[Arc<ByteBuf>]) -> Result<(), Error> {
let syncer = self.syncer.load_full().unwrap();
for update_bytes in entries.iter() {
let update = self.decode_entry(update_bytes.as_slice())?;
let tree_key = self.tree_key(update.partition_key(), update.sort_key());
let (old_entry, new_entry) = self.store.transaction(|db| {
let (old_entry, new_entry) = match db.get(&tree_key)? {
Some(prev_bytes) => {
let old_entry = self
.decode_entry(&prev_bytes)
.map_err(sled::transaction::ConflictableTransactionError::Abort)?;
let mut new_entry = old_entry.clone();
new_entry.merge(&update);
(Some(old_entry), new_entry)
}
None => (None, update.clone()),
};
let new_bytes = rmp_to_vec_all_named(&new_entry)
.map_err(Error::RMPEncode)
.map_err(sled::transaction::ConflictableTransactionError::Abort)?;
db.insert(tree_key.clone(), new_bytes)?;
Ok((old_entry, new_entry))
})?;
if old_entry.as_ref() != Some(&new_entry) {
self.instance.updated(old_entry, Some(new_entry));
syncer.invalidate(&tree_key[..]);
}
}
Ok(())
}
pub(crate) fn delete_if_equal(self: &Arc<Self>, k: &[u8], v: &[u8]) -> Result<bool, Error> {
let removed = self.store.transaction(|txn| {
if let Some(cur_v) = txn.get(k)? {
if cur_v == v {
txn.remove(k)?;
return Ok(true);
}
}
Ok(false)
})?;
if removed {
let old_entry = self.decode_entry(v)?;
self.instance.updated(Some(old_entry), None);
self.syncer.load_full().unwrap().invalidate(k);
}
Ok(removed)
}
fn tree_key(&self, p: &F::P, s: &F::S) -> Vec<u8> {
let mut ret = p.hash().to_vec();
ret.extend(s.sort_key());
ret
}
fn decode_entry(&self, bytes: &[u8]) -> Result<F::E, Error> {
match rmp_serde::decode::from_read_ref::<_, F::E>(bytes) {
Ok(x) => Ok(x),
Err(e) => match F::try_migrate(bytes) {
Some(x) => Ok(x),
None => {
warn!("Unable to decode entry of {}: {}", self.name, e);
for line in hexdump::hexdump_iter(bytes) {
debug!("{}", line);
}
Err(e.into())
}
},
}
}
} }

View file

@ -1,59 +0,0 @@
use std::sync::Arc;
use garage_rpc::membership::System;
use garage_rpc::ring::Ring;
use garage_util::data::*;
use crate::*;
#[derive(Clone)]
pub struct TableFullReplication {
pub max_faults: usize,
}
#[derive(Clone)]
struct Neighbors {
ring: Arc<Ring>,
neighbors: Vec<UUID>,
}
impl TableFullReplication {
pub fn new(max_faults: usize) -> Self {
TableFullReplication { max_faults }
}
}
impl TableReplication for TableFullReplication {
// Full replication schema: all nodes store everything
// Writes are disseminated in an epidemic manner in the network
// Advantage: do all reads locally, extremely fast
// Inconvenient: only suitable to reasonably small tables
fn read_nodes(&self, _hash: &Hash, system: &System) -> Vec<UUID> {
vec![system.id]
}
fn read_quorum(&self) -> usize {
1
}
fn write_nodes(&self, hash: &Hash, system: &System) -> Vec<UUID> {
self.replication_nodes(hash, system.ring.borrow().as_ref())
}
fn write_quorum(&self, system: &System) -> usize {
system.ring.borrow().config.members.len() - self.max_faults
}
fn max_write_errors(&self) -> usize {
self.max_faults
}
fn replication_nodes(&self, _hash: &Hash, ring: &Ring) -> Vec<UUID> {
ring.config.members.keys().cloned().collect::<Vec<_>>()
}
fn split_points(&self, _ring: &Ring) -> Vec<Hash> {
let mut ret = vec![];
ret.push([0u8; 32].into());
ret.push([0xFFu8; 32].into());
ret
}
}

View file

@ -1,891 +0,0 @@
use rand::Rng;
use std::collections::{BTreeMap, VecDeque};
use std::sync::{Arc, Mutex};
use std::time::{Duration, Instant};
use futures::future::join_all;
use futures::{pin_mut, select};
use futures_util::future::*;
use futures_util::stream::*;
use serde::{Deserialize, Serialize};
use serde_bytes::ByteBuf;
use tokio::sync::{mpsc, watch};
use garage_rpc::ring::Ring;
use garage_util::data::*;
use garage_util::error::Error;
use crate::*;
const MAX_DEPTH: usize = 16;
const SCAN_INTERVAL: Duration = Duration::from_secs(3600);
const CHECKSUM_CACHE_TIMEOUT: Duration = Duration::from_secs(1800);
const TABLE_SYNC_RPC_TIMEOUT: Duration = Duration::from_secs(30);
pub struct TableSyncer<F: TableSchema, R: TableReplication> {
table: Arc<Table<F, R>>,
todo: Mutex<SyncTodo>,
cache: Vec<Mutex<BTreeMap<SyncRange, RangeChecksumCache>>>,
}
#[derive(Serialize, Deserialize)]
pub(crate) enum SyncRPC {
GetRootChecksumRange(Hash, Hash),
RootChecksumRange(SyncRange),
Checksums(Vec<RangeChecksum>),
Difference(Vec<SyncRange>, Vec<Arc<ByteBuf>>),
}
struct SyncTodo {
todo: Vec<TodoPartition>,
}
#[derive(Debug, Clone)]
struct TodoPartition {
// Partition consists in hashes between begin included and end excluded
begin: Hash,
end: Hash,
// Are we a node that stores this partition or not?
retain: bool,
}
// A SyncRange defines a query on the dataset stored by a node, in the following way:
// - all items whose key are >= `begin`
// - stopping at the first item whose key hash has at least `level` leading zero bytes (excluded)
// - except if the first item of the range has such many leading zero bytes
// - and stopping at `end` (excluded) if such an item is not found
// The checksum itself does not store all of the items in the database, only the hashes of the "sub-ranges"
// i.e. of ranges of level `level-1` that cover the same range
// (ranges of level 0 do not exist and their hash is simply the hash of the first item >= begin)
// See RangeChecksum for the struct that stores this information.
#[derive(Hash, PartialEq, Eq, Debug, Clone, Serialize, Deserialize)]
pub(crate) struct SyncRange {
begin: Vec<u8>,
end: Vec<u8>,
level: usize,
}
impl std::cmp::PartialOrd for SyncRange {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
}
}
impl std::cmp::Ord for SyncRange {
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
self.begin
.cmp(&other.begin)
.then(self.level.cmp(&other.level))
.then(self.end.cmp(&other.end))
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub(crate) struct RangeChecksum {
bounds: SyncRange,
children: Vec<(SyncRange, Hash)>,
found_limit: Option<Vec<u8>>,
#[serde(skip, default = "std::time::Instant::now")]
time: Instant,
}
#[derive(Debug, Clone)]
struct RangeChecksumCache {
hash: Option<Hash>, // None if no children
found_limit: Option<Vec<u8>>,
time: Instant,
}
impl<F, R> TableSyncer<F, R>
where
F: TableSchema + 'static,
R: TableReplication + 'static,
{
pub(crate) async fn launch(table: Arc<Table<F, R>>) -> Arc<Self> {
let todo = SyncTodo { todo: Vec::new() };
let syncer = Arc::new(TableSyncer {
table: table.clone(),
todo: Mutex::new(todo),
cache: (0..MAX_DEPTH)
.map(|_| Mutex::new(BTreeMap::new()))
.collect::<Vec<_>>(),
});
let (busy_tx, busy_rx) = mpsc::unbounded_channel();
let s1 = syncer.clone();
table
.system
.background
.spawn_worker(
format!("table sync watcher for {}", table.name),
move |must_exit: watch::Receiver<bool>| s1.watcher_task(must_exit, busy_rx),
)
.await;
let s2 = syncer.clone();
table
.system
.background
.spawn_worker(
format!("table syncer for {}", table.name),
move |must_exit: watch::Receiver<bool>| s2.syncer_task(must_exit, busy_tx),
)
.await;
let s3 = syncer.clone();
tokio::spawn(async move {
tokio::time::delay_for(Duration::from_secs(20)).await;
s3.add_full_scan().await;
});
syncer
}
async fn watcher_task(
self: Arc<Self>,
mut must_exit: watch::Receiver<bool>,
mut busy_rx: mpsc::UnboundedReceiver<bool>,
) -> Result<(), Error> {
let mut prev_ring: Arc<Ring> = self.table.system.ring.borrow().clone();
let mut ring_recv: watch::Receiver<Arc<Ring>> = self.table.system.ring.clone();
let mut nothing_to_do_since = Some(Instant::now());
while !*must_exit.borrow() {
let s_ring_recv = ring_recv.recv().fuse();
let s_busy = busy_rx.recv().fuse();
let s_must_exit = must_exit.recv().fuse();
let s_timeout = tokio::time::delay_for(Duration::from_secs(1)).fuse();
pin_mut!(s_ring_recv, s_busy, s_must_exit, s_timeout);
select! {
new_ring_r = s_ring_recv => {
if let Some(new_ring) = new_ring_r {
debug!("({}) Adding ring difference to syncer todo list", self.table.name);
self.todo.lock().unwrap().add_ring_difference(&self.table, &prev_ring, &new_ring);
prev_ring = new_ring;
}
}
busy_opt = s_busy => {
if let Some(busy) = busy_opt {
if busy {
nothing_to_do_since = None;
} else {
if nothing_to_do_since.is_none() {
nothing_to_do_since = Some(Instant::now());
}
}
}
}
must_exit_v = s_must_exit => {
if must_exit_v.unwrap_or(false) {
break;
}
}
_ = s_timeout => {
if nothing_to_do_since.map(|t| Instant::now() - t >= SCAN_INTERVAL).unwrap_or(false) {
nothing_to_do_since = None;
debug!("({}) Adding full scan to syncer todo list", self.table.name);
self.add_full_scan().await;
}
}
}
}
Ok(())
}
pub async fn add_full_scan(&self) {
self.todo.lock().unwrap().add_full_scan(&self.table);
}
async fn syncer_task(
self: Arc<Self>,
mut must_exit: watch::Receiver<bool>,
busy_tx: mpsc::UnboundedSender<bool>,
) -> Result<(), Error> {
while !*must_exit.borrow() {
let task = self.todo.lock().unwrap().pop_task();
if let Some(partition) = task {
busy_tx.send(true)?;
let res = self
.clone()
.sync_partition(&partition, &mut must_exit)
.await;
if let Err(e) = res {
warn!(
"({}) Error while syncing {:?}: {}",
self.table.name, partition, e
);
}
} else {
busy_tx.send(false)?;
tokio::time::delay_for(Duration::from_secs(1)).await;
}
}
Ok(())
}
async fn sync_partition(
self: Arc<Self>,
partition: &TodoPartition,
must_exit: &mut watch::Receiver<bool>,
) -> Result<(), Error> {
if partition.retain {
let my_id = self.table.system.id;
let nodes = self
.table
.replication
.write_nodes(&partition.begin, &self.table.system)
.into_iter()
.filter(|node| *node != my_id)
.collect::<Vec<_>>();
debug!(
"({}) Preparing to sync {:?} with {:?}...",
self.table.name, partition, nodes
);
let root_cks = self.root_checksum(&partition.begin, &partition.end, must_exit)?;
let mut sync_futures = nodes
.iter()
.map(|node| {
self.clone().do_sync_with(
partition.clone(),
root_cks.clone(),
*node,
must_exit.clone(),
)
})
.collect::<FuturesUnordered<_>>();
let mut n_errors = 0;
while let Some(r) = sync_futures.next().await {
if let Err(e) = r {
n_errors += 1;
warn!("({}) Sync error: {}", self.table.name, e);
}
}
if n_errors > self.table.replication.max_write_errors() {
return Err(Error::Message(format!(
"Sync failed with too many nodes (should have been: {:?}).",
nodes
)));
}
} else {
self.offload_partition(&partition.begin, &partition.end, must_exit)
.await?;
}
Ok(())
}
// Offload partition: this partition is not something we are storing,
// so send it out to all other nodes that store it and delete items locally.
// We don't bother checking if the remote nodes already have the items,
// we just batch-send everything. Offloading isn't supposed to happen very often.
// If any of the nodes that are supposed to store the items is unable to
// save them, we interrupt the process.
async fn offload_partition(
self: &Arc<Self>,
begin: &Hash,
end: &Hash,
must_exit: &mut watch::Receiver<bool>,
) -> Result<(), Error> {
let mut counter: usize = 0;
while !*must_exit.borrow() {
let mut items = Vec::new();
for item in self.table.store.range(begin.to_vec()..end.to_vec()) {
let (key, value) = item?;
items.push((key.to_vec(), Arc::new(ByteBuf::from(value.as_ref()))));
if items.len() >= 1024 {
break;
}
}
if items.len() > 0 {
let nodes = self
.table
.replication
.write_nodes(&begin, &self.table.system)
.into_iter()
.collect::<Vec<_>>();
if nodes.contains(&self.table.system.id) {
warn!("Interrupting offload as partitions seem to have changed");
break;
}
counter += 1;
debug!(
"Offloading {} items from {:?}..{:?} ({})",
items.len(),
begin,
end,
counter
);
self.offload_items(&items, &nodes[..]).await?;
} else {
break;
}
}
Ok(())
}
async fn offload_items(
self: &Arc<Self>,
items: &Vec<(Vec<u8>, Arc<ByteBuf>)>,
nodes: &[UUID],
) -> Result<(), Error> {
let values = items.iter().map(|(_k, v)| v.clone()).collect::<Vec<_>>();
let update_msg = Arc::new(TableRPC::<F>::Update(values));
for res in join_all(nodes.iter().map(|to| {
self.table
.rpc_client
.call_arc(*to, update_msg.clone(), TABLE_SYNC_RPC_TIMEOUT)
}))
.await
{
res?;
}
// All remote nodes have written those items, now we can delete them locally
let mut not_removed = 0;
for (k, v) in items.iter() {
if !self.table.delete_if_equal(&k[..], &v[..])? {
not_removed += 1;
}
}
if not_removed > 0 {
debug!("{} items not removed during offload because they changed in between (trying again...)", not_removed);
}
Ok(())
}
fn root_checksum(
self: &Arc<Self>,
begin: &Hash,
end: &Hash,
must_exit: &mut watch::Receiver<bool>,
) -> Result<RangeChecksum, Error> {
for i in 1..MAX_DEPTH {
let rc = self.range_checksum(
&SyncRange {
begin: begin.to_vec(),
end: end.to_vec(),
level: i,
},
must_exit,
)?;
if rc.found_limit.is_none() {
return Ok(rc);
}
}
Err(Error::Message(format!(
"Unable to compute root checksum (this should never happen)"
)))
}
fn range_checksum(
self: &Arc<Self>,
range: &SyncRange,
must_exit: &mut watch::Receiver<bool>,
) -> Result<RangeChecksum, Error> {
assert!(range.level != 0);
trace!("Call range_checksum {:?}", range);
if range.level == 1 {
let mut children = vec![];
for item in self
.table
.store
.range(range.begin.clone()..range.end.clone())
{
let (key, value) = item?;
let key_hash = blake2sum(&key[..]);
if children.len() > 0
&& key_hash.as_slice()[0..range.level]
.iter()
.all(|x| *x == 0u8)
{
trace!(
"range_checksum {:?} returning {} items",
range,
children.len()
);
return Ok(RangeChecksum {
bounds: range.clone(),
children,
found_limit: Some(key.to_vec()),
time: Instant::now(),
});
}
let item_range = SyncRange {
begin: key.to_vec(),
end: vec![],
level: 0,
};
children.push((item_range, blake2sum(&value[..])));
}
trace!(
"range_checksum {:?} returning {} items",
range,
children.len()
);
Ok(RangeChecksum {
bounds: range.clone(),
children,
found_limit: None,
time: Instant::now(),
})
} else {
let mut children = vec![];
let mut sub_range = SyncRange {
begin: range.begin.clone(),
end: range.end.clone(),
level: range.level - 1,
};
let mut time = Instant::now();
while !*must_exit.borrow() {
let sub_ck = self.range_checksum_cached_hash(&sub_range, must_exit)?;
if let Some(hash) = sub_ck.hash {
children.push((sub_range.clone(), hash));
if sub_ck.time < time {
time = sub_ck.time;
}
}
if sub_ck.found_limit.is_none() || sub_ck.hash.is_none() {
trace!(
"range_checksum {:?} returning {} items",
range,
children.len()
);
return Ok(RangeChecksum {
bounds: range.clone(),
children,
found_limit: None,
time,
});
}
let found_limit = sub_ck.found_limit.unwrap();
let actual_limit_hash = blake2sum(&found_limit[..]);
if actual_limit_hash.as_slice()[0..range.level]
.iter()
.all(|x| *x == 0u8)
{
trace!(
"range_checksum {:?} returning {} items",
range,
children.len()
);
return Ok(RangeChecksum {
bounds: range.clone(),
children,
found_limit: Some(found_limit.clone()),
time,
});
}
sub_range.begin = found_limit;
}
trace!("range_checksum {:?} exiting due to must_exit", range);
Err(Error::Message(format!("Exiting.")))
}
}
fn range_checksum_cached_hash(
self: &Arc<Self>,
range: &SyncRange,
must_exit: &mut watch::Receiver<bool>,
) -> Result<RangeChecksumCache, Error> {
{
let mut cache = self.cache[range.level].lock().unwrap();
if let Some(v) = cache.get(&range) {
if Instant::now() - v.time < CHECKSUM_CACHE_TIMEOUT {
return Ok(v.clone());
}
}
cache.remove(&range);
}
let v = self.range_checksum(&range, must_exit)?;
trace!(
"({}) New checksum calculated for {}-{}/{}, {} children",
self.table.name,
hex::encode(&range.begin)
.chars()
.take(16)
.collect::<String>(),
hex::encode(&range.end).chars().take(16).collect::<String>(),
range.level,
v.children.len()
);
let hash = if v.children.len() > 0 {
Some(blake2sum(&rmp_to_vec_all_named(&v)?[..]))
} else {
None
};
let cache_entry = RangeChecksumCache {
hash,
found_limit: v.found_limit,
time: v.time,
};
let mut cache = self.cache[range.level].lock().unwrap();
cache.insert(range.clone(), cache_entry.clone());
Ok(cache_entry)
}
async fn do_sync_with(
self: Arc<Self>,
partition: TodoPartition,
root_ck: RangeChecksum,
who: UUID,
mut must_exit: watch::Receiver<bool>,
) -> Result<(), Error> {
let mut todo = VecDeque::new();
// If their root checksum has level > than us, use that as a reference
let root_cks_resp = self
.table
.rpc_client
.call(
who,
TableRPC::<F>::SyncRPC(SyncRPC::GetRootChecksumRange(
partition.begin.clone(),
partition.end.clone(),
)),
TABLE_SYNC_RPC_TIMEOUT,
)
.await?;
if let TableRPC::<F>::SyncRPC(SyncRPC::RootChecksumRange(range)) = root_cks_resp {
if range.level > root_ck.bounds.level {
let their_root_range_ck = self.range_checksum(&range, &mut must_exit)?;
todo.push_back(their_root_range_ck);
} else {
todo.push_back(root_ck);
}
} else {
return Err(Error::Message(format!(
"Invalid respone to GetRootChecksumRange RPC: {}",
debug_serialize(root_cks_resp)
)));
}
while !todo.is_empty() && !*must_exit.borrow() {
let total_children = todo.iter().map(|x| x.children.len()).fold(0, |x, y| x + y);
trace!(
"({}) Sync with {:?}: {} ({}) remaining",
self.table.name,
who,
todo.len(),
total_children
);
let step_size = std::cmp::min(16, todo.len());
let step = todo.drain(..step_size).collect::<Vec<_>>();
let rpc_resp = self
.table
.rpc_client
.call(
who,
TableRPC::<F>::SyncRPC(SyncRPC::Checksums(step)),
TABLE_SYNC_RPC_TIMEOUT,
)
.await?;
if let TableRPC::<F>::SyncRPC(SyncRPC::Difference(mut diff_ranges, diff_items)) =
rpc_resp
{
if diff_ranges.len() > 0 || diff_items.len() > 0 {
info!(
"({}) Sync with {:?}: difference {} ranges, {} items",
self.table.name,
who,
diff_ranges.len(),
diff_items.len()
);
}
let mut items_to_send = vec![];
for differing in diff_ranges.drain(..) {
if differing.level == 0 {
items_to_send.push(differing.begin);
} else {
let checksum = self.range_checksum(&differing, &mut must_exit)?;
todo.push_back(checksum);
}
}
if diff_items.len() > 0 {
self.table.handle_update(&diff_items[..]).await?;
}
if items_to_send.len() > 0 {
self.send_items(who, items_to_send).await?;
}
} else {
return Err(Error::Message(format!(
"Unexpected response to sync RPC checksums: {}",
debug_serialize(&rpc_resp)
)));
}
}
Ok(())
}
async fn send_items(&self, who: UUID, item_list: Vec<Vec<u8>>) -> Result<(), Error> {
info!(
"({}) Sending {} items to {:?}",
self.table.name,
item_list.len(),
who
);
let mut values = vec![];
for item in item_list.iter() {
if let Some(v) = self.table.store.get(&item[..])? {
values.push(Arc::new(ByteBuf::from(v.as_ref())));
}
}
let rpc_resp = self
.table
.rpc_client
.call(who, TableRPC::<F>::Update(values), TABLE_SYNC_RPC_TIMEOUT)
.await?;
if let TableRPC::<F>::Ok = rpc_resp {
Ok(())
} else {
Err(Error::Message(format!(
"Unexpected response to RPC Update: {}",
debug_serialize(&rpc_resp)
)))
}
}
pub(crate) async fn handle_rpc(
self: &Arc<Self>,
message: &SyncRPC,
mut must_exit: watch::Receiver<bool>,
) -> Result<SyncRPC, Error> {
match message {
SyncRPC::GetRootChecksumRange(begin, end) => {
let root_cks = self.root_checksum(&begin, &end, &mut must_exit)?;
Ok(SyncRPC::RootChecksumRange(root_cks.bounds))
}
SyncRPC::Checksums(checksums) => {
self.handle_checksums_rpc(&checksums[..], &mut must_exit)
.await
}
_ => Err(Error::Message(format!("Unexpected sync RPC"))),
}
}
async fn handle_checksums_rpc(
self: &Arc<Self>,
checksums: &[RangeChecksum],
must_exit: &mut watch::Receiver<bool>,
) -> Result<SyncRPC, Error> {
let mut ret_ranges = vec![];
let mut ret_items = vec![];
for their_ckr in checksums.iter() {
let our_ckr = self.range_checksum(&their_ckr.bounds, must_exit)?;
for (their_range, their_hash) in their_ckr.children.iter() {
let differs = match our_ckr
.children
.binary_search_by(|(our_range, _)| our_range.cmp(&their_range))
{
Err(_) => {
if their_range.level >= 1 {
let cached_hash =
self.range_checksum_cached_hash(&their_range, must_exit)?;
cached_hash.hash.map(|h| h != *their_hash).unwrap_or(true)
} else {
true
}
}
Ok(i) => our_ckr.children[i].1 != *their_hash,
};
if differs {
ret_ranges.push(their_range.clone());
if their_range.level == 0 {
if let Some(item_bytes) =
self.table.store.get(their_range.begin.as_slice())?
{
ret_items.push(Arc::new(ByteBuf::from(item_bytes.to_vec())));
}
}
}
}
for (our_range, _hash) in our_ckr.children.iter() {
if let Some(their_found_limit) = &their_ckr.found_limit {
if our_range.begin.as_slice() > their_found_limit.as_slice() {
break;
}
}
let not_present = our_ckr
.children
.binary_search_by(|(their_range, _)| their_range.cmp(&our_range))
.is_err();
if not_present {
if our_range.level > 0 {
ret_ranges.push(our_range.clone());
}
if our_range.level == 0 {
if let Some(item_bytes) =
self.table.store.get(our_range.begin.as_slice())?
{
ret_items.push(Arc::new(ByteBuf::from(item_bytes.to_vec())));
}
}
}
}
}
let n_checksums = checksums
.iter()
.map(|x| x.children.len())
.fold(0, |x, y| x + y);
if ret_ranges.len() > 0 || ret_items.len() > 0 {
trace!(
"({}) Checksum comparison RPC: {} different + {} items for {} received",
self.table.name,
ret_ranges.len(),
ret_items.len(),
n_checksums
);
}
Ok(SyncRPC::Difference(ret_ranges, ret_items))
}
pub(crate) fn invalidate(self: &Arc<Self>, item_key: &[u8]) {
for i in 1..MAX_DEPTH {
let needle = SyncRange {
begin: item_key.to_vec(),
end: vec![],
level: i,
};
let mut cache = self.cache[i].lock().unwrap();
if let Some(cache_entry) = cache.range(..=needle).rev().next() {
if cache_entry.0.begin[..] <= *item_key && cache_entry.0.end[..] > *item_key {
let index = cache_entry.0.clone();
drop(cache_entry);
cache.remove(&index);
}
}
}
}
}
impl SyncTodo {
fn add_full_scan<F: TableSchema, R: TableReplication>(&mut self, table: &Table<F, R>) {
let my_id = table.system.id;
self.todo.clear();
let ring = table.system.ring.borrow().clone();
let split_points = table.replication.split_points(&ring);
for i in 0..split_points.len() - 1 {
let begin = split_points[i];
let end = split_points[i + 1];
let nodes = table.replication.replication_nodes(&begin, &ring);
let retain = nodes.contains(&my_id);
if !retain {
// Check if we have some data to send, otherwise skip
if table.store.range(begin..end).next().is_none() {
continue;
}
}
self.todo.push(TodoPartition { begin, end, retain });
}
}
fn add_ring_difference<F: TableSchema, R: TableReplication>(
&mut self,
table: &Table<F, R>,
old_ring: &Ring,
new_ring: &Ring,
) {
let my_id = table.system.id;
// If it is us who are entering or leaving the system,
// initiate a full sync instead of incremental sync
if old_ring.config.members.contains_key(&my_id)
!= new_ring.config.members.contains_key(&my_id)
{
self.add_full_scan(table);
return;
}
let mut all_points = None
.into_iter()
.chain(table.replication.split_points(old_ring).drain(..))
.chain(table.replication.split_points(new_ring).drain(..))
.chain(self.todo.iter().map(|x| x.begin))
.chain(self.todo.iter().map(|x| x.end))
.collect::<Vec<_>>();
all_points.sort();
all_points.dedup();
let mut old_todo = std::mem::replace(&mut self.todo, vec![]);
old_todo.sort_by(|x, y| x.begin.cmp(&y.begin));
let mut new_todo = vec![];
for i in 0..all_points.len() - 1 {
let begin = all_points[i];
let end = all_points[i + 1];
let was_ours = table
.replication
.replication_nodes(&begin, &old_ring)
.contains(&my_id);
let is_ours = table
.replication
.replication_nodes(&begin, &new_ring)
.contains(&my_id);
let was_todo = match old_todo.binary_search_by(|x| x.begin.cmp(&begin)) {
Ok(_) => true,
Err(j) => {
(j > 0 && old_todo[j - 1].begin < end && begin < old_todo[j - 1].end)
|| (j < old_todo.len()
&& old_todo[j].begin < end && begin < old_todo[j].end)
}
};
if was_todo || (is_ours && !was_ours) || (was_ours && !is_ours) {
new_todo.push(TodoPartition {
begin,
end,
retain: is_ours,
});
}
}
self.todo = new_todo;
}
fn pop_task(&mut self) -> Option<TodoPartition> {
if self.todo.is_empty() {
return None;
}
let i = rand::thread_rng().gen_range::<usize, _, _>(0, self.todo.len());
if i == self.todo.len() - 1 {
self.todo.pop()
} else {
let replacement = self.todo.pop().unwrap();
let ret = std::mem::replace(&mut self.todo[i], replacement);
Some(ret)
}
}
}

View file

@ -13,29 +13,26 @@ path = "lib.rs"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies] [dependencies]
rand = "0.7" rand = "0.8"
hex = "0.3" hex = "0.4"
sha2 = "0.8" sha2 = "0.9"
blake2 = "0.9" blake2 = "0.9"
err-derive = "0.2.3" err-derive = "0.3"
log = "0.4" log = "0.4"
fasthash = "0.4" fasthash = "0.4"
sled = "0.34" sled = "0.34"
toml = "0.5" toml = "0.5"
rmp-serde = "0.14.3" rmp-serde = "0.15"
serde = { version = "1.0", default-features = false, features = ["derive", "rc"] } serde = { version = "1.0", default-features = false, features = ["derive", "rc"] }
serde_json = "1.0" serde_json = "1.0"
chrono = "0.4"
futures = "0.3" futures = "0.3"
futures-util = "0.3" tokio = { version = "1.0", default-features = false, features = ["rt", "rt-multi-thread", "io-util", "net", "time", "macros", "sync", "signal", "fs"] }
tokio = { version = "0.2", default-features = false, features = ["rt-core", "rt-threaded", "io-driver", "net", "tcp", "time", "macros", "sync", "signal", "fs"] }
http = "0.2" http = "0.2"
hyper = "0.13" hyper = "0.14"
rustls = "0.17" rustls = "0.19"
webpki = "0.21" webpki = "0.21"
roxmltree = "0.11"

View file

@ -1,12 +1,11 @@
use core::future::Future; use core::future::Future;
use std::pin::Pin; use std::pin::Pin;
use futures::future::join_all;
use futures::select;
use futures_util::future::*;
use std::sync::Arc; use std::sync::Arc;
use tokio::sync::Mutex; use std::time::Duration;
use tokio::sync::{mpsc, watch, Notify};
use futures::future::*;
use futures::select;
use tokio::sync::{mpsc, watch, Mutex};
use crate::error::Error; use crate::error::Error;
@ -14,54 +13,106 @@ type JobOutput = Result<(), Error>;
type Job = Pin<Box<dyn Future<Output = JobOutput> + Send>>; type Job = Pin<Box<dyn Future<Output = JobOutput> + Send>>;
pub struct BackgroundRunner { pub struct BackgroundRunner {
n_runners: usize,
pub stop_signal: watch::Receiver<bool>, pub stop_signal: watch::Receiver<bool>,
queue_in: mpsc::UnboundedSender<(Job, bool)>, queue_in: mpsc::UnboundedSender<(Job, bool)>,
queue_out: Mutex<mpsc::UnboundedReceiver<(Job, bool)>>, worker_in: mpsc::UnboundedSender<tokio::task::JoinHandle<()>>,
job_notify: Notify,
workers: Mutex<Vec<tokio::task::JoinHandle<()>>>,
} }
impl BackgroundRunner { impl BackgroundRunner {
pub fn new(n_runners: usize, stop_signal: watch::Receiver<bool>) -> Arc<Self> { pub fn new(
n_runners: usize,
stop_signal: watch::Receiver<bool>,
) -> (Arc<Self>, tokio::task::JoinHandle<()>) {
let (worker_in, mut worker_out) = mpsc::unbounded_channel();
let stop_signal_2 = stop_signal.clone();
let await_all_done = tokio::spawn(async move {
loop {
let wkr = {
select! {
item = worker_out.recv().fuse() => {
match item {
Some(x) => x,
None => break,
}
}
_ = tokio::time::sleep(Duration::from_secs(5)).fuse() => {
if *stop_signal_2.borrow() {
break;
} else {
continue;
}
}
}
};
if let Err(e) = wkr.await {
error!("Error while awaiting for worker: {}", e);
}
}
});
let (queue_in, queue_out) = mpsc::unbounded_channel(); let (queue_in, queue_out) = mpsc::unbounded_channel();
Arc::new(Self { let queue_out = Arc::new(Mutex::new(queue_out));
n_runners,
for i in 0..n_runners {
let queue_out = queue_out.clone();
let stop_signal = stop_signal.clone();
worker_in
.send(tokio::spawn(async move {
loop {
let (job, cancellable) = {
select! {
item = wait_job(&queue_out).fuse() => match item {
// We received a task, process it
Some(x) => x,
// We received a signal that no more tasks will ever be sent
// because the sending side was dropped. Exit now.
None => break,
},
_ = tokio::time::sleep(Duration::from_secs(5)).fuse() => {
if *stop_signal.borrow() {
// Nothing has been going on for 5 secs, and we are shutting
// down. Exit now.
break;
} else {
// Nothing is going on but we don't want to exit.
continue;
}
}
}
};
if cancellable && *stop_signal.borrow() {
continue;
}
if let Err(e) = job.await {
error!("Job failed: {}", e)
}
}
info!("Background worker {} exiting", i);
}))
.unwrap();
}
let bgrunner = Arc::new(Self {
stop_signal, stop_signal,
queue_in, queue_in,
queue_out: Mutex::new(queue_out), worker_in,
job_notify: Notify::new(), });
workers: Mutex::new(Vec::new()), (bgrunner, await_all_done)
})
}
pub async fn run(self: Arc<Self>) {
let mut workers = self.workers.lock().await;
for i in 0..self.n_runners {
workers.push(tokio::spawn(self.clone().runner(i)));
}
drop(workers);
let mut stop_signal = self.stop_signal.clone();
while let Some(exit_now) = stop_signal.recv().await {
if exit_now {
let mut workers = self.workers.lock().await;
let workers_vec = workers.drain(..).collect::<Vec<_>>();
join_all(workers_vec).await;
return;
}
}
} }
// Spawn a task to be run in background
pub fn spawn<T>(&self, job: T) pub fn spawn<T>(&self, job: T)
where where
T: Future<Output = JobOutput> + Send + 'static, T: Future<Output = JobOutput> + Send + 'static,
{ {
let boxed: Job = Box::pin(job); let boxed: Job = Box::pin(job);
let _: Result<_, _> = self.queue_in.clone().send((boxed, false)); self.queue_in
self.job_notify.notify(); .send((boxed, false))
.map_err(|_| "could not put job in queue")
.unwrap();
} }
pub fn spawn_cancellable<T>(&self, job: T) pub fn spawn_cancellable<T>(&self, job: T)
@ -69,56 +120,30 @@ impl BackgroundRunner {
T: Future<Output = JobOutput> + Send + 'static, T: Future<Output = JobOutput> + Send + 'static,
{ {
let boxed: Job = Box::pin(job); let boxed: Job = Box::pin(job);
let _: Result<_, _> = self.queue_in.clone().send((boxed, true)); self.queue_in
self.job_notify.notify(); .send((boxed, true))
.map_err(|_| "could not put job in queue")
.unwrap();
} }
pub async fn spawn_worker<F, T>(&self, name: String, worker: F) pub fn spawn_worker<F, T>(&self, name: String, worker: F)
where where
F: FnOnce(watch::Receiver<bool>) -> T + Send + 'static, F: FnOnce(watch::Receiver<bool>) -> T + Send + 'static,
T: Future<Output = JobOutput> + Send + 'static, T: Future<Output = ()> + Send + 'static,
{ {
let mut workers = self.workers.lock().await;
let stop_signal = self.stop_signal.clone(); let stop_signal = self.stop_signal.clone();
workers.push(tokio::spawn(async move { let task = tokio::spawn(async move {
if let Err(e) = worker(stop_signal).await { info!("Worker started: {}", name);
error!("Worker stopped with error: {}, error: {}", name, e); worker(stop_signal).await;
} else { info!("Worker exited: {}", name);
info!("Worker exited successfully: {}", name); });
} self.worker_in
})); .send(task)
} .map_err(|_| "could not put job in queue")
.unwrap();
async fn runner(self: Arc<Self>, i: usize) {
let mut stop_signal = self.stop_signal.clone();
loop {
let must_exit: bool = *stop_signal.borrow();
if let Some(job) = self.dequeue_job(must_exit).await {
if let Err(e) = job.await {
error!("Job failed: {}", e)
}
} else {
if must_exit {
info!("Background runner {} exiting", i);
return;
}
select! {
_ = self.job_notify.notified().fuse() => (),
_ = stop_signal.recv().fuse() => (),
}
}
}
}
async fn dequeue_job(&self, must_exit: bool) -> Option<Job> {
let mut queue = self.queue_out.lock().await;
while let Ok((job, cancellable)) = queue.try_recv() {
if cancellable && must_exit {
continue;
} else {
return Some(job);
}
}
None
} }
} }
async fn wait_job(q: &Mutex<mpsc::UnboundedReceiver<(Job, bool)>>) -> Option<(Job, bool)> {
q.lock().await.recv().await
}

View file

@ -2,7 +2,6 @@ use rand::Rng;
use serde::de::{self, Visitor}; use serde::de::{self, Visitor};
use serde::{Deserialize, Deserializer, Serialize, Serializer}; use serde::{Deserialize, Deserializer, Serialize, Serializer};
use std::fmt; use std::fmt;
use std::time::{SystemTime, UNIX_EPOCH};
#[derive(Default, PartialOrd, Ord, Clone, Hash, PartialEq, Copy)] #[derive(Default, PartialOrd, Ord, Clone, Hash, PartialEq, Copy)]
pub struct FixedBytes32([u8; 32]); pub struct FixedBytes32([u8; 32]);
@ -71,6 +70,14 @@ impl FixedBytes32 {
pub fn to_vec(&self) -> Vec<u8> { pub fn to_vec(&self) -> Vec<u8> {
self.0.to_vec() self.0.to_vec()
} }
pub fn try_from(by: &[u8]) -> Option<Self> {
if by.len() != 32 {
return None;
}
let mut ret = [0u8; 32];
ret.copy_from_slice(by);
Some(Self(ret))
}
} }
pub type UUID = FixedBytes32; pub type UUID = FixedBytes32;
@ -80,9 +87,9 @@ pub fn sha256sum(data: &[u8]) -> Hash {
use sha2::{Digest, Sha256}; use sha2::{Digest, Sha256};
let mut hasher = Sha256::new(); let mut hasher = Sha256::new();
hasher.input(data); hasher.update(data);
let mut hash = [0u8; 32]; let mut hash = [0u8; 32];
hash.copy_from_slice(&hasher.result()[..]); hash.copy_from_slice(&hasher.finalize()[..]);
hash.into() hash.into()
} }
@ -111,13 +118,6 @@ pub fn gen_uuid() -> UUID {
rand::thread_rng().gen::<[u8; 32]>().into() rand::thread_rng().gen::<[u8; 32]>().into()
} }
pub fn now_msec() -> u64 {
SystemTime::now()
.duration_since(UNIX_EPOCH)
.expect("Fix your clock :o")
.as_millis() as u64
}
// RMP serialization with names of fields and variants // RMP serialization with names of fields and variants
pub fn rmp_to_vec_all_named<T>(val: &T) -> Result<Vec<u8>, rmp_serde::encode::Error> pub fn rmp_to_vec_all_named<T>(val: &T) -> Result<Vec<u8>, rmp_serde::encode::Error>

View file

@ -8,16 +8,22 @@ use crate::data::*;
pub enum RPCError { pub enum RPCError {
#[error(display = "Node is down: {:?}.", _0)] #[error(display = "Node is down: {:?}.", _0)]
NodeDown(UUID), NodeDown(UUID),
#[error(display = "Timeout: {}", _0)] #[error(display = "Timeout: {}", _0)]
Timeout(#[error(source)] tokio::time::Elapsed), Timeout(#[error(source)] tokio::time::error::Elapsed),
#[error(display = "HTTP error: {}", _0)] #[error(display = "HTTP error: {}", _0)]
HTTP(#[error(source)] http::Error), HTTP(#[error(source)] http::Error),
#[error(display = "Hyper error: {}", _0)] #[error(display = "Hyper error: {}", _0)]
Hyper(#[error(source)] hyper::Error), Hyper(#[error(source)] hyper::Error),
#[error(display = "Messagepack encode error: {}", _0)] #[error(display = "Messagepack encode error: {}", _0)]
RMPEncode(#[error(source)] rmp_serde::encode::Error), RMPEncode(#[error(source)] rmp_serde::encode::Error),
#[error(display = "Messagepack decode error: {}", _0)] #[error(display = "Messagepack decode error: {}", _0)]
RMPDecode(#[error(source)] rmp_serde::decode::Error), RMPDecode(#[error(source)] rmp_serde::decode::Error),
#[error(display = "Too many errors: {:?}", _0)] #[error(display = "Too many errors: {:?}", _0)]
TooManyErrors(Vec<String>), TooManyErrors(Vec<String>),
} }

View file

@ -5,3 +5,4 @@ pub mod background;
pub mod config; pub mod config;
pub mod data; pub mod data;
pub mod error; pub mod error;
pub mod time;

16
src/util/time.rs Normal file
View file

@ -0,0 +1,16 @@
use chrono::{SecondsFormat, TimeZone, Utc};
use std::time::{SystemTime, UNIX_EPOCH};
pub fn now_msec() -> u64 {
SystemTime::now()
.duration_since(UNIX_EPOCH)
.expect("Fix your clock :o")
.as_millis() as u64
}
pub fn msec_to_rfc3339(msecs: u64) -> String {
let secs = msecs as i64 / 1000;
let nanos = (msecs as i64 % 1000) as u32 * 1_000_000;
let timestamp = Utc.timestamp(secs, nanos);
timestamp.to_rfc3339_opts(SecondsFormat::Secs, true)
}

View file

@ -18,11 +18,10 @@ garage_table = { version = "0.1.1", path = "../table" }
garage_model = { version = "0.1.1", path = "../model" } garage_model = { version = "0.1.1", path = "../model" }
garage_api = { version = "0.1.1", path = "../api" } garage_api = { version = "0.1.1", path = "../api" }
err-derive = "0.2.3" err-derive = "0.3"
log = "0.4" log = "0.4"
futures = "0.3" futures = "0.3"
http = "0.2" http = "0.2"
hyper = "0.13" hyper = "0.14"
percent-encoding = "2.1.0" percent-encoding = "2.1.0"
roxmltree = "0.11"
idna = "0.2" idna = "0.2"