summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSean Whitton <spwhitton@spwhitton.name>2020-02-04 17:04:58 -0700
committerSean Whitton <spwhitton@spwhitton.name>2020-02-04 17:04:58 -0700
commit4dc8c46c185a2e3233a955d251991312d2f03437 (patch)
tree35fc630c4540a1866821af93bfae9ca425a0233d
parentc1d65f03978ed8c133479ea184dce5e0dc0f8bda (diff)
downloadp5-Git-Annex-4dc8c46c185a2e3233a955d251991312d2f03437.tar.gz
add annex-to-annex{,-dropunused}
Signed-off-by: Sean Whitton <spwhitton@spwhitton.name>
-rw-r--r--README2
-rwxr-xr-xbin/annex-to-annex188
-rwxr-xr-xbin/annex-to-annex-dropunused72
-rwxr-xr-xt/21_annex-to-annex.t163
-rwxr-xr-xt/22_annex-to-annex-dropunused.t91
5 files changed, 516 insertions, 0 deletions
diff --git a/README b/README
index baa47b8..9a42826 100644
--- a/README
+++ b/README
@@ -8,6 +8,8 @@ Git::Annex -- class representing a git-annex repository
annex-review-unused -- interactively process `git annex unused` output
+annex-to-annex -- use hardlinks to migrate files between git annex repos
+
INSTALLATION
To install this module, run the following commands:
diff --git a/bin/annex-to-annex b/bin/annex-to-annex
new file mode 100755
index 0000000..0ebd771
--- /dev/null
+++ b/bin/annex-to-annex
@@ -0,0 +1,188 @@
+#!/usr/bin/perl
+
+# annex-to-annex -- use hardlinks to migrate files between git annex repos
+
+# Copyright (C) 2019-2020 Sean Whitton
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or (at
+# your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+=head1 NAME
+
+annex-to-annex - use hardlinks to migrate files between git annex repos
+
+=head1 SYNOPSIS
+
+
+
+=head1 DESCRIPTION
+
+This script moves files and directories from one or more git annexes
+into a destination git annex, using hardlinks rather than copying
+files where possible.
+
+It is useful for splitting and consolidating git annexes. For
+example, at the end of the semester I use this script to move files
+from my work annex, which gets synced to a lot of places, into an
+archival annex, which doesn't.
+
+=cut
+
+use 5.028;
+use strict;
+use warnings;
+
+use autodie;
+use Digest::MD5::File qw(file_md5);
+use File::Basename qw(dirname basename);
+use File::Copy;
+use File::Find;
+use File::Spec::Functions qw(catfile rel2abs abs2rel);
+use Try::Tiny;
+use Git::Annex;
+
+# only support v7 because supporting v5 too would make things quite
+# complex. require git-annex >=7.20191009 because it will refuse to
+# work in v5 repos, and because it supports `git annex find --unlocked`
+chomp(my %annex_version_fields = map { split ': ' } `git annex version`);
+die "I need git-annex >=7.20191009 and a v7 repository\n"
+ unless $annex_version_fields{'git-annex version'} >= 7.20191009;
+
+die "need at least two arguments\n" unless @ARGV > 1;
+my $dest = rel2abs pop @ARGV;
+die "dest is not a directory\n" unless -d $dest;
+my $dest_device_id = (stat($dest))[0];
+my $dannex = Git::Annex->new($dest);
+my $do_commit = 0;
+if ($ARGV[0] eq '--commit') {
+ $do_commit = 1;
+ shift @ARGV;
+
+ my @git_status = $dannex->git->RUN("status", { porcelain => 1 });
+ die "git repo containing $dest is not clean; please commit\n"
+ unless @git_status == 0;
+
+ #<<<
+ try {
+ $dannex->git->symbolic_ref({ quiet => 1 }, "HEAD");
+ } catch {
+ die "$dest has a detached HEAD; aborting";
+ };
+ #>>>
+}
+my @sources = map rel2abs($_), @ARGV;
+
+# process one entry in @sources at a time because we can start up
+# annex batch processes for each of these as all files under each
+# entry in @sources will lie in the same annex
+foreach my $source (@sources) {
+ my $dir = dirname $source;
+ my $annex = Git::Annex->new($dir);
+ #<<<
+ try {
+ $annex->git->annex("status");
+ } catch {
+ die "$source does not appear to lie within an annex\n";
+ };
+ #>>>
+ die "$source does not exist\n" unless -e $source;
+
+ if ($do_commit) {
+ my @git_status = $annex->git->RUN("status", { porcelain => 1 });
+ die "git repo containing $source is not clean; please commit\n"
+ unless @git_status == 0;
+
+ #<<<
+ try {
+ $annex->git->symbolic_ref({ quiet => 1 }, "HEAD");
+ } catch {
+ die "$dest has a detached HEAD; aborting";
+ };
+ #>>>
+ }
+
+ my $base = basename $source;
+ my @missing = $annex->git->annex("find", "--not", "--in", "here", $base);
+ if (@missing) {
+ say "Following annexed files are not present in this repo:";
+ say for @missing;
+ die "cannot continue; please `git-annex get` them\n";
+ }
+
+ # start batch processes
+ my $lk = $annex->batch("lookupkey");
+ my $cl = $annex->batch("contentlocation");
+ my $find = $annex->batch("find", "--unlocked");
+
+ find({
+ wanted => sub {
+ my $rel = abs2rel $File::Find::name, $dir;
+ my $target = catfile $dest, $rel;
+ die "$target already exists!\n" if -e $target and !-d $target;
+
+ my $key = $lk->ask($rel);
+ if ($key) { # this is an annexed file
+ my $content = rel2abs $cl->ask($key), $annex->toplevel;
+ my $content_device_id = (stat $content)[0];
+ if ($dest_device_id == $content_device_id) {
+ link $content, $target;
+ } else {
+ copy_and_md5($content, $target);
+ }
+ # add, and then maybe unlock. we don't use `-c
+ # annex.addunlocked=true` because we want to
+ # hardlink from .git/annex/objects in the source
+ # to .git/annex/objects in the dest, rather than
+ # having the unlocked copy in dest be hardlinked
+ # to the source, or anything like that
+ system "git", "-C", $dest, "annex", "add", $rel;
+ system "git", "-C", $dest, "annex", "unlock", $rel
+ if $find->ask($rel);
+
+ # if using the default backend, quick sanity check
+ if ($key =~ /^SHA256E-s[0-9]+--([0-9a-f]+)/) {
+ my $key_sum = $1;
+ chomp(my $dest_key
+ = `git -C "$dest" annex lookupkey "$rel"`);
+ if ($dest_key =~ /^SHA256E-s[0-9]+--([0-9a-f]+)/) {
+ my $dest_key_sum = $1;
+ die
+"git-annex calculated a different checksum for $target"
+ unless $key_sum eq $dest_key_sum;
+ }
+ }
+ } else { # this is not an annexed file
+ if (-d $File::Find::name) {
+ mkdir $target unless -d $target;
+ } else {
+ copy_and_md5($File::Find::name, $target);
+ system "git", "-C", $dest,
+ "-c", "annex.gitaddtoannex=false", "add", $rel;
+ }
+ }
+ $annex->git->rm($File::Find::name) unless -d $File::Find::name;
+ },
+ no_chdir => 1,
+ },
+ $source
+ );
+ $annex->git->commit({ message => "migrated by annex-to-annex" })
+ if $do_commit;
+}
+$dannex->git->commit({ message => "add" }) if $do_commit;
+
+sub copy_and_md5 {
+ copy($_[0], $_[1]);
+ die "md5 checksum failure after copying $_[0] to $_[1]!"
+ unless file_md5($_[0]) eq file_md5($_[1]);
+}
diff --git a/bin/annex-to-annex-dropunused b/bin/annex-to-annex-dropunused
new file mode 100755
index 0000000..33ddf03
--- /dev/null
+++ b/bin/annex-to-annex-dropunused
@@ -0,0 +1,72 @@
+#!/usr/bin/perl
+
+# annex-to-annex-dropunused -- drop old hardlinks migrated by annex-to-annex
+
+# Copyright (C) 2019-2020 Sean Whitton
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or (at
+# your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+=head1 NAME
+
+annex-to-annex-dropunused - drop old hardlinks migrated by annex-to-annex
+
+=head1 SYNOPSIS
+
+
+
+=head1 DESCRIPTION
+
+
+
+=cut
+
+use 5.028;
+use strict;
+use warnings;
+
+use autodie;
+use Git::Annex;
+
+# This script used to have a --dest option which specified the
+# destination annex previously used with annex-to-annex. Then, if the
+# unused file had a hardlink count of 1, but was present in the
+# destination annex, this script would drop it.
+#
+# That was somewhat dangerous functionality because it involves this
+# script running `git annex dropunused --force` for files with a
+# hardlink count of 1. And further, it is not actually needed,
+# because running annex-to-annex-reinject after
+# annex-to-annex-dropunused handles such files in a way that is safer.
+#
+# It is still good to run this script before annex-to-annex-reinject
+# to make the latter faster.
+
+my $annex = Git::Annex->new;
+
+my @to_drop;
+my @unused_files = grep { !$_->{bad} && !$_->{tmp} } @{ $annex->unused };
+
+foreach my $unused_file (@unused_files) {
+ my $content = $annex->abs_contentlocation($unused_file->{key});
+ my $link_count = (stat $content)[3];
+ my @logs
+ = $annex->git->log({ no_textconv => 1 }, "-S", $unused_file->{key});
+
+ next unless $logs[0] and $logs[0]->message =~ /migrated by annex-to-annex/;
+ next unless $link_count > 1;
+
+ push @to_drop, $unused_file->{number};
+}
+
+$annex->git->annex("dropunused", { force => 1 }, @to_drop);
diff --git a/t/21_annex-to-annex.t b/t/21_annex-to-annex.t
new file mode 100755
index 0000000..2666b4b
--- /dev/null
+++ b/t/21_annex-to-annex.t
@@ -0,0 +1,163 @@
+#!/usr/bin/perl
+
+use 5.028;
+use strict;
+use warnings;
+use lib 't/lib';
+
+use Test::More;
+use t::Setup;
+use t::Util;
+use File::Path qw(make_path);
+use File::Slurp;
+use Capture::Tiny qw(capture_merged);
+use File::Spec::Functions qw(catfile rel2abs);
+use File::chdir;
+
+# make sure that `make test` will always use the right version of the
+# script we seek to test
+#
+# TODO testing the development version is currently difficult because
+# the 'run' sub in this script seems to reset PATH. Right now the
+# only sure fire way to run these test against the development version
+# is `dzil test`, which runs everything
+my $a2a = "annex-to-annex";
+$a2a = rel2abs "blib/script/annex-to-annex" if -x "blib/script/annex-to-annex";
+
+my $exit;
+my @output;
+
+with_temp_annexes {
+ make_path "dest/foo/foo2";
+ write_file "dest/foo/bar", "";
+ write_file "dest/foo/foo2/baz", "";
+ ($exit, @output) = run($a2a, "source1/foo", "source2/other", "dest");
+ ok $exit, "it exits nonzero instead of clobbering an existing file";
+ ok grep(/\/dest\/foo\/bar already exists!$/, @output),
+ "it won't clobber an existing file";
+};
+
+with_temp_annexes {
+ write_file catfile("source1", "quux"), "quux\n";
+ ($exit, @output) = run($a2a, "--commit", "source1/foo/bar", "dest");
+ ok $exit,
+ "with --commit, it exits nonzero when uncommitted source changes";
+ ok grep(/^git repo containing [^ ]+\/bar is not clean; please commit$/,
+ @output),
+ "with --commit, it exits when uncommitted changes";
+};
+
+with_temp_annexes {
+ write_file catfile("dest", "quux"), "quux\n";
+ ($exit, @output) = run($a2a, "--commit", "source1/foo/bar", "dest");
+ ok $exit, "with --commit, it exits nonzero when uncommitted dest changes";
+ ok grep(/^git repo containing [^ ]+\/dest is not clean; please commit$/,
+ @output),
+ "with --commit, it exits when uncommitted changes";
+};
+
+# following test is sensitive to changes in git-log output, but that's
+# okay
+with_temp_annexes {
+ my (undef, $source1, $source2, $dest) = @_;
+
+ system $a2a, qw(--commit source1/foo source2/other dest);
+
+ @output = $source1->RUN(qw(log -1 --oneline --name-status));
+ like $output[0], qr/migrated by annex-to-annex/,
+ "--commit makes a source1 commit";
+ ok grep(m{^D\s+foo/bar$}, @output[1 .. $#output]),
+ "--commit commit deletes bar";
+ ok grep(m{^D\s+foo/foo2/baz$}, @output[1 .. $#output]),
+ "--commit commit deletes baz";
+
+ @output = $source2->RUN(qw(log -1 --oneline --name-status));
+ like $output[0], qr/migrated by annex-to-annex/,
+ "--commit makes a source2 commit";
+ ok grep(m{^D\s+other$}, @output[1 .. $#output]),
+ "--commit commit deletes other";
+
+ @output = $dest->RUN(qw(log -1 --oneline --name-status));
+ like $output[0], qr/add/, "--commit makes a dest commit";
+ ok grep(m{^A\s+other$}, @output[1 .. $#output]),
+ "--commit commit adds other";
+};
+
+with_temp_annexes {
+ my (undef, $source1) = @_;
+
+ corrupt_annexed_file $source1, "foo/foo2/baz";
+ ($exit, @output)
+ = run($a2a, "--commit", "source1/foo", "source2/other", "dest");
+ ok $exit, "it exits nonzero when dest annex calculates a diff checksum";
+ ok grep(/git-annex calculated a different checksum for/, @output),
+ "it warns when dest annex calculates a diff checksum";
+};
+
+with_temp_annexes {
+ my (undef, $source1) = @_;
+
+ $source1->annex(qw(drop --force foo/foo2/baz));
+ ($exit, @output)
+ = run($a2a, "--commit", "source1/foo", "source2/other", "dest");
+ ok $exit, "it exits nonzero when an annexed file is not present";
+ ok
+ grep(/^Following annexed files are not present in this repo:$/, @output),
+ "it exits when annexed files are not present";
+};
+
+# this is the main integration test for the script doing its job
+with_temp_annexes {
+ my (undef, $source1, $source2, $dest) = @_;
+ system $a2a, qw(source1/foo source2/other dest);
+ {
+ local $CWD = "source1";
+ ok !-e "foo/bar", "bar should not exist in source1";
+ ok !-e "foo/foo2/baz", "baz should not exist in source1";
+ }
+ ok !-e "source2/other", "other should not exist in source2";
+ {
+ local $CWD = "dest";
+
+ ok -f "foo/bar", "bar is regular file in dest";
+ ok -f "foo/foo2/baz", "baz is regular file in dest";
+ ok -l "other", "other is symlink in dest";
+ my @bar_find = $dest->annex(qw(find foo/bar));
+ ok @bar_find == 0, "bar is not annexed in dest";
+
+ my ($baz_key) = $dest->annex(qw(lookupkey foo/foo2/baz));
+ my ($baz_content) = $dest->annex("contentlocation", $baz_key);
+ my @baz_content_stat = stat $baz_content;
+ ok $baz_content_stat[3] == 2, "baz was hardlinked into annex";
+ ok((stat("foo/foo2/baz"))[3] == 1,
+ "baz in dest working tree is a copy");
+
+ my ($other_key) = $dest->annex(qw(lookupkey other));
+ my ($other_content) = $dest->annex("contentlocation", $other_key);
+ my @other_content_stat = stat $other_content;
+ ok $other_content_stat[3] == 2, "other was hardlinked into annex";
+
+ is read_file("foo/bar"), "bar\n", "bar has expected file content";
+ is read_file("foo/foo2/baz"), "baz\n", "baz has expected file content";
+ is read_file("other"), "other\n", "other has expected file content";
+ }
+ my @source1_git_status = $source1->RUN(qw(status --porcelain));
+ ok grep(/D\s+foo\/bar/, @source1_git_status), "bar was removed from git";
+ ok grep(/D\s+foo\/foo2\/baz/, @source1_git_status),
+ "baz was removed from git";
+ my @source2_git_status = $source2->RUN(qw(status --porcelain));
+ ok grep(/D\s+other/, @source2_git_status), "other was removed from git";
+ my @dest_git_status = $dest->RUN(qw(status --porcelain));
+ ok grep(/A\s+foo\/bar/, @dest_git_status), "bar was added to git";
+ ok grep(/A\s+foo\/foo2\/baz/, @dest_git_status), "baz was added to git";
+ ok grep(/A\s+other/, @dest_git_status), "other was added to git";
+};
+
+done_testing;
+
+sub run {
+ my @cmd = @_;
+ my ($exit, @output);
+ @output = split "\n", capture_merged { system @cmd; $exit = $? >> 8 };
+ return ($exit, @output);
+}
diff --git a/t/22_annex-to-annex-dropunused.t b/t/22_annex-to-annex-dropunused.t
new file mode 100755
index 0000000..88affd7
--- /dev/null
+++ b/t/22_annex-to-annex-dropunused.t
@@ -0,0 +1,91 @@
+#!/usr/bin/perl
+
+use 5.028;
+use strict;
+use warnings;
+use lib 't/lib';
+
+use Test::More;
+use File::Spec::Functions qw(rel2abs);
+use t::Setup;
+use File::chdir;
+use File::Basename qw(dirname);
+use File::Copy qw(copy);
+
+# make sure that `make test` will always use the right version of the
+# script we seek to test
+my $a2a = "annex-to-annex";
+my $a2a_du = "annex-to-annex-dropunused";
+$a2a = rel2abs "blib/script/annex-to-annex" if -x "blib/script/annex-to-annex";
+$a2a_du = rel2abs "blib/script/annex-to-annex-dropunused"
+ if -x "blib/script/annex-to-annex-dropunuesd";
+
+with_temp_annexes {
+ my (undef, undef, $source2) = @_;
+
+ system $a2a, qw(--commit source1/foo source2/other dest);
+
+ {
+ local $CWD = "source2";
+ system $a2a_du;
+ $source2->checkout("master~1");
+ ok((lstat "other" and not stat "other"), "other was dropped");
+ }
+};
+
+with_temp_annexes {
+ my (undef, undef, $source2) = @_;
+
+ system $a2a, qw(--commit source1/foo source2/other dest);
+
+ {
+ local $CWD = "source2";
+
+ $source2->checkout("master~1");
+ my ($other_key) = $source2->annex(qw(lookupkey other));
+ my ($other_content) = $source2->annex("contentlocation", $other_key);
+ $source2->checkout("master");
+
+ # break the hardlink
+ chmod 0755, dirname $other_content;
+ copy $other_content, "$other_content.tmp";
+ system "mv", "-f", "$other_content.tmp", $other_content;
+ chmod 0555, dirname $other_content;
+
+ system $a2a_du;
+ $source2->checkout("master~1");
+ ok((lstat "other" and stat "other"), "other was not dropped");
+ # $source2->checkout("master");
+ # system $a2a_du, "--dest=../dest";
+ # $source2->checkout("master~1");
+ # ok((lstat "other" and not stat "other"), "other was dropped");
+ }
+};
+
+# with_temp_annexes {
+# my (undef, undef, $source2, $dest) = @_;
+
+# system $a2a, qw(--commit source1/foo source2/other dest);
+
+# $dest->annex(qw(drop --force other));
+# {
+# local $CWD = "source2";
+
+# $source2->checkout("master~1");
+# my ($other_key) = $source2->annex(qw(lookupkey other));
+# my ($other_content) = $source2->annex("contentlocation", $other_key);
+# $source2->checkout("master");
+
+# # break the hardlink
+# chmod 0755, dirname $other_content;
+# copy $other_content, "$other_content.tmp";
+# system "mv", "-f", "$other_content.tmp", $other_content;
+# chmod 0555, dirname $other_content;
+
+# system $a2a_du, "--dest=../dest";
+# $source2->checkout("master~1");
+# ok((lstat "other" and stat "other"), "other was not dropped");
+# }
+# };
+
+done_testing;