From 4dc8c46c185a2e3233a955d251991312d2f03437 Mon Sep 17 00:00:00 2001 From: Sean Whitton Date: Tue, 4 Feb 2020 17:04:58 -0700 Subject: add annex-to-annex{,-dropunused} Signed-off-by: Sean Whitton --- README | 2 + bin/annex-to-annex | 188 +++++++++++++++++++++++++++++++++++++++ bin/annex-to-annex-dropunused | 72 +++++++++++++++ t/21_annex-to-annex.t | 163 +++++++++++++++++++++++++++++++++ t/22_annex-to-annex-dropunused.t | 91 +++++++++++++++++++ 5 files changed, 516 insertions(+) create mode 100755 bin/annex-to-annex create mode 100755 bin/annex-to-annex-dropunused create mode 100755 t/21_annex-to-annex.t create mode 100755 t/22_annex-to-annex-dropunused.t diff --git a/README b/README index baa47b8..9a42826 100644 --- a/README +++ b/README @@ -8,6 +8,8 @@ Git::Annex -- class representing a git-annex repository annex-review-unused -- interactively process `git annex unused` output +annex-to-annex -- use hardlinks to migrate files between git annex repos + INSTALLATION To install this module, run the following commands: diff --git a/bin/annex-to-annex b/bin/annex-to-annex new file mode 100755 index 0000000..0ebd771 --- /dev/null +++ b/bin/annex-to-annex @@ -0,0 +1,188 @@ +#!/usr/bin/perl + +# annex-to-annex -- use hardlinks to migrate files between git annex repos + +# Copyright (C) 2019-2020 Sean Whitton +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or (at +# your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +=head1 NAME + +annex-to-annex - use hardlinks to migrate files between git annex repos + +=head1 SYNOPSIS + + + +=head1 DESCRIPTION + +This script moves files and directories from one or more git annexes +into a destination git annex, using hardlinks rather than copying +files where possible. + +It is useful for splitting and consolidating git annexes. For +example, at the end of the semester I use this script to move files +from my work annex, which gets synced to a lot of places, into an +archival annex, which doesn't. + +=cut + +use 5.028; +use strict; +use warnings; + +use autodie; +use Digest::MD5::File qw(file_md5); +use File::Basename qw(dirname basename); +use File::Copy; +use File::Find; +use File::Spec::Functions qw(catfile rel2abs abs2rel); +use Try::Tiny; +use Git::Annex; + +# only support v7 because supporting v5 too would make things quite +# complex. require git-annex >=7.20191009 because it will refuse to +# work in v5 repos, and because it supports `git annex find --unlocked` +chomp(my %annex_version_fields = map { split ': ' } `git annex version`); +die "I need git-annex >=7.20191009 and a v7 repository\n" + unless $annex_version_fields{'git-annex version'} >= 7.20191009; + +die "need at least two arguments\n" unless @ARGV > 1; +my $dest = rel2abs pop @ARGV; +die "dest is not a directory\n" unless -d $dest; +my $dest_device_id = (stat($dest))[0]; +my $dannex = Git::Annex->new($dest); +my $do_commit = 0; +if ($ARGV[0] eq '--commit') { + $do_commit = 1; + shift @ARGV; + + my @git_status = $dannex->git->RUN("status", { porcelain => 1 }); + die "git repo containing $dest is not clean; please commit\n" + unless @git_status == 0; + + #<<< + try { + $dannex->git->symbolic_ref({ quiet => 1 }, "HEAD"); + } catch { + die "$dest has a detached HEAD; aborting"; + }; + #>>> +} +my @sources = map rel2abs($_), @ARGV; + +# process one entry in @sources at a time because we can start up +# annex batch processes for each of these as all files under each +# entry in @sources will lie in the same annex +foreach my $source (@sources) { + my $dir = dirname $source; + my $annex = Git::Annex->new($dir); + #<<< + try { + $annex->git->annex("status"); + } catch { + die "$source does not appear to lie within an annex\n"; + }; + #>>> + die "$source does not exist\n" unless -e $source; + + if ($do_commit) { + my @git_status = $annex->git->RUN("status", { porcelain => 1 }); + die "git repo containing $source is not clean; please commit\n" + unless @git_status == 0; + + #<<< + try { + $annex->git->symbolic_ref({ quiet => 1 }, "HEAD"); + } catch { + die "$dest has a detached HEAD; aborting"; + }; + #>>> + } + + my $base = basename $source; + my @missing = $annex->git->annex("find", "--not", "--in", "here", $base); + if (@missing) { + say "Following annexed files are not present in this repo:"; + say for @missing; + die "cannot continue; please `git-annex get` them\n"; + } + + # start batch processes + my $lk = $annex->batch("lookupkey"); + my $cl = $annex->batch("contentlocation"); + my $find = $annex->batch("find", "--unlocked"); + + find({ + wanted => sub { + my $rel = abs2rel $File::Find::name, $dir; + my $target = catfile $dest, $rel; + die "$target already exists!\n" if -e $target and !-d $target; + + my $key = $lk->ask($rel); + if ($key) { # this is an annexed file + my $content = rel2abs $cl->ask($key), $annex->toplevel; + my $content_device_id = (stat $content)[0]; + if ($dest_device_id == $content_device_id) { + link $content, $target; + } else { + copy_and_md5($content, $target); + } + # add, and then maybe unlock. we don't use `-c + # annex.addunlocked=true` because we want to + # hardlink from .git/annex/objects in the source + # to .git/annex/objects in the dest, rather than + # having the unlocked copy in dest be hardlinked + # to the source, or anything like that + system "git", "-C", $dest, "annex", "add", $rel; + system "git", "-C", $dest, "annex", "unlock", $rel + if $find->ask($rel); + + # if using the default backend, quick sanity check + if ($key =~ /^SHA256E-s[0-9]+--([0-9a-f]+)/) { + my $key_sum = $1; + chomp(my $dest_key + = `git -C "$dest" annex lookupkey "$rel"`); + if ($dest_key =~ /^SHA256E-s[0-9]+--([0-9a-f]+)/) { + my $dest_key_sum = $1; + die +"git-annex calculated a different checksum for $target" + unless $key_sum eq $dest_key_sum; + } + } + } else { # this is not an annexed file + if (-d $File::Find::name) { + mkdir $target unless -d $target; + } else { + copy_and_md5($File::Find::name, $target); + system "git", "-C", $dest, + "-c", "annex.gitaddtoannex=false", "add", $rel; + } + } + $annex->git->rm($File::Find::name) unless -d $File::Find::name; + }, + no_chdir => 1, + }, + $source + ); + $annex->git->commit({ message => "migrated by annex-to-annex" }) + if $do_commit; +} +$dannex->git->commit({ message => "add" }) if $do_commit; + +sub copy_and_md5 { + copy($_[0], $_[1]); + die "md5 checksum failure after copying $_[0] to $_[1]!" + unless file_md5($_[0]) eq file_md5($_[1]); +} diff --git a/bin/annex-to-annex-dropunused b/bin/annex-to-annex-dropunused new file mode 100755 index 0000000..33ddf03 --- /dev/null +++ b/bin/annex-to-annex-dropunused @@ -0,0 +1,72 @@ +#!/usr/bin/perl + +# annex-to-annex-dropunused -- drop old hardlinks migrated by annex-to-annex + +# Copyright (C) 2019-2020 Sean Whitton +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or (at +# your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +=head1 NAME + +annex-to-annex-dropunused - drop old hardlinks migrated by annex-to-annex + +=head1 SYNOPSIS + + + +=head1 DESCRIPTION + + + +=cut + +use 5.028; +use strict; +use warnings; + +use autodie; +use Git::Annex; + +# This script used to have a --dest option which specified the +# destination annex previously used with annex-to-annex. Then, if the +# unused file had a hardlink count of 1, but was present in the +# destination annex, this script would drop it. +# +# That was somewhat dangerous functionality because it involves this +# script running `git annex dropunused --force` for files with a +# hardlink count of 1. And further, it is not actually needed, +# because running annex-to-annex-reinject after +# annex-to-annex-dropunused handles such files in a way that is safer. +# +# It is still good to run this script before annex-to-annex-reinject +# to make the latter faster. + +my $annex = Git::Annex->new; + +my @to_drop; +my @unused_files = grep { !$_->{bad} && !$_->{tmp} } @{ $annex->unused }; + +foreach my $unused_file (@unused_files) { + my $content = $annex->abs_contentlocation($unused_file->{key}); + my $link_count = (stat $content)[3]; + my @logs + = $annex->git->log({ no_textconv => 1 }, "-S", $unused_file->{key}); + + next unless $logs[0] and $logs[0]->message =~ /migrated by annex-to-annex/; + next unless $link_count > 1; + + push @to_drop, $unused_file->{number}; +} + +$annex->git->annex("dropunused", { force => 1 }, @to_drop); diff --git a/t/21_annex-to-annex.t b/t/21_annex-to-annex.t new file mode 100755 index 0000000..2666b4b --- /dev/null +++ b/t/21_annex-to-annex.t @@ -0,0 +1,163 @@ +#!/usr/bin/perl + +use 5.028; +use strict; +use warnings; +use lib 't/lib'; + +use Test::More; +use t::Setup; +use t::Util; +use File::Path qw(make_path); +use File::Slurp; +use Capture::Tiny qw(capture_merged); +use File::Spec::Functions qw(catfile rel2abs); +use File::chdir; + +# make sure that `make test` will always use the right version of the +# script we seek to test +# +# TODO testing the development version is currently difficult because +# the 'run' sub in this script seems to reset PATH. Right now the +# only sure fire way to run these test against the development version +# is `dzil test`, which runs everything +my $a2a = "annex-to-annex"; +$a2a = rel2abs "blib/script/annex-to-annex" if -x "blib/script/annex-to-annex"; + +my $exit; +my @output; + +with_temp_annexes { + make_path "dest/foo/foo2"; + write_file "dest/foo/bar", ""; + write_file "dest/foo/foo2/baz", ""; + ($exit, @output) = run($a2a, "source1/foo", "source2/other", "dest"); + ok $exit, "it exits nonzero instead of clobbering an existing file"; + ok grep(/\/dest\/foo\/bar already exists!$/, @output), + "it won't clobber an existing file"; +}; + +with_temp_annexes { + write_file catfile("source1", "quux"), "quux\n"; + ($exit, @output) = run($a2a, "--commit", "source1/foo/bar", "dest"); + ok $exit, + "with --commit, it exits nonzero when uncommitted source changes"; + ok grep(/^git repo containing [^ ]+\/bar is not clean; please commit$/, + @output), + "with --commit, it exits when uncommitted changes"; +}; + +with_temp_annexes { + write_file catfile("dest", "quux"), "quux\n"; + ($exit, @output) = run($a2a, "--commit", "source1/foo/bar", "dest"); + ok $exit, "with --commit, it exits nonzero when uncommitted dest changes"; + ok grep(/^git repo containing [^ ]+\/dest is not clean; please commit$/, + @output), + "with --commit, it exits when uncommitted changes"; +}; + +# following test is sensitive to changes in git-log output, but that's +# okay +with_temp_annexes { + my (undef, $source1, $source2, $dest) = @_; + + system $a2a, qw(--commit source1/foo source2/other dest); + + @output = $source1->RUN(qw(log -1 --oneline --name-status)); + like $output[0], qr/migrated by annex-to-annex/, + "--commit makes a source1 commit"; + ok grep(m{^D\s+foo/bar$}, @output[1 .. $#output]), + "--commit commit deletes bar"; + ok grep(m{^D\s+foo/foo2/baz$}, @output[1 .. $#output]), + "--commit commit deletes baz"; + + @output = $source2->RUN(qw(log -1 --oneline --name-status)); + like $output[0], qr/migrated by annex-to-annex/, + "--commit makes a source2 commit"; + ok grep(m{^D\s+other$}, @output[1 .. $#output]), + "--commit commit deletes other"; + + @output = $dest->RUN(qw(log -1 --oneline --name-status)); + like $output[0], qr/add/, "--commit makes a dest commit"; + ok grep(m{^A\s+other$}, @output[1 .. $#output]), + "--commit commit adds other"; +}; + +with_temp_annexes { + my (undef, $source1) = @_; + + corrupt_annexed_file $source1, "foo/foo2/baz"; + ($exit, @output) + = run($a2a, "--commit", "source1/foo", "source2/other", "dest"); + ok $exit, "it exits nonzero when dest annex calculates a diff checksum"; + ok grep(/git-annex calculated a different checksum for/, @output), + "it warns when dest annex calculates a diff checksum"; +}; + +with_temp_annexes { + my (undef, $source1) = @_; + + $source1->annex(qw(drop --force foo/foo2/baz)); + ($exit, @output) + = run($a2a, "--commit", "source1/foo", "source2/other", "dest"); + ok $exit, "it exits nonzero when an annexed file is not present"; + ok + grep(/^Following annexed files are not present in this repo:$/, @output), + "it exits when annexed files are not present"; +}; + +# this is the main integration test for the script doing its job +with_temp_annexes { + my (undef, $source1, $source2, $dest) = @_; + system $a2a, qw(source1/foo source2/other dest); + { + local $CWD = "source1"; + ok !-e "foo/bar", "bar should not exist in source1"; + ok !-e "foo/foo2/baz", "baz should not exist in source1"; + } + ok !-e "source2/other", "other should not exist in source2"; + { + local $CWD = "dest"; + + ok -f "foo/bar", "bar is regular file in dest"; + ok -f "foo/foo2/baz", "baz is regular file in dest"; + ok -l "other", "other is symlink in dest"; + my @bar_find = $dest->annex(qw(find foo/bar)); + ok @bar_find == 0, "bar is not annexed in dest"; + + my ($baz_key) = $dest->annex(qw(lookupkey foo/foo2/baz)); + my ($baz_content) = $dest->annex("contentlocation", $baz_key); + my @baz_content_stat = stat $baz_content; + ok $baz_content_stat[3] == 2, "baz was hardlinked into annex"; + ok((stat("foo/foo2/baz"))[3] == 1, + "baz in dest working tree is a copy"); + + my ($other_key) = $dest->annex(qw(lookupkey other)); + my ($other_content) = $dest->annex("contentlocation", $other_key); + my @other_content_stat = stat $other_content; + ok $other_content_stat[3] == 2, "other was hardlinked into annex"; + + is read_file("foo/bar"), "bar\n", "bar has expected file content"; + is read_file("foo/foo2/baz"), "baz\n", "baz has expected file content"; + is read_file("other"), "other\n", "other has expected file content"; + } + my @source1_git_status = $source1->RUN(qw(status --porcelain)); + ok grep(/D\s+foo\/bar/, @source1_git_status), "bar was removed from git"; + ok grep(/D\s+foo\/foo2\/baz/, @source1_git_status), + "baz was removed from git"; + my @source2_git_status = $source2->RUN(qw(status --porcelain)); + ok grep(/D\s+other/, @source2_git_status), "other was removed from git"; + my @dest_git_status = $dest->RUN(qw(status --porcelain)); + ok grep(/A\s+foo\/bar/, @dest_git_status), "bar was added to git"; + ok grep(/A\s+foo\/foo2\/baz/, @dest_git_status), "baz was added to git"; + ok grep(/A\s+other/, @dest_git_status), "other was added to git"; +}; + +done_testing; + +sub run { + my @cmd = @_; + my ($exit, @output); + @output = split "\n", capture_merged { system @cmd; $exit = $? >> 8 }; + return ($exit, @output); +} diff --git a/t/22_annex-to-annex-dropunused.t b/t/22_annex-to-annex-dropunused.t new file mode 100755 index 0000000..88affd7 --- /dev/null +++ b/t/22_annex-to-annex-dropunused.t @@ -0,0 +1,91 @@ +#!/usr/bin/perl + +use 5.028; +use strict; +use warnings; +use lib 't/lib'; + +use Test::More; +use File::Spec::Functions qw(rel2abs); +use t::Setup; +use File::chdir; +use File::Basename qw(dirname); +use File::Copy qw(copy); + +# make sure that `make test` will always use the right version of the +# script we seek to test +my $a2a = "annex-to-annex"; +my $a2a_du = "annex-to-annex-dropunused"; +$a2a = rel2abs "blib/script/annex-to-annex" if -x "blib/script/annex-to-annex"; +$a2a_du = rel2abs "blib/script/annex-to-annex-dropunused" + if -x "blib/script/annex-to-annex-dropunuesd"; + +with_temp_annexes { + my (undef, undef, $source2) = @_; + + system $a2a, qw(--commit source1/foo source2/other dest); + + { + local $CWD = "source2"; + system $a2a_du; + $source2->checkout("master~1"); + ok((lstat "other" and not stat "other"), "other was dropped"); + } +}; + +with_temp_annexes { + my (undef, undef, $source2) = @_; + + system $a2a, qw(--commit source1/foo source2/other dest); + + { + local $CWD = "source2"; + + $source2->checkout("master~1"); + my ($other_key) = $source2->annex(qw(lookupkey other)); + my ($other_content) = $source2->annex("contentlocation", $other_key); + $source2->checkout("master"); + + # break the hardlink + chmod 0755, dirname $other_content; + copy $other_content, "$other_content.tmp"; + system "mv", "-f", "$other_content.tmp", $other_content; + chmod 0555, dirname $other_content; + + system $a2a_du; + $source2->checkout("master~1"); + ok((lstat "other" and stat "other"), "other was not dropped"); + # $source2->checkout("master"); + # system $a2a_du, "--dest=../dest"; + # $source2->checkout("master~1"); + # ok((lstat "other" and not stat "other"), "other was dropped"); + } +}; + +# with_temp_annexes { +# my (undef, undef, $source2, $dest) = @_; + +# system $a2a, qw(--commit source1/foo source2/other dest); + +# $dest->annex(qw(drop --force other)); +# { +# local $CWD = "source2"; + +# $source2->checkout("master~1"); +# my ($other_key) = $source2->annex(qw(lookupkey other)); +# my ($other_content) = $source2->annex("contentlocation", $other_key); +# $source2->checkout("master"); + +# # break the hardlink +# chmod 0755, dirname $other_content; +# copy $other_content, "$other_content.tmp"; +# system "mv", "-f", "$other_content.tmp", $other_content; +# chmod 0555, dirname $other_content; + +# system $a2a_du, "--dest=../dest"; +# $source2->checkout("master~1"); +# ok((lstat "other" and stat "other"), "other was not dropped"); +# } +# }; + +done_testing; -- cgit v1.2.3