summaryrefslogtreecommitdiff
path: root/bin
diff options
context:
space:
mode:
authorSean Whitton <spwhitton@spwhitton.name>2020-02-04 17:04:58 -0700
committerSean Whitton <spwhitton@spwhitton.name>2020-02-04 17:04:58 -0700
commit4dc8c46c185a2e3233a955d251991312d2f03437 (patch)
tree35fc630c4540a1866821af93bfae9ca425a0233d /bin
parentc1d65f03978ed8c133479ea184dce5e0dc0f8bda (diff)
downloadp5-Git-Annex-4dc8c46c185a2e3233a955d251991312d2f03437.tar.gz
add annex-to-annex{,-dropunused}
Signed-off-by: Sean Whitton <spwhitton@spwhitton.name>
Diffstat (limited to 'bin')
-rwxr-xr-xbin/annex-to-annex188
-rwxr-xr-xbin/annex-to-annex-dropunused72
2 files changed, 260 insertions, 0 deletions
diff --git a/bin/annex-to-annex b/bin/annex-to-annex
new file mode 100755
index 0000000..0ebd771
--- /dev/null
+++ b/bin/annex-to-annex
@@ -0,0 +1,188 @@
+#!/usr/bin/perl
+
+# annex-to-annex -- use hardlinks to migrate files between git annex repos
+
+# Copyright (C) 2019-2020 Sean Whitton
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or (at
+# your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+=head1 NAME
+
+annex-to-annex - use hardlinks to migrate files between git annex repos
+
+=head1 SYNOPSIS
+
+
+
+=head1 DESCRIPTION
+
+This script moves files and directories from one or more git annexes
+into a destination git annex, using hardlinks rather than copying
+files where possible.
+
+It is useful for splitting and consolidating git annexes. For
+example, at the end of the semester I use this script to move files
+from my work annex, which gets synced to a lot of places, into an
+archival annex, which doesn't.
+
+=cut
+
+use 5.028;
+use strict;
+use warnings;
+
+use autodie;
+use Digest::MD5::File qw(file_md5);
+use File::Basename qw(dirname basename);
+use File::Copy;
+use File::Find;
+use File::Spec::Functions qw(catfile rel2abs abs2rel);
+use Try::Tiny;
+use Git::Annex;
+
+# only support v7 because supporting v5 too would make things quite
+# complex. require git-annex >=7.20191009 because it will refuse to
+# work in v5 repos, and because it supports `git annex find --unlocked`
+chomp(my %annex_version_fields = map { split ': ' } `git annex version`);
+die "I need git-annex >=7.20191009 and a v7 repository\n"
+ unless $annex_version_fields{'git-annex version'} >= 7.20191009;
+
+die "need at least two arguments\n" unless @ARGV > 1;
+my $dest = rel2abs pop @ARGV;
+die "dest is not a directory\n" unless -d $dest;
+my $dest_device_id = (stat($dest))[0];
+my $dannex = Git::Annex->new($dest);
+my $do_commit = 0;
+if ($ARGV[0] eq '--commit') {
+ $do_commit = 1;
+ shift @ARGV;
+
+ my @git_status = $dannex->git->RUN("status", { porcelain => 1 });
+ die "git repo containing $dest is not clean; please commit\n"
+ unless @git_status == 0;
+
+ #<<<
+ try {
+ $dannex->git->symbolic_ref({ quiet => 1 }, "HEAD");
+ } catch {
+ die "$dest has a detached HEAD; aborting";
+ };
+ #>>>
+}
+my @sources = map rel2abs($_), @ARGV;
+
+# process one entry in @sources at a time because we can start up
+# annex batch processes for each of these as all files under each
+# entry in @sources will lie in the same annex
+foreach my $source (@sources) {
+ my $dir = dirname $source;
+ my $annex = Git::Annex->new($dir);
+ #<<<
+ try {
+ $annex->git->annex("status");
+ } catch {
+ die "$source does not appear to lie within an annex\n";
+ };
+ #>>>
+ die "$source does not exist\n" unless -e $source;
+
+ if ($do_commit) {
+ my @git_status = $annex->git->RUN("status", { porcelain => 1 });
+ die "git repo containing $source is not clean; please commit\n"
+ unless @git_status == 0;
+
+ #<<<
+ try {
+ $annex->git->symbolic_ref({ quiet => 1 }, "HEAD");
+ } catch {
+ die "$dest has a detached HEAD; aborting";
+ };
+ #>>>
+ }
+
+ my $base = basename $source;
+ my @missing = $annex->git->annex("find", "--not", "--in", "here", $base);
+ if (@missing) {
+ say "Following annexed files are not present in this repo:";
+ say for @missing;
+ die "cannot continue; please `git-annex get` them\n";
+ }
+
+ # start batch processes
+ my $lk = $annex->batch("lookupkey");
+ my $cl = $annex->batch("contentlocation");
+ my $find = $annex->batch("find", "--unlocked");
+
+ find({
+ wanted => sub {
+ my $rel = abs2rel $File::Find::name, $dir;
+ my $target = catfile $dest, $rel;
+ die "$target already exists!\n" if -e $target and !-d $target;
+
+ my $key = $lk->ask($rel);
+ if ($key) { # this is an annexed file
+ my $content = rel2abs $cl->ask($key), $annex->toplevel;
+ my $content_device_id = (stat $content)[0];
+ if ($dest_device_id == $content_device_id) {
+ link $content, $target;
+ } else {
+ copy_and_md5($content, $target);
+ }
+ # add, and then maybe unlock. we don't use `-c
+ # annex.addunlocked=true` because we want to
+ # hardlink from .git/annex/objects in the source
+ # to .git/annex/objects in the dest, rather than
+ # having the unlocked copy in dest be hardlinked
+ # to the source, or anything like that
+ system "git", "-C", $dest, "annex", "add", $rel;
+ system "git", "-C", $dest, "annex", "unlock", $rel
+ if $find->ask($rel);
+
+ # if using the default backend, quick sanity check
+ if ($key =~ /^SHA256E-s[0-9]+--([0-9a-f]+)/) {
+ my $key_sum = $1;
+ chomp(my $dest_key
+ = `git -C "$dest" annex lookupkey "$rel"`);
+ if ($dest_key =~ /^SHA256E-s[0-9]+--([0-9a-f]+)/) {
+ my $dest_key_sum = $1;
+ die
+"git-annex calculated a different checksum for $target"
+ unless $key_sum eq $dest_key_sum;
+ }
+ }
+ } else { # this is not an annexed file
+ if (-d $File::Find::name) {
+ mkdir $target unless -d $target;
+ } else {
+ copy_and_md5($File::Find::name, $target);
+ system "git", "-C", $dest,
+ "-c", "annex.gitaddtoannex=false", "add", $rel;
+ }
+ }
+ $annex->git->rm($File::Find::name) unless -d $File::Find::name;
+ },
+ no_chdir => 1,
+ },
+ $source
+ );
+ $annex->git->commit({ message => "migrated by annex-to-annex" })
+ if $do_commit;
+}
+$dannex->git->commit({ message => "add" }) if $do_commit;
+
+sub copy_and_md5 {
+ copy($_[0], $_[1]);
+ die "md5 checksum failure after copying $_[0] to $_[1]!"
+ unless file_md5($_[0]) eq file_md5($_[1]);
+}
diff --git a/bin/annex-to-annex-dropunused b/bin/annex-to-annex-dropunused
new file mode 100755
index 0000000..33ddf03
--- /dev/null
+++ b/bin/annex-to-annex-dropunused
@@ -0,0 +1,72 @@
+#!/usr/bin/perl
+
+# annex-to-annex-dropunused -- drop old hardlinks migrated by annex-to-annex
+
+# Copyright (C) 2019-2020 Sean Whitton
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or (at
+# your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+=head1 NAME
+
+annex-to-annex-dropunused - drop old hardlinks migrated by annex-to-annex
+
+=head1 SYNOPSIS
+
+
+
+=head1 DESCRIPTION
+
+
+
+=cut
+
+use 5.028;
+use strict;
+use warnings;
+
+use autodie;
+use Git::Annex;
+
+# This script used to have a --dest option which specified the
+# destination annex previously used with annex-to-annex. Then, if the
+# unused file had a hardlink count of 1, but was present in the
+# destination annex, this script would drop it.
+#
+# That was somewhat dangerous functionality because it involves this
+# script running `git annex dropunused --force` for files with a
+# hardlink count of 1. And further, it is not actually needed,
+# because running annex-to-annex-reinject after
+# annex-to-annex-dropunused handles such files in a way that is safer.
+#
+# It is still good to run this script before annex-to-annex-reinject
+# to make the latter faster.
+
+my $annex = Git::Annex->new;
+
+my @to_drop;
+my @unused_files = grep { !$_->{bad} && !$_->{tmp} } @{ $annex->unused };
+
+foreach my $unused_file (@unused_files) {
+ my $content = $annex->abs_contentlocation($unused_file->{key});
+ my $link_count = (stat $content)[3];
+ my @logs
+ = $annex->git->log({ no_textconv => 1 }, "-S", $unused_file->{key});
+
+ next unless $logs[0] and $logs[0]->message =~ /migrated by annex-to-annex/;
+ next unless $link_count > 1;
+
+ push @to_drop, $unused_file->{number};
+}
+
+$annex->git->annex("dropunused", { force => 1 }, @to_drop);