From 4dc8c46c185a2e3233a955d251991312d2f03437 Mon Sep 17 00:00:00 2001 From: Sean Whitton Date: Tue, 4 Feb 2020 17:04:58 -0700 Subject: add annex-to-annex{,-dropunused} Signed-off-by: Sean Whitton --- bin/annex-to-annex | 188 ++++++++++++++++++++++++++++++++++++++++++ bin/annex-to-annex-dropunused | 72 ++++++++++++++++ 2 files changed, 260 insertions(+) create mode 100755 bin/annex-to-annex create mode 100755 bin/annex-to-annex-dropunused (limited to 'bin') diff --git a/bin/annex-to-annex b/bin/annex-to-annex new file mode 100755 index 0000000..0ebd771 --- /dev/null +++ b/bin/annex-to-annex @@ -0,0 +1,188 @@ +#!/usr/bin/perl + +# annex-to-annex -- use hardlinks to migrate files between git annex repos + +# Copyright (C) 2019-2020 Sean Whitton +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or (at +# your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +=head1 NAME + +annex-to-annex - use hardlinks to migrate files between git annex repos + +=head1 SYNOPSIS + + + +=head1 DESCRIPTION + +This script moves files and directories from one or more git annexes +into a destination git annex, using hardlinks rather than copying +files where possible. + +It is useful for splitting and consolidating git annexes. For +example, at the end of the semester I use this script to move files +from my work annex, which gets synced to a lot of places, into an +archival annex, which doesn't. + +=cut + +use 5.028; +use strict; +use warnings; + +use autodie; +use Digest::MD5::File qw(file_md5); +use File::Basename qw(dirname basename); +use File::Copy; +use File::Find; +use File::Spec::Functions qw(catfile rel2abs abs2rel); +use Try::Tiny; +use Git::Annex; + +# only support v7 because supporting v5 too would make things quite +# complex. require git-annex >=7.20191009 because it will refuse to +# work in v5 repos, and because it supports `git annex find --unlocked` +chomp(my %annex_version_fields = map { split ': ' } `git annex version`); +die "I need git-annex >=7.20191009 and a v7 repository\n" + unless $annex_version_fields{'git-annex version'} >= 7.20191009; + +die "need at least two arguments\n" unless @ARGV > 1; +my $dest = rel2abs pop @ARGV; +die "dest is not a directory\n" unless -d $dest; +my $dest_device_id = (stat($dest))[0]; +my $dannex = Git::Annex->new($dest); +my $do_commit = 0; +if ($ARGV[0] eq '--commit') { + $do_commit = 1; + shift @ARGV; + + my @git_status = $dannex->git->RUN("status", { porcelain => 1 }); + die "git repo containing $dest is not clean; please commit\n" + unless @git_status == 0; + + #<<< + try { + $dannex->git->symbolic_ref({ quiet => 1 }, "HEAD"); + } catch { + die "$dest has a detached HEAD; aborting"; + }; + #>>> +} +my @sources = map rel2abs($_), @ARGV; + +# process one entry in @sources at a time because we can start up +# annex batch processes for each of these as all files under each +# entry in @sources will lie in the same annex +foreach my $source (@sources) { + my $dir = dirname $source; + my $annex = Git::Annex->new($dir); + #<<< + try { + $annex->git->annex("status"); + } catch { + die "$source does not appear to lie within an annex\n"; + }; + #>>> + die "$source does not exist\n" unless -e $source; + + if ($do_commit) { + my @git_status = $annex->git->RUN("status", { porcelain => 1 }); + die "git repo containing $source is not clean; please commit\n" + unless @git_status == 0; + + #<<< + try { + $annex->git->symbolic_ref({ quiet => 1 }, "HEAD"); + } catch { + die "$dest has a detached HEAD; aborting"; + }; + #>>> + } + + my $base = basename $source; + my @missing = $annex->git->annex("find", "--not", "--in", "here", $base); + if (@missing) { + say "Following annexed files are not present in this repo:"; + say for @missing; + die "cannot continue; please `git-annex get` them\n"; + } + + # start batch processes + my $lk = $annex->batch("lookupkey"); + my $cl = $annex->batch("contentlocation"); + my $find = $annex->batch("find", "--unlocked"); + + find({ + wanted => sub { + my $rel = abs2rel $File::Find::name, $dir; + my $target = catfile $dest, $rel; + die "$target already exists!\n" if -e $target and !-d $target; + + my $key = $lk->ask($rel); + if ($key) { # this is an annexed file + my $content = rel2abs $cl->ask($key), $annex->toplevel; + my $content_device_id = (stat $content)[0]; + if ($dest_device_id == $content_device_id) { + link $content, $target; + } else { + copy_and_md5($content, $target); + } + # add, and then maybe unlock. we don't use `-c + # annex.addunlocked=true` because we want to + # hardlink from .git/annex/objects in the source + # to .git/annex/objects in the dest, rather than + # having the unlocked copy in dest be hardlinked + # to the source, or anything like that + system "git", "-C", $dest, "annex", "add", $rel; + system "git", "-C", $dest, "annex", "unlock", $rel + if $find->ask($rel); + + # if using the default backend, quick sanity check + if ($key =~ /^SHA256E-s[0-9]+--([0-9a-f]+)/) { + my $key_sum = $1; + chomp(my $dest_key + = `git -C "$dest" annex lookupkey "$rel"`); + if ($dest_key =~ /^SHA256E-s[0-9]+--([0-9a-f]+)/) { + my $dest_key_sum = $1; + die +"git-annex calculated a different checksum for $target" + unless $key_sum eq $dest_key_sum; + } + } + } else { # this is not an annexed file + if (-d $File::Find::name) { + mkdir $target unless -d $target; + } else { + copy_and_md5($File::Find::name, $target); + system "git", "-C", $dest, + "-c", "annex.gitaddtoannex=false", "add", $rel; + } + } + $annex->git->rm($File::Find::name) unless -d $File::Find::name; + }, + no_chdir => 1, + }, + $source + ); + $annex->git->commit({ message => "migrated by annex-to-annex" }) + if $do_commit; +} +$dannex->git->commit({ message => "add" }) if $do_commit; + +sub copy_and_md5 { + copy($_[0], $_[1]); + die "md5 checksum failure after copying $_[0] to $_[1]!" + unless file_md5($_[0]) eq file_md5($_[1]); +} diff --git a/bin/annex-to-annex-dropunused b/bin/annex-to-annex-dropunused new file mode 100755 index 0000000..33ddf03 --- /dev/null +++ b/bin/annex-to-annex-dropunused @@ -0,0 +1,72 @@ +#!/usr/bin/perl + +# annex-to-annex-dropunused -- drop old hardlinks migrated by annex-to-annex + +# Copyright (C) 2019-2020 Sean Whitton +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or (at +# your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +=head1 NAME + +annex-to-annex-dropunused - drop old hardlinks migrated by annex-to-annex + +=head1 SYNOPSIS + + + +=head1 DESCRIPTION + + + +=cut + +use 5.028; +use strict; +use warnings; + +use autodie; +use Git::Annex; + +# This script used to have a --dest option which specified the +# destination annex previously used with annex-to-annex. Then, if the +# unused file had a hardlink count of 1, but was present in the +# destination annex, this script would drop it. +# +# That was somewhat dangerous functionality because it involves this +# script running `git annex dropunused --force` for files with a +# hardlink count of 1. And further, it is not actually needed, +# because running annex-to-annex-reinject after +# annex-to-annex-dropunused handles such files in a way that is safer. +# +# It is still good to run this script before annex-to-annex-reinject +# to make the latter faster. + +my $annex = Git::Annex->new; + +my @to_drop; +my @unused_files = grep { !$_->{bad} && !$_->{tmp} } @{ $annex->unused }; + +foreach my $unused_file (@unused_files) { + my $content = $annex->abs_contentlocation($unused_file->{key}); + my $link_count = (stat $content)[3]; + my @logs + = $annex->git->log({ no_textconv => 1 }, "-S", $unused_file->{key}); + + next unless $logs[0] and $logs[0]->message =~ /migrated by annex-to-annex/; + next unless $link_count > 1; + + push @to_drop, $unused_file->{number}; +} + +$annex->git->annex("dropunused", { force => 1 }, @to_drop); -- cgit v1.2.3