summaryrefslogtreecommitdiff
path: root/bin/annex-to-annex
blob: ee16f05b1f99840a0eccf4cde1f98f8b6507e136 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
#!/usr/bin/perl
# PODNAME: annex-to-annex
# ABSTRACT: use hardlinks to migrate files between git annex repos
#
# Copyright (C) 2019-2020 Sean Whitton
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

=head1 SYNOPSIS

B<annex-to-annex> [B<--commit>] I<source> ... I<destination>

=head1 DESCRIPTION

This script moves files and directories from one or more git annexes
into a destination git annex, where possible using hardlinks instead
of copying files.

It is useful for splitting and consolidating git annexes.  For
example, at the end of the semester I use this script to move files
from my work annex, which gets synced to a lot of places, into an
archival annex, which doesn't.

Each I<source> should be a file or directory in a git annex.
I<destination> should be a subdirectory of a git annex.

=head1 OPTIONS

=over 4

=item B<--commit>

Commit changes in the source and destination annexes.  Otherwise
changes are merely added to the index.

=back

=head1 SEE ALSO

git-annex(1), annex-to-annex-dropunused(1), annex-to-annex-reinject(1)

=cut

use 5.028;
use strict;
use warnings;

use autodie;
use Digest::MD5::File qw(file_md5);
use File::Basename qw(dirname basename);
use File::Copy;
use File::Find;
use File::Spec::Functions qw(catfile rel2abs abs2rel);
use Try::Tiny;
use Git::Annex;

# only support v7 because supporting v5 too would make things quite
# complex.  require git-annex >=7.20191009 because it will refuse to
# work in v5 repos, and because it supports `git annex find --unlocked`
#
# `git annex version` prints "field: value" lines.  split each line
# at the first ": " only and skip lines without a value, so that a
# blank line or a value that itself contains ": " cannot shift every
# subsequent key/value pair out of alignment
my %annex_version_fields;
for my $line (`git annex version`) {
    chomp $line;
    my ($field, $value) = split /: /, $line, 2;
    $annex_version_fields{$field} = $value if defined $value;
}

# compare only the leading numeric part, so that a version string
# carrying a suffix does not trigger a "isn't numeric" warning; a
# missing or unparseable version fails the check
my ($annex_version)
  = ($annex_version_fields{'git-annex version'} // '')
  =~ /\A([0-9]+(?:\.[0-9]+)?)/;
die "I need git-annex >=7.20191009 and a v7 repository\n"
  unless defined $annex_version and $annex_version >= 7.20191009;

# command line is: [--commit] source ... destination.  the destination
# is taken off the end of @ARGV first, then the optional leading
# --commit; whatever remains are the sources
die "need at least two arguments\n" unless @ARGV > 1;
my $dest = rel2abs pop @ARGV;
die "dest is not a directory\n" unless -d $dest;

# remembered so that annexed content living on the same device as the
# destination can be hardlinked instead of copied
my $dest_device_id = (stat($dest))[0];
my $dannex         = Git::Annex->new($dest);
my $do_commit      = 0;
if ($ARGV[0] eq '--commit') {
    $do_commit = 1;
    shift @ARGV;

    # `annex-to-annex --commit dest` gets past the argument count
    # check above but leaves nothing to migrate
    die "need at least one source\n" unless @ARGV;

    # we are going to commit on the user's behalf, so insist on a
    # clean working tree ...
    my @git_status = $dannex->git->RUN("status", { porcelain => 1 });
    die "git repo containing $dest is not clean; please commit\n"
      unless @git_status == 0;

    # ... and on HEAD being a symbolic ref, i.e. not detached
    #<<<
    try {
        $dannex->git->symbolic_ref({ quiet => 1 }, "HEAD");
    } catch {
        die "$dest has a detached HEAD; aborting";
    };
    #>>>
}
my @sources = map rel2abs($_), @ARGV;

# run git in the destination annex, bypassing the shell, and die if it
# exits non-zero.  plain `use autodie` leaves system() unchecked, and
# a failed `git annex add` must not go unnoticed because the source
# file is `git rm`ed once it has been processed
my $git_in_dest = sub {
    my @args = @_;
    system("git", "-C", $dest, @args) == 0
      or die "`git -C $dest @args` failed (wait status $?)\n";
};

# return what `git annex lookupkey` prints in the destination annex
# for the given path (relative to $dest), or the empty string if it
# prints nothing.  a list-form pipe open is used so that quotes,
# dollar signs, backticks etc. in filenames never reach a shell
my $dest_lookupkey = sub {
    my ($path) = @_;
    open my $pipe, "-|", "git", "-C", $dest, "annex", "lookupkey", $path;
    my $key = do { local $/; <$pipe> } // '';
    chomp $key;
    {
        # a non-zero exit from lookupkey just means there is no key to
        # compare against, in which case the sanity check in the loop
        # below is skipped; don't let autodie make that fatal
        no autodie qw(close);
        close $pipe;
    }
    return $key;
};

# process one entry in @sources at a time because we can start up
# annex batch processes for each of these as all files under each
# entry in @sources will lie in the same annex
foreach my $source (@sources) {
    my $dir   = dirname $source;
    my $annex = Git::Annex->new($dir);
    #<<<
    try {
        $annex->git->annex("status");
    } catch {
        die "$source does not appear to lie within an annex\n";
    };
    #>>>
    die "$source does not exist\n" unless -e $source;

    if ($do_commit) {
        # same preconditions as for the destination: a clean working
        # tree and a HEAD that is not detached
        my @git_status = $annex->git->RUN("status", { porcelain => 1 });
        die "git repo containing $source is not clean; please commit\n"
          unless @git_status == 0;

        #<<<
        try {
            $annex->git->symbolic_ref({ quiet => 1 }, "HEAD");
        } catch {
            die "$source has a detached HEAD; aborting";
        };
        #>>>
    }

    # refuse to start unless the content of every annexed file under
    # the source is present in this repository
    my $base = basename $source;
    my @missing = $annex->git->annex("find", "--not", "--in", "here", $base);
    if (@missing) {
        say "Following annexed files are not present in this repo:";
        say for @missing;
        die "cannot continue; please `git-annex get` them\n";
    }

    # start batch processes
    my $lk   = $annex->batch("lookupkey");
    my $cl   = $annex->batch("contentlocation");
    my $find = $annex->batch("find", "--unlocked");

    find({
            wanted => sub {
                my $found  = $File::Find::name;
                my $rel    = abs2rel $found, $dir;
                my $target = catfile $dest,  $rel;

                # -e is false for a dangling symlink, but such a link
                # still occupies the name, so test for -l as well
                die "$target already exists!\n"
                  if (-e $target or -l $target) and !-d $target;

                my $key = $lk->ask($rel);
                if ($key) {    # this is an annexed file
                    my $content = rel2abs $cl->ask($key), $annex->toplevel;
                    my $content_device_id = (stat $content)[0];
                    if ($dest_device_id == $content_device_id) {
                        link $content, $target;
                    } else {
                        copy_and_md5($content, $target);
                    }
                    # add, and then maybe unlock.  we don't use `-c
                    # annex.addunlocked=true` because we want to
                    # hardlink from .git/annex/objects in the source
                    # to .git/annex/objects in the dest, rather than
                    # having the unlocked copy in dest be hardlinked
                    # to the source, or anything like that
                    $git_in_dest->("annex", "add", $rel);
                    $git_in_dest->("annex", "unlock", $rel)
                      if $find->ask($rel);

                    # if using the default backend, quick sanity check
                    if ($key =~ /^SHA256E-s[0-9]+--([0-9a-f]+)/) {
                        my $key_sum  = $1;
                        my $dest_key = $dest_lookupkey->($rel);
                        if ($dest_key =~ /^SHA256E-s[0-9]+--([0-9a-f]+)/) {
                            my $dest_key_sum = $1;
                            die
"git-annex calculated a different checksum for $target"
                              unless $key_sum eq $dest_key_sum;
                        }
                    }
                } else {    # this is not an annexed file
                    if (-d $found) {
                        mkdir $target unless -d $target;
                    } else {
                        copy_and_md5($found, $target);
                        $git_in_dest->(
                            "-c", "annex.gitaddtoannex=false", "add", $rel
                        );
                    }
                }

                # only reached once everything above succeeded, so the
                # source is not removed unless the destination has it
                $annex->git->rm($found) unless -d $found;
            },
            no_chdir => 1,
        },
        $source
    );
    $annex->git->commit({ message => "migrated by annex-to-annex" })
      if $do_commit;
}
$dannex->git->commit({ message => "add" }) if $do_commit;

# Copy a file and verify the copy by comparing md5 checksums.
#
# Arguments: the path of the file to copy, then the path to copy it to.
#
# Dies if the copy itself fails, or if the md5 checksums of the two
# files do not match afterwards.
sub copy_and_md5 {
    my ($from, $to) = @_;

    # File::Copy::copy is not a builtin, so autodie does not check it
    # for us
    copy($from, $to) or die "failed to copy $from to $to: $!\n";

    # treat a missing digest as a failure too, in case file_md5 yields
    # undef for a file it cannot read -- two undefs would otherwise
    # compare equal
    my $from_sum = file_md5($from);
    my $to_sum   = file_md5($to);
    die "md5 checksum failure after copying $from to $to!"
      unless defined $from_sum
      and defined $to_sum
      and $from_sum eq $to_sum;
}