#!/usr/bin/perl

# annex-to-annex -- use hardlinks to migrate files between git annex repos

# Copyright (C) 2019 Sean Whitton
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

use 5.028;
use strict;
use warnings;

use Digest::MD5::File qw(file_md5);
use File::Basename qw(dirname basename);
use File::Copy;
use File::Find;
use File::Spec::Functions qw(catfile rel2abs abs2rel);
use IPC::Open2;

# only support v7 and later because supporting v5 too would make things
# quite complex.  require git-annex >=7.20191009 because it will refuse
# to work in v5 repos, and because it supports `git annex find --unlocked`
#
# parse the output of `git annex version` into a field => value hash.
# each line is split on the first ': ' only, so a value which itself
# contains ': ' cannot shift the key/value pairing, and lines without a
# separator are skipped
my %annex_version_fields;
foreach my $line (`git annex version`) {
    chomp $line;
    my ($field, $value) = split /: /, $line, 2;
    $annex_version_fields{$field} = $value if defined $value;
}

# compare the leading MAJOR.DATE part numerically.  a string comparison
# would wrongly reject e.g. git-annex 10.x, because "10..." sorts before
# "7...".  anything following the numeric prefix is ignored
my ($annex_version)
  = ($annex_version_fields{'git-annex version'} // '') =~ /\A(\d+\.\d+)/;
die "I need git-annex >=7.20191009 and a v7 repository\n"
  unless defined $annex_version and $annex_version >= 7.20191009;

# usage: annex-to-annex [--commit] SOURCE... DEST
#
# the last argument is the destination directory; everything before it
# (after an optional leading --commit) is a source to migrate
die "need at least two arguments\n" unless @ARGV > 1;
my $dest = rel2abs(pop @ARGV);
die "dest is not a directory\n" unless -d $dest;
# remembered so that we can later tell whether annexed content lives on
# the same device as $dest (hardlink) or not (copy)
my $dest_device_id = (stat($dest))[0];
my $do_commit      = 0;
if ($ARGV[0] eq '--commit') {
    $do_commit = 1;
    shift @ARGV;
    # without this check, `--commit DEST` would migrate nothing and then
    # still try to commit in DEST
    die "need at least one source and a destination\n" unless @ARGV;

    # list-form pipe open avoids the shell, so $dest is passed verbatim
    # even if it contains quotes or other shell metacharacters
    open(my $status_fh, '-|', 'git', '-C', $dest, 'status', '--porcelain')
      or die "could not run git status in $dest: $!\n";
    my @git_status = <$status_fh>;
    # close() on a pipe returns false when the command exited non-zero;
    # a failed `git status` prints nothing and must not be mistaken for
    # a clean working tree
    close $status_fh
      or die "git status failed in $dest; is it inside a git repo?\n";
    die "git repo containing $dest is not clean; please commit\n"
      unless @git_status == 0;
}
my @sources = map rel2abs($_), @ARGV;

# run a command without going through the shell, so that paths
# containing quotes, `$`, backticks etc. are passed through verbatim.
# returns a success flag (true iff the command exited zero) followed by
# the lines of output the command wrote to stdout
my $git_capture = sub {
    my @cmd = @_;
    open(my $fh, '-|', @cmd) or return (0);
    my @lines = <$fh>;
    close $fh;    # waits for the command and sets $?
    return ($? == 0, @lines);
};

# process one entry in @sources at a time because we can start up
# annex batch processes for each of these as all files under each
# entry in @sources will lie in the same annex
foreach my $source (@sources) {
    my $dir        = dirname($source);
    my $commit_dir = $dir;
    my ($in_annex) = $git_capture->('git', '-C', $dir, 'annex', 'status');
    die "$source does not appear to lie within an annex\n" unless $in_annex;
    die "$source does not exist\n" unless -e $source;

    if ($do_commit) {
        my (undef, @git_status)
          = $git_capture->('git', '-C', $dir, 'status', '--porcelain');
        die "git repo containing $source is not clean; please commit\n"
          unless @git_status == 0;
    }

    # refuse to continue if any annexed file under $source lacks its
    # content in this repo, as there would be nothing to link or copy
    my $base = basename($source);
    my (undef, @missing)
      = $git_capture->('git', '-C', $dir, 'annex', 'find', '--not', '--in',
        'here', $base);
    if (@missing) {
        chomp @missing;
        say "Following annexed files are not present in this repo:";
        say for @missing;
        die "cannot continue; please `git-annex get` them\n";
    }

    # start batch processes
    my ($lk_out, $lk_in, $cl_out, $cl_in, $find_out, $find_in);
    my $lk_pid
      = open2($lk_out, $lk_in, 'git', '-C', $dir, 'annex', 'lookupkey',
        '--batch');
    my $cl_pid
      = open2($cl_out, $cl_in, 'git', '-C', $dir, 'annex', 'contentlocation',
        '--batch');
    my $find_pid = open2(
        $find_out, $find_in, 'git',  '-C',
        $dir,      'annex',  'find', '--unlocked',
        '--batch'
    );

    find({
            wanted => sub {
                my $rel = abs2rel($File::Find::name, $dir);
                my $target = catfile($dest, $rel);
                die "$target already exists!\n" if -e $target and !-d $target;

                my $key = ga_batch($lk_out, $lk_in, $rel);
                if (defined $key) {    # this is an annexed file
                    my $content = ga_batch($cl_out, $cl_in, $key);
                    die "cannot locate annexed content of $File::Find::name\n"
                      unless defined $content;
                    $content = rel2abs($content, $dir);
                    my $content_device_id = (stat($content))[0];
                    die "cannot stat $content: $!\n"
                      unless defined $content_device_id;
                    if ($dest_device_id == $content_device_id) {
                        link($content, $target)
                          or die
                          "failed to hardlink $content to $target: $!\n";
                    } else {
                        copy_and_md5($content, $target);
                    }
                    # add, and then maybe unlock.  we don't use `-c
                    # annex.addunlocked=true` because we want to hardlink
                    # from .git/annex/objects in the source to
                    # .git/annex/objects in the dest, rather than having
                    # the unlocked copy in dest be hardlinked to the
                    # source, or anything like that
                    #
                    # a failed add must be fatal: otherwise we would go
                    # on to `git rm` the file from the source even
                    # though it never made it into the dest
                    system('git', '-C', $dest, 'annex', 'add', $rel) == 0
                      or die "git annex add failed for $target\n";
                    system('git', '-C', $dest, 'annex', 'unlock', $rel)
                      if defined ga_batch($find_out, $find_in, $rel);

                    # if using the default backend, quick sanity check
                    if ($key =~ /^SHA256E-s[0-9]+--([0-9a-f]+)/) {
                        my $key_sum = $1;
                        my (undef, @lookup)
                          = $git_capture->('git', '-C', $dest, 'annex',
                            'lookupkey', $rel);
                        my $dest_key = $lookup[0] // '';
                        chomp $dest_key;
                        if ($dest_key =~ /^SHA256E-s[0-9]+--([0-9a-f]+)/) {
                            my $dest_key_sum = $1;
                            die
"git-annex calculated a different checksum for $target"
                              unless $key_sum eq $dest_key_sum;
                        }
                    }
                } else {    # this is not an annexed file
                    if (-d $File::Find::name) {
                        # best-effort: the directory may already exist
                        # in the dest, which is fine
                        mkdir $target;
                    } else {
                        copy_and_md5($File::Find::name, $target);
                        system('git', '-C', $dest, '-c',
                            'annex.gitaddtoannex=false', 'add', $rel) == 0
                          or die "git add failed for $target\n";
                    }
                }
                # not checked: `git rm` legitimately fails for files
                # that are not tracked in the source repo
                system('git', '-C', $dir, 'rm', $File::Find::name)
                  unless -d $File::Find::name;
                # $commit_dir may no longer exist, as if it was empty,
                # `git rm` will have removed it
                $commit_dir = dirname($commit_dir) until -d $commit_dir;
            },
            no_chdir => 1,
        },
        $source
    );

    # shut down this source's batch processes.  open2 does not reap its
    # children, so without this every source would leave three idle
    # git-annex processes and six open pipes behind.  closing their
    # stdin is what tells them to exit
    close $_ for $lk_in, $cl_in, $find_in, $lk_out, $cl_out, $find_out;
    waitpid $_, 0 for $lk_pid, $cl_pid, $find_pid;

    system('git', '-C', $commit_dir, 'commit', '-m',
        "migrated by annex-to-annex")
      if $do_commit;
}
system('git', '-C', $dest, 'commit', '-m', 'add') if $do_commit;

# Perform one request/response exchange with a `git annex ... --batch`
# process.
#
# Arguments:
#   $out_fh - handle to read the process's replies from
#   $in_fh  - handle to write requests to
#   $line   - the request, without trailing newline
#
# Returns the reply with its newline removed, or undef if the reply was
# an empty line.  Dies if no reply can be read at all (end of file),
# because that means the batch process has gone away and treating it as
# an empty reply would silently misclassify every later file.
sub ga_batch {
    my ($out_fh, $in_fh, $line) = @_;
    say {$in_fh} $line;
    my $out = <$out_fh>;
    die "git-annex batch process exited unexpectedly\n"
      unless defined $out;
    chomp $out;
    return if $out eq "";
    return $out;
}

# Copy a file and verify the copy by comparing the MD5 checksums of
# source and destination.
#
# Arguments:
#   $from - path of the file to copy
#   $to   - path to copy it to
#
# Dies if the copy itself fails (reporting the system error) or if the
# two checksums differ afterwards.
sub copy_and_md5 {
    my ($from, $to) = @_;
    # report a failed copy as such, rather than letting it surface as a
    # misleading checksum mismatch against a missing or partial file
    copy($from, $to) or die "failed to copy $from to $to: $!\n";
    die "md5 checksum failure after copying $from to $to!"
      unless file_md5($from) eq file_md5($to);
}