summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Joey Hess <joey@kitenet.net> 2014-03-10 15:12:54 -0400
committer: Joey Hess <joey@kitenet.net> 2014-03-10 15:12:54 -0400
commit: 39a3a2641c1b97e6a4c7e1c9bb405236a7c2e5c3 (patch)
tree: e98678db2346b18f3d471a389f1ac2ee6f994d0b
parent: 06d4c0713457cad82d11b7c5172a7dbecb65f93a (diff)
download: git-repair-39a3a2641c1b97e6a4c7e1c9bb405236a7c2e5c3.tar.gz
Improve memory usage when git fsck finds a great many broken objects.
Memory use drops from 1.7 GB to 900 MB on 300 thousand unique reported SHAs. When SHAs are not unique, this streams much better than before, so it won't buffer the full list before putting them into the Set and throwing away duplicates. And when fsck output includes ignorable lines, especially dangling object lines, they won't be buffered in memory at all.
-rw-r--r-- Git/Fsck.hs 20
-rw-r--r-- debian/changelog 6
2 files changed, 23 insertions, 3 deletions
diff --git a/Git/Fsck.hs b/Git/Fsck.hs
index e90683b..b3948cb 100644
--- a/Git/Fsck.hs
+++ b/Git/Fsck.hs
@@ -23,6 +23,7 @@ import Utility.Batch
import qualified Git.Version
import qualified Data.Set as S
+import System.Process (std_out, std_err)
type MissingObjects = S.Set Sha
@@ -46,9 +47,17 @@ findBroken batchmode r = do
(command', params') <- if batchmode
then toBatchCommand (command, params)
else return (command, params)
- (output, fsckok) <- processTranscript command' (toCommand params') Nothing
- let objs = findShas supportsNoDangling output
- badobjs <- findMissing objs r
+
+ p@(_, _, _, pid) <- createProcess $
+ (proc command' (toCommand params'))
+ { std_out = CreatePipe
+ , std_err = CreatePipe
+ }
+ bad1 <- readMissingObjs r supportsNoDangling (stdoutHandle p)
+ bad2 <- readMissingObjs r supportsNoDangling (stderrHandle p)
+ fsckok <- checkSuccessProcess pid
+ let badobjs = S.union bad1 bad2
+
if S.null badobjs && not fsckok
then return FsckFailed
else return $ FsckFoundMissing badobjs
@@ -69,6 +78,11 @@ knownMissing (FsckFoundMissing s) = s
findMissing :: [Sha] -> Repo -> IO MissingObjects
findMissing objs r = S.fromList <$> filterM (`isMissing` r) objs
+readMissingObjs :: Repo -> Bool -> Handle -> IO MissingObjects
+readMissingObjs r supportsNoDangling h = do
+ objs <- findShas supportsNoDangling <$> hGetContents h
+ findMissing objs r
+
isMissing :: Sha -> Repo -> IO Bool
isMissing s r = either (const True) (const False) <$> tryIO dump
where
diff --git a/debian/changelog b/debian/changelog
index 3ac62ab..9a3560b 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,9 @@
+git-repair (1.20140228) UNRELEASED; urgency=medium
+
+ * Improve memory usage when git fsck finds a great many broken objects.
+
+ -- Joey Hess <joeyh@debian.org> Mon, 10 Mar 2014 15:09:30 -0400
+
git-repair (1.20140227) unstable; urgency=medium
* Optimise unpacking of pack files, and avoid repeated error