convert: stream from fd to required clean filter to reduce used address space

The data is streamed to the filter process anyway.  Better avoid mapping
the file if possible.  This is especially useful if a clean filter
reduces the size, for example if it computes a sha1 for binary data,
like git media.  The file size that the previous implementation could
handle was limited by the available address space; large files for
example could not be handled with (32-bit) msysgit.  The new
implementation can filter files of any size as long as the filter output
is small enough.

The new code path is only taken if the filter is required.  The filter
consumes data directly from the fd.  If it fails, the original data is
not immediately available.  The condition can easily be handled as
a fatal error, which is expected for a required filter anyway.

If the filter was not required, the condition would need to be handled
in a different way, like seeking to 0 and reading the data.  But this
would require more restructuring of the code and is probably not worth
it.  The obvious approach of falling back to reading all data would not
help achieving the main purpose of this patch, which is to handle large
files with limited address space.  If reading all data is an option, we
can simply take the old code path right away and mmap the entire file.

The environment variable GIT_MMAP_LIMIT, which has been introduced in
a previous commit is used to test that the expected code path is taken.
A related test that exercises required filters is modified to verify
that the data actually has been modified on its way from the file system
to the object store.

Signed-off-by: Steffen Prohaska <prohaska@zib.de>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
4 files changed
tree: a275b416bd102af42760a60edc77594744dbeb7e
  1. block-sha1/
  2. builtin/
  3. compat/
  4. contrib/
  5. Documentation/
  6. ewah/
  7. git-gui/
  8. gitk-git/
  9. gitweb/
  10. mergetools/
  11. perl/
  12. po/
  13. ppc/
  14. t/
  15. templates/
  16. vcs-svn/
  17. xdiff/
  18. .gitattributes
  19. .gitignore
  20. .mailmap
  21. abspath.c
  22. aclocal.m4
  23. advice.c
  24. advice.h
  25. alias.c
  26. alloc.c
  27. archive-tar.c
  28. archive-zip.c
  29. archive.c
  30. archive.h
  31. argv-array.c
  32. argv-array.h
  33. attr.c
  34. attr.h
  35. base85.c
  36. bisect.c
  37. bisect.h
  38. blob.c
  39. blob.h
  40. branch.c
  41. branch.h
  42. builtin.h
  43. bulk-checkin.c
  44. bulk-checkin.h
  45. bundle.c
  46. bundle.h
  47. cache-tree.c
  48. cache-tree.h
  49. cache.h
  50. check-builtins.sh
  51. check-racy.c
  52. check_bindir
  53. color.c
  54. color.h
  55. column.c
  56. column.h
  57. combine-diff.c
  58. command-list.txt
  59. commit-slab.h
  60. commit.c
  61. commit.h
  62. config.c
  63. config.mak.in
  64. config.mak.uname
  65. configure.ac
  66. connect.c
  67. connect.h
  68. connected.c
  69. connected.h
  70. convert.c
  71. convert.h
  72. copy.c
  73. COPYING
  74. credential-cache--daemon.c
  75. credential-cache.c
  76. credential-store.c
  77. credential.c
  78. credential.h
  79. csum-file.c
  80. csum-file.h
  81. ctype.c
  82. daemon.c
  83. date.c
  84. decorate.c
  85. decorate.h
  86. delta.h
  87. diff-delta.c
  88. diff-lib.c
  89. diff-no-index.c
  90. diff.c
  91. diff.h
  92. diffcore-break.c
  93. diffcore-delta.c
  94. diffcore-order.c
  95. diffcore-pickaxe.c
  96. diffcore-rename.c
  97. diffcore.h
  98. dir.c
  99. dir.h
  100. editor.c
  101. entry.c
  102. environment.c
  103. exec_cmd.c
  104. exec_cmd.h
  105. fast-import.c
  106. fetch-pack.c
  107. fetch-pack.h
  108. fmt-merge-msg.h
  109. fsck.c
  110. fsck.h
  111. generate-cmdlist.sh
  112. gettext.c
  113. gettext.h
  114. git-add--interactive.perl
  115. git-am.sh
  116. git-archimport.perl
  117. git-bisect.sh
  118. git-compat-util.h
  119. git-cvsexportcommit.perl
  120. git-cvsimport.perl
  121. git-cvsserver.perl
  122. git-difftool--helper.sh
  123. git-difftool.perl
  124. git-filter-branch.sh
  125. git-instaweb.sh
  126. git-merge-octopus.sh
  127. git-merge-one-file.sh
  128. git-merge-resolve.sh
  129. git-mergetool--lib.sh
  130. git-mergetool.sh
  131. git-p4.py
  132. git-parse-remote.sh
  133. git-pull.sh
  134. git-quiltimport.sh
  135. git-rebase--am.sh
  136. git-rebase--interactive.sh
  137. git-rebase--merge.sh
  138. git-rebase.sh
  139. git-relink.perl
  140. git-remote-testgit.sh
  141. git-request-pull.sh
  142. git-send-email.perl
  143. git-sh-i18n.sh
  144. git-sh-setup.sh
  145. git-stash.sh
  146. git-submodule.sh
  147. git-svn.perl
  148. GIT-VERSION-GEN
  149. git-web--browse.sh
  150. git.c
  151. git.rc
  152. git.spec.in
  153. gpg-interface.c
  154. gpg-interface.h
  155. graph.c
  156. graph.h
  157. grep.c
  158. grep.h
  159. hashmap.c
  160. hashmap.h
  161. help.c
  162. help.h
  163. hex.c
  164. http-backend.c
  165. http-fetch.c
  166. http-push.c
  167. http-walker.c
  168. http.c
  169. http.h
  170. ident.c
  171. imap-send.c
  172. INSTALL
  173. khash.h
  174. kwset.c
  175. kwset.h
  176. levenshtein.c
  177. levenshtein.h
  178. LGPL-2.1
  179. line-log.c
  180. line-log.h
  181. line-range.c
  182. line-range.h
  183. list-objects.c
  184. list-objects.h
  185. ll-merge.c
  186. ll-merge.h
  187. lockfile.c
  188. log-tree.c
  189. log-tree.h
  190. mailmap.c
  191. mailmap.h
  192. Makefile
  193. match-trees.c
  194. merge-blobs.c
  195. merge-blobs.h
  196. merge-recursive.c
  197. merge-recursive.h
  198. merge.c
  199. mergesort.c
  200. mergesort.h
  201. name-hash.c
  202. notes-cache.c
  203. notes-cache.h
  204. notes-merge.c
  205. notes-merge.h
  206. notes-utils.c
  207. notes-utils.h
  208. notes.c
  209. notes.h
  210. object.c
  211. object.h
  212. pack-bitmap-write.c
  213. pack-bitmap.c
  214. pack-bitmap.h
  215. pack-check.c
  216. pack-objects.c
  217. pack-objects.h
  218. pack-revindex.c
  219. pack-revindex.h
  220. pack-write.c
  221. pack.h
  222. pager.c
  223. parse-options-cb.c
  224. parse-options.c
  225. parse-options.h
  226. patch-delta.c
  227. patch-ids.c
  228. patch-ids.h
  229. path.c
  230. pathspec.c
  231. pathspec.h
  232. pkt-line.c
  233. pkt-line.h
  234. preload-index.c
  235. pretty.c
  236. prio-queue.c
  237. prio-queue.h
  238. progress.c
  239. progress.h
  240. prompt.c
  241. prompt.h
  242. quote.c
  243. quote.h
  244. reachable.c
  245. reachable.h
  246. read-cache.c
  247. README
  248. reflog-walk.c
  249. reflog-walk.h
  250. refs.c
  251. refs.h
  252. remote-curl.c
  253. remote-testsvn.c
  254. remote.c
  255. remote.h
  256. replace_object.c
  257. rerere.c
  258. rerere.h
  259. resolve-undo.c
  260. resolve-undo.h
  261. revision.c
  262. revision.h
  263. run-command.c
  264. run-command.h
  265. send-pack.c
  266. send-pack.h
  267. sequencer.c
  268. sequencer.h
  269. server-info.c
  270. setup.c
  271. sh-i18n--envsubst.c
  272. sha1-array.c
  273. sha1-array.h
  274. sha1-lookup.c
  275. sha1-lookup.h
  276. sha1_file.c
  277. sha1_name.c
  278. shallow.c
  279. shell.c
  280. shortlog.h
  281. show-index.c
  282. sideband.c
  283. sideband.h
  284. sigchain.c
  285. sigchain.h
  286. split-index.c
  287. split-index.h
  288. strbuf.c
  289. strbuf.h
  290. streaming.c
  291. streaming.h
  292. string-list.c
  293. string-list.h
  294. submodule.c
  295. submodule.h
  296. symlinks.c
  297. tag.c
  298. tag.h
  299. tar.h
  300. test-chmtime.c
  301. test-ctype.c
  302. test-date.c
  303. test-delta.c
  304. test-dump-cache-tree.c
  305. test-dump-split-index.c
  306. test-genrandom.c
  307. test-hashmap.c
  308. test-index-version.c
  309. test-line-buffer.c
  310. test-match-trees.c
  311. test-mergesort.c
  312. test-mktemp.c
  313. test-parse-options.c
  314. test-path-utils.c
  315. test-prio-queue.c
  316. test-read-cache.c
  317. test-regex.c
  318. test-revision-walking.c
  319. test-run-command.c
  320. test-scrap-cache-tree.c
  321. test-sha1.c
  322. test-sha1.sh
  323. test-sigchain.c
  324. test-string-list.c
  325. test-subprocess.c
  326. test-svn-fe.c
  327. test-urlmatch-normalization.c
  328. test-wildmatch.c
  329. thread-utils.c
  330. thread-utils.h
  331. trace.c
  332. trace.h
  333. transport-helper.c
  334. transport.c
  335. transport.h
  336. tree-diff.c
  337. tree-walk.c
  338. tree-walk.h
  339. tree.c
  340. tree.h
  341. unicode_width.h
  342. unimplemented.sh
  343. unix-socket.c
  344. unix-socket.h
  345. unpack-trees.c
  346. unpack-trees.h
  347. update_unicode.sh
  348. upload-pack.c
  349. url.c
  350. url.h
  351. urlmatch.c
  352. urlmatch.h
  353. usage.c
  354. userdiff.c
  355. userdiff.h
  356. utf8.c
  357. utf8.h
  358. varint.c
  359. varint.h
  360. version.c
  361. version.h
  362. versioncmp.c
  363. walker.c
  364. walker.h
  365. wildmatch.c
  366. wildmatch.h
  367. wrap-for-bin.sh
  368. wrapper.c
  369. write_or_die.c
  370. ws.c
  371. wt-status.c
  372. wt-status.h
  373. xdiff-interface.c
  374. xdiff-interface.h
  375. zlib.c