From 31ad7c9353ca1c82bb5785b5391efdd350be40d8 Mon Sep 17 00:00:00 2001 From: oliverpool Date: Thu, 19 Jun 2025 18:36:12 +0200 Subject: [PATCH] blob: GetBlobContent: reduce allocations (#8223) See #8222 for context. ## git.Blob.NewTruncatedReader This introduce a new `NewTruncatedReader` method to return a blob-reader which silently truncates when the limit is reached (io.EOF will be returned). Since the actual size is also returned `GetBlobContent` can pre-allocate a `[]byte` of the full-size (min of the asked size and the actual size) and call `io.ReadFull(rc, buf)` (instead of `util.ReadWithLimit(dataRc, int(limit))` which is convoluted and not used anywhere else). ### Tests - I added test coverage for Go changes... - [x] in their respective `*_test.go` for unit tests. ### Documentation - [x] I did not document these changes and I do not expect someone else to do it. ### Release notes - [x] I do not want this change to show in the release notes. Reviewed-on: https://codeberg.org/forgejo/forgejo/pulls/8223 Reviewed-by: Lucas Reviewed-by: Earl Warren Co-authored-by: oliverpool Co-committed-by: oliverpool --- modules/git/blob.go | 89 ++++++++++++++++++++++++++-------------- modules/git/blob_test.go | 37 +++++++++++++++++ modules/util/io.go | 37 ----------------- modules/util/io_test.go | 67 ------------------------------ 4 files changed, 95 insertions(+), 135 deletions(-) delete mode 100644 modules/util/io_test.go diff --git a/modules/git/blob.go b/modules/git/blob.go index 3fda358938..8c5c275146 100644 --- a/modules/git/blob.go +++ b/modules/git/blob.go @@ -12,7 +12,6 @@ import ( "forgejo.org/modules/log" "forgejo.org/modules/typesniffer" - "forgejo.org/modules/util" ) // Blob represents a Git object. @@ -25,42 +24,25 @@ type Blob struct { repo *Repository } -// DataAsync gets a ReadCloser for the contents of a blob without reading it all. -// Calling the Close function on the result will discard all unread output. -func (b *Blob) DataAsync() (io.ReadCloser, error) { +func (b *Blob) newReader() (*bufio.Reader, int64, func(), error) { wr, rd, cancel, err := b.repo.CatFileBatch(b.repo.Ctx) if err != nil { - return nil, err + return nil, 0, nil, err } _, err = wr.Write([]byte(b.ID.String() + "\n")) if err != nil { cancel() - return nil, err + return nil, 0, nil, err } _, _, size, err := ReadBatchLine(rd) if err != nil { cancel() - return nil, err + return nil, 0, nil, err } b.gotSize = true b.size = size - - if size < 4096 { - bs, err := io.ReadAll(io.LimitReader(rd, size)) - defer cancel() - if err != nil { - return nil, err - } - _, err = rd.Discard(1) - return io.NopCloser(bytes.NewReader(bs)), err - } - - return &blobReader{ - rd: rd, - n: size, - cancel: cancel, - }, nil + return rd, size, cancel, err } // Size returns the uncompressed size of the blob @@ -91,10 +73,36 @@ func (b *Blob) Size() int64 { return b.size } +// DataAsync gets a ReadCloser for the contents of a blob without reading it all. +// Calling the Close function on the result will discard all unread output. +func (b *Blob) DataAsync() (io.ReadCloser, error) { + rd, size, cancel, err := b.newReader() + if err != nil { + return nil, err + } + + if size < 4096 { + bs, err := io.ReadAll(io.LimitReader(rd, size)) + defer cancel() + if err != nil { + return nil, err + } + _, err = rd.Discard(1) + return io.NopCloser(bytes.NewReader(bs)), err + } + + return &blobReader{ + rd: rd, + n: size, + cancel: cancel, + }, nil +} + type blobReader struct { - rd *bufio.Reader - n int64 - cancel func() + rd *bufio.Reader + n int64 // number of bytes to read + additionalDiscard int64 // additional number of bytes to discard + cancel func() } func (b *blobReader) Read(p []byte) (n int, err error) { @@ -117,7 +125,8 @@ func (b *blobReader) Close() error { defer b.cancel() - if err := DiscardFull(b.rd, b.n+1); err != nil { + // discard the unread bytes, the truncated bytes and the trailing newline + if err := DiscardFull(b.rd, b.n+b.additionalDiscard+1); err != nil { return err } @@ -131,17 +140,35 @@ func (b *Blob) Name() string { return b.name } -// GetBlobContent Gets the limited content of the blob as raw text +// NewTruncatedReader return a blob-reader which silently truncates when the limit is reached (io.EOF will be returned) +func (b *Blob) NewTruncatedReader(limit int64) (rc io.ReadCloser, fullSize int64, err error) { + r, fullSize, cancel, err := b.newReader() + if err != nil { + return nil, fullSize, err + } + + limit = min(limit, fullSize) + return &blobReader{ + rd: r, + n: limit, + additionalDiscard: fullSize - limit, + cancel: cancel, + }, fullSize, nil +} + +// GetBlobContent Gets the truncated content of the blob as raw text func (b *Blob) GetBlobContent(limit int64) (string, error) { if limit <= 0 { return "", nil } - dataRc, err := b.DataAsync() + rc, fullSize, err := b.NewTruncatedReader(limit) if err != nil { return "", err } - defer dataRc.Close() - buf, err := util.ReadWithLimit(dataRc, int(limit)) + defer rc.Close() + + buf := make([]byte, min(fullSize, limit)) + _, err = io.ReadFull(rc, buf) return string(buf), err } diff --git a/modules/git/blob_test.go b/modules/git/blob_test.go index 810964b33d..2f81bb84e0 100644 --- a/modules/git/blob_test.go +++ b/modules/git/blob_test.go @@ -35,6 +35,43 @@ func TestBlob_Data(t *testing.T) { assert.Equal(t, output, string(data)) } +func TestBlob_GetBlobContent(t *testing.T) { + bareRepo1Path := filepath.Join(testReposDir, "repo1_bare") + repo, err := openRepositoryWithDefaultContext(bareRepo1Path) + require.NoError(t, err) + + defer repo.Close() + + testBlob, err := repo.GetBlob("6c493ff740f9380390d5c9ddef4af18697ac9375") + require.NoError(t, err) + + r, err := testBlob.GetBlobContent(100) + require.NoError(t, err) + require.Equal(t, "file2\n", r) + + r, err = testBlob.GetBlobContent(-1) + require.NoError(t, err) + require.Empty(t, r) + + r, err = testBlob.GetBlobContent(4) + require.NoError(t, err) + require.Equal(t, "file", r) + + r, err = testBlob.GetBlobContent(6) + require.NoError(t, err) + require.Equal(t, "file2\n", r) + + t.Run("non-existing blob", func(t *testing.T) { + inexistingBlob, err := repo.GetBlob("00003ff740f9380390d5c9ddef4af18690000000") + require.NoError(t, err) + + r, err := inexistingBlob.GetBlobContent(100) + require.Error(t, err) + require.IsType(t, ErrNotExist{}, err) + require.Empty(t, r) + }) +} + func Benchmark_Blob_Data(b *testing.B) { bareRepo1Path := filepath.Join(testReposDir, "repo1_bare") repo, err := openRepositoryWithDefaultContext(bareRepo1Path) diff --git a/modules/util/io.go b/modules/util/io.go index 1559b019a0..4c99004c0c 100644 --- a/modules/util/io.go +++ b/modules/util/io.go @@ -4,7 +4,6 @@ package util import ( - "bytes" "errors" "io" ) @@ -20,42 +19,6 @@ func ReadAtMost(r io.Reader, buf []byte) (n int, err error) { return n, err } -// ReadWithLimit reads at most "limit" bytes from r into buf. -// If EOF or ErrUnexpectedEOF occurs while reading, err will be nil. -func ReadWithLimit(r io.Reader, n int) (buf []byte, err error) { - return readWithLimit(r, 1024, n) -} - -func readWithLimit(r io.Reader, batch, limit int) ([]byte, error) { - if limit <= batch { - buf := make([]byte, limit) - n, err := ReadAtMost(r, buf) - if err != nil { - return nil, err - } - return buf[:n], nil - } - res := bytes.NewBuffer(make([]byte, 0, batch)) - bufFix := make([]byte, batch) - eof := false - for res.Len() < limit && !eof { - bufTmp := bufFix - if res.Len()+batch > limit { - bufTmp = bufFix[:limit-res.Len()] - } - n, err := io.ReadFull(r, bufTmp) - if err == io.EOF || err == io.ErrUnexpectedEOF { - eof = true - } else if err != nil { - return nil, err - } - if _, err = res.Write(bufTmp[:n]); err != nil { - return nil, err - } - } - return res.Bytes(), nil -} - // ErrNotEmpty is an error reported when there is a non-empty reader var ErrNotEmpty = errors.New("not-empty") diff --git a/modules/util/io_test.go b/modules/util/io_test.go deleted file mode 100644 index 870e713646..0000000000 --- a/modules/util/io_test.go +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright 2023 The Gitea Authors. All rights reserved. -// SPDX-License-Identifier: MIT - -package util - -import ( - "bytes" - "errors" - "testing" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -type readerWithError struct { - buf *bytes.Buffer -} - -func (r *readerWithError) Read(p []byte) (n int, err error) { - if r.buf.Len() < 2 { - return 0, errors.New("test error") - } - return r.buf.Read(p) -} - -func TestReadWithLimit(t *testing.T) { - bs := []byte("0123456789abcdef") - - // normal test - buf, err := readWithLimit(bytes.NewBuffer(bs), 5, 2) - require.NoError(t, err) - assert.Equal(t, []byte("01"), buf) - - buf, err = readWithLimit(bytes.NewBuffer(bs), 5, 5) - require.NoError(t, err) - assert.Equal(t, []byte("01234"), buf) - - buf, err = readWithLimit(bytes.NewBuffer(bs), 5, 6) - require.NoError(t, err) - assert.Equal(t, []byte("012345"), buf) - - buf, err = readWithLimit(bytes.NewBuffer(bs), 5, len(bs)) - require.NoError(t, err) - assert.Equal(t, []byte("0123456789abcdef"), buf) - - buf, err = readWithLimit(bytes.NewBuffer(bs), 5, 100) - require.NoError(t, err) - assert.Equal(t, []byte("0123456789abcdef"), buf) - - // test with error - buf, err = readWithLimit(&readerWithError{bytes.NewBuffer(bs)}, 5, 10) - require.NoError(t, err) - assert.Equal(t, []byte("0123456789"), buf) - - buf, err = readWithLimit(&readerWithError{bytes.NewBuffer(bs)}, 5, 100) - require.ErrorContains(t, err, "test error") - assert.Empty(t, buf) - - // test public function - buf, err = ReadWithLimit(bytes.NewBuffer(bs), 2) - require.NoError(t, err) - assert.Equal(t, []byte("01"), buf) - - buf, err = ReadWithLimit(bytes.NewBuffer(bs), 9999999) - require.NoError(t, err) - assert.Equal(t, []byte("0123456789abcdef"), buf) -}