blob: GetBlobContent: reduce allocations (#8223)

See #8222 for context.

## git.Blob.NewTruncatedReader

This introduces a new `NewTruncatedReader` method that returns a blob reader which silently truncates once the limit is reached (returning `io.EOF`).
Since the actual size is also returned, `GetBlobContent` can pre-allocate a `[]byte` of the final size (the minimum of the requested limit and the actual blob size) and fill it with a single `io.ReadFull(rc, buf)` call, instead of going through `util.ReadWithLimit(dataRc, int(limit))`, which is convoluted and not used anywhere else.
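
For illustration, here is a minimal sketch of the resulting read path, mirroring what the updated `GetBlobContent` does (the wrapper name `readTruncated` is hypothetical; `NewTruncatedReader`, `io.ReadFull` and the builtin `min` are the pieces this change relies on):

```go
package example

import (
	"io"

	"forgejo.org/modules/git" // module path as used in this repository
)

// readTruncated is a hypothetical caller demonstrating the new pattern:
// allocate the buffer once at its final size, then fill it with one read.
func readTruncated(blob *git.Blob, limit int64) (string, error) {
	if limit <= 0 {
		return "", nil
	}
	rc, fullSize, err := blob.NewTruncatedReader(limit)
	if err != nil {
		return "", err
	}
	// Close discards any unread bytes, the truncated remainder and the
	// trailing newline of the cat-file batch output.
	defer rc.Close()

	buf := make([]byte, min(fullSize, limit)) // single allocation, exact size
	_, err = io.ReadFull(rc, buf)             // reads exactly len(buf) bytes, so no io.EOF here
	return string(buf), err
}
```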

### Tests

- I added test coverage for Go changes...
  - [x] in their respective `*_test.go` for unit tests.

### Documentation

- [x] I did not document these changes and I do not expect someone else to do it.

### Release notes

- [x] I do not want this change to show in the release notes.

Reviewed-on: https://codeberg.org/forgejo/forgejo/pulls/8223
Reviewed-by: Lucas <sclu1034@noreply.codeberg.org>
Reviewed-by: Earl Warren <earl-warren@noreply.codeberg.org>
Co-authored-by: oliverpool <git@olivier.pfad.fr>
Co-committed-by: oliverpool <git@olivier.pfad.fr>
Authored by oliverpool on 2025-06-19 18:36:12 +02:00, committed by Earl Warren
parent 913eaffb8a
commit 31ad7c9353
4 changed files with 95 additions and 135 deletions

@@ -12,7 +12,6 @@ import (
 	"forgejo.org/modules/log"
 	"forgejo.org/modules/typesniffer"
-	"forgejo.org/modules/util"
 )
 
 // Blob represents a Git object.
@@ -25,42 +24,25 @@ type Blob struct {
 	repo *Repository
 }
 
-// DataAsync gets a ReadCloser for the contents of a blob without reading it all.
-// Calling the Close function on the result will discard all unread output.
-func (b *Blob) DataAsync() (io.ReadCloser, error) {
+func (b *Blob) newReader() (*bufio.Reader, int64, func(), error) {
 	wr, rd, cancel, err := b.repo.CatFileBatch(b.repo.Ctx)
 	if err != nil {
-		return nil, err
+		return nil, 0, nil, err
 	}
 	_, err = wr.Write([]byte(b.ID.String() + "\n"))
 	if err != nil {
 		cancel()
-		return nil, err
+		return nil, 0, nil, err
 	}
 	_, _, size, err := ReadBatchLine(rd)
 	if err != nil {
 		cancel()
-		return nil, err
+		return nil, 0, nil, err
 	}
 	b.gotSize = true
 	b.size = size
 
-	if size < 4096 {
-		bs, err := io.ReadAll(io.LimitReader(rd, size))
-		defer cancel()
-		if err != nil {
-			return nil, err
-		}
-		_, err = rd.Discard(1)
-		return io.NopCloser(bytes.NewReader(bs)), err
-	}
-
-	return &blobReader{
-		rd:     rd,
-		n:      size,
-		cancel: cancel,
-	}, nil
+	return rd, size, cancel, err
 }
 
 // Size returns the uncompressed size of the blob
@@ -91,9 +73,35 @@ func (b *Blob) Size() int64 {
 	return b.size
 }
 
+// DataAsync gets a ReadCloser for the contents of a blob without reading it all.
+// Calling the Close function on the result will discard all unread output.
+func (b *Blob) DataAsync() (io.ReadCloser, error) {
+	rd, size, cancel, err := b.newReader()
+	if err != nil {
+		return nil, err
+	}
+
+	if size < 4096 {
+		bs, err := io.ReadAll(io.LimitReader(rd, size))
+		defer cancel()
+		if err != nil {
+			return nil, err
+		}
+		_, err = rd.Discard(1)
+		return io.NopCloser(bytes.NewReader(bs)), err
+	}
+
+	return &blobReader{
+		rd:     rd,
+		n:      size,
+		cancel: cancel,
+	}, nil
+}
+
 type blobReader struct {
 	rd *bufio.Reader
-	n  int64
+	n                 int64 // number of bytes to read
+	additionalDiscard int64 // additional number of bytes to discard
 	cancel func()
 }
 
@@ -117,7 +125,8 @@ func (b *blobReader) Close() error {
 	defer b.cancel()
 
-	if err := DiscardFull(b.rd, b.n+1); err != nil {
+	// discard the unread bytes, the truncated bytes and the trailing newline
+	if err := DiscardFull(b.rd, b.n+b.additionalDiscard+1); err != nil {
 		return err
 	}
 
@@ -131,17 +140,35 @@ func (b *Blob) Name() string {
 	return b.name
 }
 
-// GetBlobContent Gets the limited content of the blob as raw text
+// NewTruncatedReader return a blob-reader which silently truncates when the limit is reached (io.EOF will be returned)
+func (b *Blob) NewTruncatedReader(limit int64) (rc io.ReadCloser, fullSize int64, err error) {
+	r, fullSize, cancel, err := b.newReader()
+	if err != nil {
+		return nil, fullSize, err
+	}
+
+	limit = min(limit, fullSize)
+	return &blobReader{
+		rd:                r,
+		n:                 limit,
+		additionalDiscard: fullSize - limit,
+		cancel:            cancel,
+	}, fullSize, nil
+}
+
+// GetBlobContent Gets the truncated content of the blob as raw text
 func (b *Blob) GetBlobContent(limit int64) (string, error) {
 	if limit <= 0 {
 		return "", nil
 	}
-	dataRc, err := b.DataAsync()
+	rc, fullSize, err := b.NewTruncatedReader(limit)
 	if err != nil {
 		return "", err
 	}
-	defer dataRc.Close()
-	buf, err := util.ReadWithLimit(dataRc, int(limit))
+	defer rc.Close()
+
+	buf := make([]byte, min(fullSize, limit))
+	_, err = io.ReadFull(rc, buf)
 	return string(buf), err
 }

@@ -35,6 +35,43 @@ func TestBlob_Data(t *testing.T) {
 	assert.Equal(t, output, string(data))
 }
 
+func TestBlob_GetBlobContent(t *testing.T) {
+	bareRepo1Path := filepath.Join(testReposDir, "repo1_bare")
+	repo, err := openRepositoryWithDefaultContext(bareRepo1Path)
+	require.NoError(t, err)
+	defer repo.Close()
+
+	testBlob, err := repo.GetBlob("6c493ff740f9380390d5c9ddef4af18697ac9375")
+	require.NoError(t, err)
+
+	r, err := testBlob.GetBlobContent(100)
+	require.NoError(t, err)
+	require.Equal(t, "file2\n", r)
+
+	r, err = testBlob.GetBlobContent(-1)
+	require.NoError(t, err)
+	require.Empty(t, r)
+
+	r, err = testBlob.GetBlobContent(4)
+	require.NoError(t, err)
+	require.Equal(t, "file", r)
+
+	r, err = testBlob.GetBlobContent(6)
+	require.NoError(t, err)
+	require.Equal(t, "file2\n", r)
+
+	t.Run("non-existing blob", func(t *testing.T) {
+		inexistingBlob, err := repo.GetBlob("00003ff740f9380390d5c9ddef4af18690000000")
+		require.NoError(t, err)
+
+		r, err := inexistingBlob.GetBlobContent(100)
+		require.Error(t, err)
+		require.IsType(t, ErrNotExist{}, err)
+		require.Empty(t, r)
+	})
+}
+
 func Benchmark_Blob_Data(b *testing.B) {
 	bareRepo1Path := filepath.Join(testReposDir, "repo1_bare")
 	repo, err := openRepositoryWithDefaultContext(bareRepo1Path)

@@ -4,7 +4,6 @@
 package util
 
 import (
-	"bytes"
 	"errors"
 	"io"
 )
@@ -20,42 +19,6 @@ func ReadAtMost(r io.Reader, buf []byte) (n int, err error) {
 	return n, err
 }
 
-// ReadWithLimit reads at most "limit" bytes from r into buf.
-// If EOF or ErrUnexpectedEOF occurs while reading, err will be nil.
-func ReadWithLimit(r io.Reader, n int) (buf []byte, err error) {
-	return readWithLimit(r, 1024, n)
-}
-
-func readWithLimit(r io.Reader, batch, limit int) ([]byte, error) {
-	if limit <= batch {
-		buf := make([]byte, limit)
-		n, err := ReadAtMost(r, buf)
-		if err != nil {
-			return nil, err
-		}
-		return buf[:n], nil
-	}
-	res := bytes.NewBuffer(make([]byte, 0, batch))
-	bufFix := make([]byte, batch)
-	eof := false
-	for res.Len() < limit && !eof {
-		bufTmp := bufFix
-		if res.Len()+batch > limit {
-			bufTmp = bufFix[:limit-res.Len()]
-		}
-		n, err := io.ReadFull(r, bufTmp)
-		if err == io.EOF || err == io.ErrUnexpectedEOF {
-			eof = true
-		} else if err != nil {
-			return nil, err
-		}
-		if _, err = res.Write(bufTmp[:n]); err != nil {
-			return nil, err
-		}
-	}
-	return res.Bytes(), nil
-}
-
 // ErrNotEmpty is an error reported when there is a non-empty reader
 var ErrNotEmpty = errors.New("not-empty")

@@ -1,67 +0,0 @@
-// Copyright 2023 The Gitea Authors. All rights reserved.
-// SPDX-License-Identifier: MIT
-
-package util
-
-import (
-	"bytes"
-	"errors"
-	"testing"
-
-	"github.com/stretchr/testify/assert"
-	"github.com/stretchr/testify/require"
-)
-
-type readerWithError struct {
-	buf *bytes.Buffer
-}
-
-func (r *readerWithError) Read(p []byte) (n int, err error) {
-	if r.buf.Len() < 2 {
-		return 0, errors.New("test error")
-	}
-	return r.buf.Read(p)
-}
-
-func TestReadWithLimit(t *testing.T) {
-	bs := []byte("0123456789abcdef")
-
-	// normal test
-	buf, err := readWithLimit(bytes.NewBuffer(bs), 5, 2)
-	require.NoError(t, err)
-	assert.Equal(t, []byte("01"), buf)
-
-	buf, err = readWithLimit(bytes.NewBuffer(bs), 5, 5)
-	require.NoError(t, err)
-	assert.Equal(t, []byte("01234"), buf)
-
-	buf, err = readWithLimit(bytes.NewBuffer(bs), 5, 6)
-	require.NoError(t, err)
-	assert.Equal(t, []byte("012345"), buf)
-
-	buf, err = readWithLimit(bytes.NewBuffer(bs), 5, len(bs))
-	require.NoError(t, err)
-	assert.Equal(t, []byte("0123456789abcdef"), buf)
-
-	buf, err = readWithLimit(bytes.NewBuffer(bs), 5, 100)
-	require.NoError(t, err)
-	assert.Equal(t, []byte("0123456789abcdef"), buf)
-
-	// test with error
-	buf, err = readWithLimit(&readerWithError{bytes.NewBuffer(bs)}, 5, 10)
-	require.NoError(t, err)
-	assert.Equal(t, []byte("0123456789"), buf)
-
-	buf, err = readWithLimit(&readerWithError{bytes.NewBuffer(bs)}, 5, 100)
-	require.ErrorContains(t, err, "test error")
-	assert.Empty(t, buf)
-
-	// test public function
-	buf, err = ReadWithLimit(bytes.NewBuffer(bs), 2)
-	require.NoError(t, err)
-	assert.Equal(t, []byte("01"), buf)
-
-	buf, err = ReadWithLimit(bytes.NewBuffer(bs), 9999999)
-	require.NoError(t, err)
-	assert.Equal(t, []byte("0123456789abcdef"), buf)
-}