-
Notifications
You must be signed in to change notification settings - Fork 521
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Ingester: Validate completed blocks #4256
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,13 @@ | ||
package vparquet4 | ||
|
||
import ( | ||
"context" | ||
"encoding/binary" | ||
"errors" | ||
"fmt" | ||
"sync" | ||
|
||
"github.com/google/uuid" | ||
"github.com/grafana/tempo/tempodb/backend" | ||
"github.com/grafana/tempo/tempodb/encoding/common" | ||
"go.opentelemetry.io/otel" | ||
|
@@ -33,3 +38,40 @@ func newBackendBlock(meta *backend.BlockMeta, r backend.Reader) *backendBlock { | |
func (b *backendBlock) BlockMeta() *backend.BlockMeta { | ||
return b.meta | ||
} | ||
|
||
// Validate will do a basic sanity check of the state of the parquet file. This can be extended to do more checks in the future. | ||
// This method should lean towards being cost effective over complete. | ||
func (b *backendBlock) Validate(ctx context.Context) error { | ||
if b.meta == nil { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there a way to reassemble this information from data on disk and still avoid paying the allocs on startup like you mention above? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. possibly? what do you mean by "this information"? |
||
return errors.New("block meta is nil") | ||
} | ||
|
||
// read last 8 bytes of the file to confirm its at least complete. the last 4 should be ascii "PAR1" | ||
// and the 4 bytes before that should be the length of the footer | ||
buff := make([]byte, 8) | ||
err := b.r.ReadRange(ctx, DataFileName, uuid.UUID(b.meta.BlockID), b.meta.TenantID, b.meta.Size_-8, buff, nil) | ||
if err != nil { | ||
return fmt.Errorf("failed to read parquet magic footer: %w", err) | ||
} | ||
|
||
if string(buff[4:]) != "PAR1" { | ||
return fmt.Errorf("invalid parquet magic footer: %x", buff[4:]) | ||
} | ||
|
||
footerSize := int64(binary.LittleEndian.Uint32(buff[:4])) | ||
if footerSize != int64(b.meta.FooterSize) { | ||
return fmt.Errorf("unexpected parquet footer size: %d", footerSize) | ||
} | ||
|
||
// read the first byte from all blooms to confirm they exist | ||
buff = make([]byte, 1) | ||
for i := 0; i < int(b.meta.BloomShardCount); i++ { | ||
bloomName := common.BloomName(i) | ||
err = b.r.ReadRange(ctx, bloomName, uuid.UUID(b.meta.BlockID), b.meta.TenantID, 0, buff, nil) | ||
if err != nil { | ||
return fmt.Errorf("failed to read first byte of bloom(%d): %w", i, err) | ||
} | ||
} | ||
|
||
return nil | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
package vparquet4 | ||
|
||
import ( | ||
"context" | ||
"testing" | ||
"time" | ||
|
||
"github.com/google/uuid" | ||
"github.com/grafana/tempo/pkg/util/test" | ||
"github.com/grafana/tempo/tempodb/backend" | ||
"github.com/grafana/tempo/tempodb/backend/local" | ||
"github.com/grafana/tempo/tempodb/encoding/common" | ||
"github.com/stretchr/testify/require" | ||
) | ||
|
||
func TestValidateFailsOnCorruptParquetFile(t *testing.T) { | ||
ctx := context.Background() | ||
block, w := validBlock(t) | ||
meta := block.meta | ||
|
||
err := block.Validate(ctx) | ||
require.NoError(t, err) | ||
|
||
// Corrupt the file | ||
err = w.Write(ctx, DataFileName, uuid.UUID(meta.BlockID), meta.TenantID, []byte{0, 0, 0, 0, 0, 0, 0, 0}, nil) | ||
require.NoError(t, err) | ||
|
||
err = block.Validate(ctx) | ||
require.Error(t, err) | ||
} | ||
|
||
func TestValidateFailsOnMissingBloom(t *testing.T) { | ||
ctx := context.Background() | ||
block, w := validBlock(t) | ||
meta := block.meta | ||
|
||
err := block.Validate(ctx) | ||
require.NoError(t, err) | ||
|
||
// remove a bloom | ||
err = w.Delete(ctx, common.BloomName(0), backend.KeyPathForBlock(uuid.UUID(meta.BlockID), meta.TenantID)) | ||
require.NoError(t, err) | ||
|
||
err = block.Validate(ctx) | ||
require.Error(t, err) | ||
} | ||
|
||
func validBlock(t *testing.T) (*backendBlock, backend.Writer) { | ||
t.Helper() | ||
|
||
ctx := context.Background() | ||
|
||
rawR, rawW, _, err := local.New(&local.Config{ | ||
Path: t.TempDir(), | ||
}) | ||
require.NoError(t, err) | ||
|
||
r := backend.NewReader(rawR) | ||
w := backend.NewWriter(rawW) | ||
|
||
iter := newTestIterator() | ||
|
||
iter.Add(test.MakeTrace(10, nil), 100, 401) | ||
iter.Add(test.MakeTrace(10, nil), 101, 402) | ||
iter.Add(test.MakeTrace(10, nil), 102, 403) | ||
|
||
cfg := &common.BlockConfig{ | ||
BloomFP: 0.01, | ||
BloomShardSizeBytes: 100 * 1024, | ||
} | ||
|
||
meta := backend.NewBlockMeta("fake", uuid.New(), VersionString, backend.EncNone, "") | ||
meta.TotalObjects = 1 | ||
meta.StartTime = time.Unix(300, 0) | ||
meta.EndTime = time.Unix(305, 0) | ||
|
||
outMeta, err := CreateBlock(ctx, cfg, meta, iter, r, w) | ||
require.NoError(t, err) | ||
|
||
return newBackendBlock(outMeta, r), w | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I wonder if it would not be simpler trying to open the file directly with parquet-go:
OpenFile seems to do essentially the same plus some other validations. Is this difference noticeable enough? Since this is at startup time maybe it isn't.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
OpenFile also unmarshals the thrift footer which is more costly. I was trying to avoid paying those allocs on startup.