Skip to content

Commit

Permalink
internal/counter: fix file rotation panics
Browse files Browse the repository at this point in the history
Weekly, the memory-mapped counter file is replaced by a new one.
If this fails, using counters may panic.

This CL fixes the code to avoid the above panics by
setting the counter's file's mapping to nil while there is a danger
that an existing mapping is no longer valid.

The reader can understand the changes using the new TestRotateCounters
at line 99 to see what happens when remapping fails.
1. In the existing code, the current.Lookup at existing line 308 will fault
2. Commenting out that code delays the fault to the c.Inc() at line 106
   of rotate_test.go.
3. So the counters have dangling pointers. These have to be reset.
   This is done by f.current.Store(nil), and running a cleanup
   function at line 313 in the new code. The cleanup function resets
   all the counter pointers, and then cleans up the old mapping, if
   there is one. (The code to reset the counters cannot be run while
   f.mu is locked.)
4. But there is the same problem with counter pointers pointing to the
   old mapping even after the file points to the new mapping. The same
   solution works, resetting the counter pointer, and then closing
   the old mapping. (The old code freed the mapping too soon.)
In both cases (3 and 4), if a counter pointer is used, it will point
   to a valid mapping (nil is fine; the increment goes into the
   counter's state)

Summary: an old mapping cannot be removed until all the file's
counters point to a new mapping.

The CL also
  Fixes some internal documentation,
  Fixes a time-related test error that only showed up in Eastern
      time evening,
  Adds a test showing the life of a counter and testing mapping
      failure in file rotation.

Fixes: golang/go#64577

Change-Id: I3e4775fb527e94d8a32505c8d77a131659b0a82a
Reviewed-on: https://go-review.googlesource.com/c/telemetry/+/552276
Reviewed-by: Robert Findley <[email protected]>
Reviewed-by: Hyang-Ah Hana Kim <[email protected]>
TryBot-Result: Gopher Robot <[email protected]>
Run-TryBot: Peter Weinberger <[email protected]>
  • Loading branch information
pjweinbgo committed Jan 16, 2024
1 parent 76c6b0d commit 88e2d1c
Show file tree
Hide file tree
Showing 3 changed files with 255 additions and 100 deletions.
59 changes: 1 addition & 58 deletions internal/counter/counter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -374,63 +374,6 @@ func future(days int) func() time.Time {
}
}

func TestRotate(t *testing.T) {
testenv.SkipIfUnsupportedPlatform(t)

t.Logf("GOOS %s GOARCH %s", runtime.GOOS, runtime.GOARCH)
year, month, day := counterTime().Date()
now := time.Date(year, month, day, 0, 0, 0, 0, time.UTC)
setup(t)
defer restore()
// pretend something was uploaded
os.WriteFile(filepath.Join(telemetry.UploadDir, "anything"), []byte{}, 0666)
var f file
defer close(&f)
c := f.New("gophers")
c.Inc()
var modified int
for i := 0; i < 2; i++ {
// nothing should change on the second rotate
f.rotate()
fi, err := os.ReadDir(telemetry.LocalDir)
if err != nil || len(fi) != 2 {
t.Fatalf("err=%v, len(fi) = %d, want 2", err, len(fi))
}
x := fi[0].Name()
y := x[len(x)-len("2006-01-02")-len(".v1.count") : len(x)-len(".v1.count")]
us, err := time.ParseInLocation("2006-01-02", y, time.UTC)
if err != nil {
t.Fatal(err)
}
// we expect today's date?
if us != now {
t.Errorf("us = %v, want %v, i=%d y=%s", us, now, i, y)
}
fd, err := os.Open(filepath.Join(telemetry.LocalDir, fi[0].Name()))
if err != nil {
t.Fatal(err)
}
stat, err := fd.Stat()
if err != nil {
t.Fatal(err)
}
mt := stat.ModTime().Nanosecond()
if modified == 0 {
modified = mt
}
if modified != mt {
t.Errorf("modified = %v, want %v", mt, modified)
}
fd.Close()
}
counterTime = func() time.Time { return now.Add(7 * 24 * time.Hour) }
f.rotate()
fi, err := os.ReadDir(telemetry.LocalDir)
if err != nil || len(fi) != 3 {
t.Fatalf("err=%v, len(fi) = %d, want 3", err, len(fi))
}
}

func TestStack(t *testing.T) {
testenv.SkipIfUnsupportedPlatform(t)
t.Logf("GOOS %s GOARCH %s", runtime.GOOS, runtime.GOARCH)
Expand Down Expand Up @@ -558,7 +501,7 @@ func setup(t *testing.T) {
}

func restore() {
counterTime = time.Now
counterTime = time.Now().UTC
}

func (f *file) New(name string) *Counter {
Expand Down
88 changes: 46 additions & 42 deletions internal/counter/file.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,11 +78,10 @@ func (f *file) register(c *Counter) {
}

// invalidateCounters marks as invalid all the pointers
// held by f's counters (because they point into m),
// and then closes prev.
// held by f's counters and then refreshes them.
//
// invalidateCounters cannot be called while holding f.mu,
// because a counter invalidation may call f.lookup.
// because a counter refresh may call f.lookup.
func (f *file) invalidateCounters() {
// Mark every counter as needing to refresh its count pointer.
if head := f.counters.Load(); head != nil {
Expand Down Expand Up @@ -178,9 +177,8 @@ func (f *file) filename(now time.Time) (name string, expire time.Time, err error
now = now.UTC()
year, month, day := now.Date()
begin := time.Date(year, month, day, 0, 0, 0, 0, time.UTC)
// files always begin today, but expire on the day of the week
// from the 'weekends' file. The actual day is the next day, except
// first-time users have no files expiring in less than 7 days.
// files always begin today, but expire on the next day of the week
// from the 'weekends' file.
incr, err := fileValidity(now)
if err != nil {
return "", time.Time{}, err
Expand All @@ -201,8 +199,7 @@ func (f *file) filename(now time.Time) (name string, expire time.Time, err error
}

// fileValidity returns the number of days that a file is valid for.
// It is either the number of days to the next day of the week from the
// 'weekends' file, or 7 more than that for new users.
// It is the number of days to the next day of the week from the 'weekends' file.
func fileValidity(now time.Time) (int, error) {
// If there is no 'weekends' file create it and initialize it
// to a random day of the week. There is a short interval for
Expand All @@ -212,10 +209,10 @@ func fileValidity(now time.Time) (int, error) {
if _, err := os.ReadFile(weekends); err != nil {
if err := os.MkdirAll(telemetry.LocalDir, 0777); err != nil {
debugPrintf("%v: could not create telemetry.LocalDir %s", err, telemetry.LocalDir)
return 0, err
return 7, err
}
if err = os.WriteFile(weekends, []byte(day), 0666); err != nil {
return 0, err
return 7, err
}
}

Expand All @@ -224,11 +221,11 @@ func fileValidity(now time.Time) (int, error) {
// There is no reasonable way of recovering from errors
// so we just fail
if err != nil {
return 0, err
return 7, err
}
buf = bytes.TrimSpace(buf)
if len(buf) == 0 {
return 0, fmt.Errorf("empty weekends file")
return 7, err
}
dayofweek := time.Weekday(buf[0] - '0') // 0 is Sunday
// paranoia to make sure the value is legal
Expand Down Expand Up @@ -270,51 +267,58 @@ func (f *file) rotate1() (expire time.Time, cleanup func()) {
f.mu.Lock()
defer f.mu.Unlock()

// TODO(hyangah): shouldn't we close the current one if f.current.Load() != nil
// but f.filename fails to return a valid name for the next counter file?
var previous *mappedFile
cleanup = func() {
// convert counters to new mapping (or nil)
// from old mapping (or nil)
f.invalidateCounters()
if previous == nil {
// no old mapping to worry about
return
}
// now it is safe to clean up the old mapping
if err := previous.f.Close(); err != nil {
log.Print(err)
}
if err := munmap(previous.mapping); err != nil {
log.Print(err)
}
}

name, expire, err := f.filename(counterTime())
if err != nil {
// This could be mode == "off" (when rotate is called for the first time)
ret := nop
if previous = f.current.Load(); previous != nil {
// or it could be some strange error
f.current.Store(nil)
ret = cleanup
}
debugPrintf("rotate: %v\n", err)
return time.Time{}, nop
}
if name == "" {
return time.Time{}, nop
return time.Time{}, ret
}

current := f.current.Load()
if current != nil && name == current.f.Name() {
previous = f.current.Load()
if previous != nil && name == previous.f.Name() {
// the existing file is fine
return expire, nop
}

if current != nil {
if err := current.f.Close(); err != nil {
log.Print(err)
}
if err := munmap(current.mapping); err != nil {
log.Print(err)
}
// TODO(hyangah): Ask pjw/rsc:
// Lookup accesses f.current.Load without lock.
// Why is it ok to munmap here?
// pjw: i think current.mapping can't be what we
// want; we're discarding it.
}

m, err := openMapped(name, f.meta, nil)
if err != nil {
// Mapping failed:
// If there used to be a mapped file, after cleanup
// incrementing counters will only change their internal state.
// (before cleanup the existing mapped file would be updated)
f.current.Store(nil) // invalidate the current mapping
debugPrintf("rotate: openMapped: %v\n", err)
if current != nil {
// TODO(hyangah): Didn't we just munmap current.mapping?
if v, _, _, _ := current.lookup("counter/rotate-error"); v != nil {
v.Add(1)
}
}
return expire, nop
return time.Time{}, cleanup
}

debugPrintf("using %v", m.f.Name())
f.current.Store(m)
return expire, f.invalidateCounters

return expire, cleanup
}

func (f *file) newCounter(name string) *atomic.Uint64 {
Expand Down
Loading

0 comments on commit 88e2d1c

Please sign in to comment.