Commit eeec217

Write-ahead log (#29)
Replaces the unstructured data file for storing key-value pairs with a write-ahead log.

- In the event of a crash or a power loss, the database is automatically recovered (#27).
- Fixes disk space overhead when storing small keys and values (#24).
- Optional background compaction allows reclaiming disk space occupied by overwritten or deleted keys (#28).

See docs/design.md for more details.
1 parent 4b58804 commit eeec217

37 files changed

Lines changed: 2366 additions & 1132 deletions
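The headline user-facing addition is the public compaction API in the new compaction.go (shown in full below). A minimal usage sketch, assuming `pogreb.Open` and `Close` behave as in the README examples; the path and logging are illustrative, not part of the commit:

```go
package main

import (
	"log"

	"github.com/akrylysov/pogreb"
)

func main() {
	// Open (or, after a crash, automatically recover) a database.
	db, err := pogreb.Open("test.db", nil)
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	// Reclaim disk space held by overwritten or deleted keys.
	cr, err := db.Compact()
	if err != nil {
		log.Fatal(err)
	}
	log.Printf("compacted %d segment(s), reclaimed %d record(s) / %d byte(s)",
		cr.CompactedSegments, cr.ReclaimedRecords, cr.ReclaimedBytes)
}
```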

.gitignore

Lines changed: 0 additions & 2 deletions
```diff
@@ -1,3 +1 @@
 /test.db
-/test.db.index
-/test.db.lock
```

.travis.yml

Lines changed: 2 additions & 1 deletion
```diff
@@ -1,7 +1,8 @@
 language: go
 
 go:
-- 1.x
+- "1.11.x"
+- "1.x"
 
 os:
 - linux
```

CHANGELOG.md

Lines changed: 9 additions & 0 deletions
```diff
@@ -1,5 +1,14 @@
 # Changelog
 
+## [0.9.0] - 2020-03-08
+### Changed
+- Replace the unstructured data file for storing key-value pairs with a write-ahead log.
+### Added
+- In the event of a crash or a power loss the database is automatically recovered.
+- Optional background compaction allows reclaiming disk space occupied by overwritten or deleted keys.
+### Fixed
+- Fix disk space overhead when storing small keys and values.
+
 ## [0.8.3] - 2019-11-03
 ### Fixed
 - Fix slice bounds out of range error mapping files on Windows.
```

README.md

Lines changed: 9 additions & 9 deletions
````diff
@@ -45,7 +45,7 @@ func main() {
 
 ### Writing to a database
 
-Use the `DB.Put()` function to insert a new key/value pair:
+Use the `DB.Put()` function to insert a new key-value pair:
 
 ```go
 err := db.Put([]byte("testKey"), []byte("testValue"))
@@ -56,7 +56,7 @@ if err != nil {
 
 ### Reading from a database
 
-Use the `DB.Get()` function to retrieve the inserted value:
+To retrieve the inserted value, use the `DB.Get()` function:
 
 ```go
 val, err := db.Get([]byte("testKey"))
@@ -68,17 +68,17 @@ log.Printf("%s", val)
 
 ### Iterating over items
 
-Use the `DB.Items()` function which returns a new instance of `ItemIterator`:
+To iterate over items, use `ItemIterator` returned by `DB.Items()`:
 
 ```go
 it := db.Items()
 for {
 	key, val, err := it.Next()
-	if err != nil {
-		if err != pogreb.ErrIterationDone {
-			log.Fatal(err)
-		}
-		break
+	if err == pogreb.ErrIterationDone {
+		break
+	}
+	if err != nil {
+		log.Fatal(err)
 	}
 	log.Printf("%s %s", key, val)
 }
@@ -95,4 +95,4 @@ on DigitalOcean 8 CPUs / 16 GB RAM / 160 GB SSD + Ubuntu 16.04.3 (higher is bett
 
 ## Internals
 
-[Pogreb - how it works](https://artem.krylysov.com/blog/2018/03/24/pogreb-key-value-store/).
+[Design document](/docs/design.md).
````

bucket.go

Lines changed: 27 additions & 21 deletions
```diff
@@ -6,59 +6,62 @@ import (
 	"github.com/akrylysov/pogreb/fs"
 )
 
+const (
+	slotsPerBucket = 31
+	bucketSize     = 512
+)
+
+// slot corresponds to a single item in the hash table.
 type slot struct {
 	hash      uint32
+	segmentID uint16
 	keySize   uint16
 	valueSize uint32
-	kvOffset  int64
+	offset    uint32 // Segment offset.
 }
 
 func (sl slot) kvSize() uint32 {
 	return uint32(sl.keySize) + sl.valueSize
 }
 
+// bucket is an array of slots.
 type bucket struct {
 	slots [slotsPerBucket]slot
-	next  int64
+	next  int64 // Offset of overflow bucket.
 }
 
+// bucketHandle is a bucket, plus its offset and the file it's written to.
 type bucketHandle struct {
 	bucket
 	file   fs.MmapFile
 	offset int64
 }
 
-const (
-	bucketSize uint32 = 512
-)
-
-func align512(n uint32) uint32 {
-	return (n + 511) &^ 511
-}
-
 func (b bucket) MarshalBinary() ([]byte, error) {
 	buf := make([]byte, bucketSize)
 	data := buf
 	for i := 0; i < slotsPerBucket; i++ {
 		sl := b.slots[i]
 		binary.LittleEndian.PutUint32(buf[:4], sl.hash)
-		binary.LittleEndian.PutUint16(buf[4:6], sl.keySize)
-		binary.LittleEndian.PutUint32(buf[6:10], sl.valueSize)
-		binary.LittleEndian.PutUint64(buf[10:18], uint64(sl.kvOffset))
-		buf = buf[18:]
+		binary.LittleEndian.PutUint16(buf[4:6], sl.segmentID)
+		binary.LittleEndian.PutUint16(buf[6:8], sl.keySize)
+		binary.LittleEndian.PutUint32(buf[8:12], sl.valueSize)
+		binary.LittleEndian.PutUint32(buf[12:16], sl.offset)
+		buf = buf[16:]
 	}
 	binary.LittleEndian.PutUint64(buf[:8], uint64(b.next))
 	return data, nil
 }
 
 func (b *bucket) UnmarshalBinary(data []byte) error {
 	for i := 0; i < slotsPerBucket; i++ {
-		_ = data[18] // bounds check hint to compiler; see golang.org/issue/14808
+		_ = data[16] // bounds check hint to compiler; see golang.org/issue/14808
 		b.slots[i].hash = binary.LittleEndian.Uint32(data[:4])
-		b.slots[i].keySize = binary.LittleEndian.Uint16(data[4:6])
-		b.slots[i].valueSize = binary.LittleEndian.Uint32(data[6:10])
-		b.slots[i].kvOffset = int64(binary.LittleEndian.Uint64(data[10:18]))
-		data = data[18:]
+		b.slots[i].segmentID = binary.LittleEndian.Uint16(data[4:6])
+		b.slots[i].keySize = binary.LittleEndian.Uint16(data[6:8])
+		b.slots[i].valueSize = binary.LittleEndian.Uint32(data[8:12])
+		b.slots[i].offset = binary.LittleEndian.Uint32(data[12:16])
+		data = data[16:]
 	}
 	b.next = int64(binary.LittleEndian.Uint64(data[:8]))
 	return nil
@@ -89,15 +92,17 @@ func (b *bucketHandle) write() error {
 	return err
 }
 
+// slotWriter inserts and writes slots into a bucket.
 type slotWriter struct {
 	bucket      *bucketHandle
 	slotIdx     int
 	prevBuckets []*bucketHandle
 }
 
-func (sw *slotWriter) insert(sl slot, db *DB) error {
+func (sw *slotWriter) insert(sl slot, idx *index) error {
 	if sw.slotIdx == slotsPerBucket {
-		nextBucket, err := db.createOverflowBucket()
+		// Bucket is full, create a new overflow bucket.
+		nextBucket, err := idx.createOverflowBucket()
 		if err != nil {
 			return err
 		}
@@ -112,6 +117,7 @@ func (sw *slotWriter) insert(sl slot, db *DB) error {
 }
 
 func (sw *slotWriter) write() error {
+	// Write previous buckets first.
 	for i := len(sw.prevBuckets) - 1; i >= 0; i-- {
 		if err := sw.prevBuckets[i].write(); err != nil {
 			return err
```
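A quick size check on the new slot encoding (my sketch, not part of the commit): a serialized slot shrank from 18 to 16 bytes, so 31 slots plus the trailing 8-byte overflow pointer fit comfortably in one 512-byte bucket:

```go
package main

import "fmt"

func main() {
	// New on-disk slot encoding (see MarshalBinary above):
	// hash (4) + segmentID (2) + keySize (2) + valueSize (4) + offset (4).
	const slotSize = 4 + 2 + 2 + 4 + 4 // 16 bytes, down from 18

	const slotsPerBucket = 31
	const nextPointer = 8 // trailing overflow-bucket offset, uint64

	used := slotsPerBucket*slotSize + nextPointer
	fmt.Println(used) // 504 -- fits the 512-byte bucketSize with 8 bytes to spare
}
```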

compaction.go

Lines changed: 148 additions & 0 deletions
New file:

```go
package pogreb

import (
	"sync/atomic"
)

func (db *DB) moveRecord(rec record) (bool, error) {
	hash := db.hash(rec.key)
	reclaimed := true
	err := db.index.forEachBucket(db.index.bucketIndex(hash), func(b bucketHandle) (bool, error) {
		for i, sl := range b.slots {
			if sl.offset == 0 {
				return b.next == 0, nil
			}

			// Slot points to a different record.
			if hash != sl.hash || rec.offset != sl.offset || rec.segmentID != sl.segmentID {
				continue
			}

			segmentID, offset, err := db.datalog.writeRecord(rec.data, rec.rtype) // TODO: batch writes
			if err != nil {
				return true, err
			}
			// Update index.
			b.slots[i].segmentID = segmentID
			b.slots[i].offset = offset
			reclaimed = false
			return true, b.write()
		}
		return false, nil
	})
	return reclaimed, err
}

// CompactionResult holds the compaction result.
type CompactionResult struct {
	CompactedSegments int
	ReclaimedRecords  int
	ReclaimedBytes    int
}

func (db *DB) compact(f *segment) (CompactionResult, error) {
	cr := CompactionResult{}

	db.mu.Lock()
	f.meta.Full = true // Prevent writes to the compacted file.
	db.mu.Unlock()

	it, err := newSegmentIterator(f)
	if err != nil {
		return cr, err
	}
	// Move records from f to the current segment.
	for {
		err := func() error {
			db.mu.Lock()
			defer db.mu.Unlock()
			rec, err := it.next()
			if err != nil {
				return err
			}
			if rec.rtype == recordTypeDelete {
				cr.ReclaimedRecords++
				cr.ReclaimedBytes += len(rec.data)
				return nil
			}
			reclaimed, err := db.moveRecord(rec)
			if reclaimed {
				cr.ReclaimedRecords++
				cr.ReclaimedBytes += len(rec.data)
			}
			return err
		}()
		if err == ErrIterationDone {
			break
		}
		if err != nil {
			return cr, err
		}
	}

	db.mu.Lock()
	defer db.mu.Unlock()
	err = db.datalog.removeSegment(f)
	return cr, err
}

func (db *DB) pickForCompaction() ([]*segment, error) {
	segments, err := db.datalog.segmentsByModification()
	if err != nil {
		return nil, err
	}
	var picked []*segment
	for i := len(segments) - 1; i >= 0; i-- {
		seg := segments[i]

		if uint32(seg.size) < db.opts.compactionMinSegmentSize {
			continue
		}

		fragmentation := float32(seg.meta.DeletedBytes) / float32(seg.size)
		if fragmentation < db.opts.compactionMinFragmentation {
			continue
		}

		if seg.meta.DeleteRecords > 0 {
			// Delete records can be discarded only when older files contain no put records for the corresponding keys.
			// All files older than the file eligible for compaction have to be compacted.
			return append(segments[:i+1], picked...), nil
		}

		picked = append([]*segment{seg}, picked...)
	}
	return picked, nil
}

// Compact compacts the DB. Deleted and overwritten items are discarded.
func (db *DB) Compact() (CompactionResult, error) {
	cr := CompactionResult{}

	// Run only a single compaction at a time.
	if !atomic.CompareAndSwapInt32(&db.compactionRunning, 0, 1) {
		return cr, errBusy
	}
	defer func() {
		atomic.StoreInt32(&db.compactionRunning, 0)
	}()

	db.mu.RLock()
	segments, err := db.pickForCompaction()
	db.mu.RUnlock()
	if err != nil {
		return cr, err
	}

	for _, f := range segments {
		fcr, err := db.compact(f)
		if err != nil {
			return cr, err
		}
		cr.CompactedSegments++
		cr.ReclaimedRecords += fcr.ReclaimedRecords
		cr.ReclaimedBytes += fcr.ReclaimedBytes
	}

	return cr, nil
}
```
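The compare-and-swap on `compactionRunning` makes `Compact` safe to fire from a background goroutine: a concurrent call simply returns an error instead of racing. A hedged sketch of driving the "optional background compaction" by hand; the helper name, interval, and shutdown channel are my own, not part of the commit:

```go
package pogrebutil

import (
	"log"
	"time"

	"github.com/akrylysov/pogreb"
)

// compactPeriodically triggers compaction on a fixed interval until done is
// closed. Compact itself guarantees at most one compaction runs at a time.
func compactPeriodically(db *pogreb.DB, interval time.Duration, done <-chan struct{}) {
	ticker := time.NewTicker(interval)
	defer ticker.Stop()
	for {
		select {
		case <-ticker.C:
			cr, err := db.Compact()
			if err != nil {
				log.Printf("compaction: %v", err)
				continue
			}
			log.Printf("compaction reclaimed %d byte(s) in %d segment(s)",
				cr.ReclaimedBytes, cr.CompactedSegments)
		case <-done:
			return
		}
	}
}
```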
