Skip to content

Commit 3904180

Browse files
authored
Merge branch 'apache:master' into fix/disk_abnormal
2 parents 103f011 + 89f17c7 commit 3904180

12 files changed

Lines changed: 478 additions & 153 deletions

File tree

collector/config.yml

Lines changed: 25 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,16 @@
1515
# specific language governing permissions and limitations
1616
# under the License.
1717

18+
# logging configurations.
19+
log:
20+
filename: collector.log
21+
max_file_size_mb: 64
22+
retention_days: 3
23+
max_file_number: 8
24+
level: info
25+
1826
# the cluster that this collector is bound to
19-
cluster_name : "onebox"
27+
cluster_name: "onebox"
2028

2129
# the meta server addresses of the cluster.
2230
meta_servers:
@@ -25,33 +33,34 @@ meta_servers:
2533
- 127.0.0.1:34603
2634

2735
# local server port
28-
port : 34101
36+
port: 34101
2937

3038
metrics:
3139
# use falcon as monitoring system.
32-
sink : falcon
33-
report_interval : 10s
40+
sink: falcon
41+
report_interval: 10s
3442

3543
prometheus:
3644
# the port exposed by the prometheus exposer
37-
exposer_port : 1111
45+
exposer_port: 1111
3846

3947
falcon_agent:
4048
# the host IP of falcon agent
41-
host : "127.0.0.1"
42-
port : 1988
43-
http_path : "/v1/push"
49+
host: "127.0.0.1"
50+
port: 1988
51+
http_path: "/v1/push"
4452

4553
availability_detect:
46-
table_name : test
47-
partition_count : 16
48-
max_replica_count : 3
54+
table_name: test
55+
partition_count: 16
56+
max_replica_count: 3
4957

5058
hotspot:
51-
rpc_timeout : 5s
52-
partition_detect_interval : 30s
53-
pull_metrics_timeout : 5s
54-
sample_metrics_interval : 10s
55-
max_sample_size : 128
59+
retention_period: 24h
60+
rpc_timeout: 5s
61+
partition_detect_interval: 30s
62+
pull_metrics_timeout: 5s
63+
sample_metrics_interval: 10s
64+
max_sample_size: 128
5665
hotspot_partition_min_score: 3
5766
hotspot_partition_min_qps: 100

collector/go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ module github.com/apache/incubator-pegasus/collector
2020
go 1.18
2121

2222
require (
23-
github.com/apache/incubator-pegasus/go-client v0.0.0-20260121121155-96868ed93b2a
23+
github.com/apache/incubator-pegasus/go-client v0.0.0-20260211095029-022854b0259f
2424
github.com/gammazero/deque v1.0.0
2525
github.com/kataras/iris/v12 v12.2.0
2626
github.com/prometheus/client_golang v1.18.0

collector/go.sum

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,8 @@ github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuy
3535
github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0=
3636
github.com/andybalholm/brotli v1.0.5 h1:8uQZIdzKmjc/iuPu7O2ioW48L81FgatrcpfFmiq/cCs=
3737
github.com/andybalholm/brotli v1.0.5/go.mod h1:fO7iG3H7G2nSZ7m0zPUDn85XEX2GTukHGRSepvi9Eig=
38-
github.com/apache/incubator-pegasus/go-client v0.0.0-20260121121155-96868ed93b2a h1:Vqws5uoQ/ibw4QcnDHdXIleiGunC1QmZaMCrJN0znEk=
39-
github.com/apache/incubator-pegasus/go-client v0.0.0-20260121121155-96868ed93b2a/go.mod h1:SQnz/3Qg6uH1tfl3MKmiYwNk+i5CZiMD9AtMOTZkpgw=
38+
github.com/apache/incubator-pegasus/go-client v0.0.0-20260211095029-022854b0259f h1:Q9jSLZZCsD8tdU8h+qFe6PN5DPqWfiezkfK/8l16i7Y=
39+
github.com/apache/incubator-pegasus/go-client v0.0.0-20260211095029-022854b0259f/go.mod h1:SQnz/3Qg6uH1tfl3MKmiYwNk+i5CZiMD9AtMOTZkpgw=
4040
github.com/apache/thrift v0.13.0 h1:5hryIiq9gtn+MiLVn0wP37kb/uTeRZgN08WoCsAhIhI=
4141
github.com/apache/thrift v0.13.0/go.mod h1:cp2SuWMxlEZw2r+iP2GNCdIi4C1qmUzdZFSVb+bacwQ=
4242
github.com/armon/circbuf v0.0.0-20150827004946-bbbad097214e/go.mod h1:3U/XgcO3hCbHZ8TKRvWD2dDTCfh9M9ya+I9JpbB7O8o=

collector/hotspot/partition_detector.go

Lines changed: 70 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ type PartitionDetector interface {
4242

4343
type PartitionDetectorConfig struct {
4444
MetaServers []string
45+
RetentionPeriod time.Duration
4546
RpcTimeout time.Duration
4647
DetectInterval time.Duration
4748
PullMetricsTimeout time.Duration
@@ -54,6 +55,7 @@ type PartitionDetectorConfig struct {
5455
func LoadPartitionDetectorConfig() *PartitionDetectorConfig {
5556
return &PartitionDetectorConfig{
5657
MetaServers: viper.GetStringSlice("meta_servers"),
58+
RetentionPeriod: viper.GetDuration("hotspot.retention_period"),
5759
RpcTimeout: viper.GetDuration("hotspot.rpc_timeout"),
5860
DetectInterval: viper.GetDuration("hotspot.partition_detect_interval"),
5961
PullMetricsTimeout: viper.GetDuration("hotspot.pull_metrics_timeout"),
@@ -69,6 +71,10 @@ func NewPartitionDetector(cfg *PartitionDetectorConfig) (PartitionDetector, erro
6971
return nil, fmt.Errorf("MetaServers should not be empty")
7072
}
7173

74+
if cfg.RetentionPeriod <= 0 {
75+
return nil, fmt.Errorf("RetentionPeriod(%d) must be > 0", cfg.RetentionPeriod)
76+
}
77+
7278
if cfg.DetectInterval <= 0 {
7379
return nil, fmt.Errorf("DetectInterval(%d) must be > 0", cfg.DetectInterval)
7480
}
@@ -111,6 +117,12 @@ type partitionDetectorImpl struct {
111117
}
112118

113119
func (d *partitionDetectorImpl) Run(tom *tomb.Tomb) error {
120+
ctx, cancel := context.WithCancel(context.Background())
121+
122+
var wg sync.WaitGroup
123+
wg.Add(1)
124+
go d.checkExpiration(ctx, &wg)
125+
114126
ticker := time.NewTicker(d.cfg.DetectInterval)
115127
defer ticker.Stop()
116128

@@ -119,12 +131,50 @@ func (d *partitionDetectorImpl) Run(tom *tomb.Tomb) error {
119131
case <-ticker.C:
120132
d.detect()
121133
case <-tom.Dying():
134+
cancel()
135+
wg.Wait()
136+
122137
log.Info("Hotspot partition detector exited.")
123138
return nil
124139
}
125140
}
126141
}
127142

143+
func (d *partitionDetectorImpl) checkExpiration(ctx context.Context, wg *sync.WaitGroup) {
144+
defer wg.Done()
145+
146+
ticker := time.NewTicker(d.cfg.RetentionPeriod)
147+
defer ticker.Stop()
148+
149+
for {
150+
select {
151+
case <-ticker.C:
152+
d.retireExpiredTables()
153+
154+
case <-ctx.Done():
155+
log.Info("Expiration checker for hotspot exited.")
156+
return
157+
}
158+
}
159+
}
160+
161+
func (d *partitionDetectorImpl) retireExpiredTables() {
162+
currentTimestampSeconds := time.Now().Unix()
163+
164+
d.mtx.Lock()
165+
defer d.mtx.Unlock()
166+
167+
log.Info("check expired tables")
168+
169+
for key, analyzer := range d.analyzers {
170+
if !analyzer.isExpired(currentTimestampSeconds) {
171+
continue
172+
}
173+
174+
delete(d.analyzers, key)
175+
}
176+
}
177+
128178
func (d *partitionDetectorImpl) detect() {
129179
appMap, err := d.aggregate()
130180
if err != nil {
@@ -369,10 +419,8 @@ func calculateStats(
369419
}
370420

371421
// Only primary replica of a partition will be counted.
372-
// TODO(wangdan): support Equal() for base.HostPort.
373422
primary := stats.partitionConfigs[partitionID].HpPrimary
374-
if primary.GetHost() != node.HpNode.GetHost() ||
375-
primary.GetPort() != node.HpNode.GetPort() {
423+
if !node.HpNode.Equal(primary) {
376424
continue
377425
}
378426

@@ -439,6 +487,10 @@ func calculateHotspotStats(appMap appStatsMap) map[partitionAnalyzerKey][]hotspo
439487
func (d *partitionDetectorImpl) analyse(appMap appStatsMap) {
440488
hotspotMap := calculateHotspotStats(appMap)
441489

490+
nowTime := time.Now()
491+
expireTime := nowTime.Add(d.cfg.RetentionPeriod)
492+
expireTimestampSeconds := expireTime.Unix()
493+
442494
d.mtx.Lock()
443495
defer d.mtx.Unlock()
444496

@@ -455,7 +507,7 @@ func (d *partitionDetectorImpl) analyse(appMap appStatsMap) {
455507
d.analyzers[key] = analyzer
456508
}
457509

458-
analyzer.add(value)
510+
analyzer.add(value, expireTimestampSeconds)
459511

460512
// Perform the analysis asynchronously.
461513
go analyzer.analyse()
@@ -489,13 +541,26 @@ type partitionAnalyzer struct {
489541
appID int32
490542
partitionCount int32
491543
mtx sync.RWMutex
544+
expireTimestampSeconds int64
492545
samples deque.Deque[[]hotspotPartitionStats] // Each element is a sample of all partitions of the table
493546
}
494547

495-
func (a *partitionAnalyzer) add(sample []hotspotPartitionStats) {
548+
func (a *partitionAnalyzer) isExpired(currentTimestampSeconds int64) bool {
549+
a.mtx.RLock()
550+
defer a.mtx.RUnlock()
551+
552+
return currentTimestampSeconds >= a.expireTimestampSeconds
553+
}
554+
555+
func (a *partitionAnalyzer) add(
556+
sample []hotspotPartitionStats,
557+
expireTimestampSeconds int64,
558+
) {
496559
a.mtx.Lock()
497560
defer a.mtx.Unlock()
498561

562+
a.expireTimestampSeconds = expireTimestampSeconds
563+
499564
for a.samples.Len() >= a.maxSampleSize {
500565
a.samples.PopFront()
501566
}

0 commit comments

Comments
 (0)