Skip to content

Commit 6062ca0

Browse files
authored
Merge pull request coroot#621 from coroot/incidents_reworked
Incident data materialization in database
2 parents 84f97a8 + 63593c5 commit 6062ca0

32 files changed

Lines changed: 1027 additions & 548 deletions

api/api.go

Lines changed: 53 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88
"net/http"
99
"slices"
1010
"sort"
11+
"strconv"
1112
"time"
1213

1314
"github.com/coroot/coroot/api/forms"
@@ -364,7 +365,7 @@ func (api *Api) Overview(w http.ResponseWriter, r *http.Request, u *db.User) {
364365
return
365366
}
366367
var ch *clickhouse.Client
367-
if ch, err = api.getClickhouseClient(project); err != nil {
368+
if ch, err = api.GetClickhouseClient(project); err != nil {
368369
klog.Warningln(err)
369370
}
370371
auditor.Audit(world, project, nil, project.ClickHouseConfig(api.globalClickHouse) != nil, nil)
@@ -929,6 +930,49 @@ func (api *Api) RCA(w http.ResponseWriter, r *http.Request, u *db.User) {
929930
utils.WriteJson(w, api.WithContext(project, cacheStatus, world, "not implemented"))
930931
}
931932

933+
func (api *Api) Incidents(w http.ResponseWriter, r *http.Request, u *db.User) {
934+
vars := mux.Vars(r)
935+
projectId := db.ProjectId(vars["project"])
936+
limit := 100
937+
if l := r.URL.Query().Get("limit"); l != "" {
938+
l64, err := strconv.ParseUint(l, 10, 32)
939+
if err != nil {
940+
klog.Warningln("invalid limit:", l)
941+
http.Error(w, "", http.StatusBadRequest)
942+
return
943+
}
944+
limit = int(l64)
945+
}
946+
project, err := api.db.GetProject(projectId)
947+
if err != nil {
948+
if errors.Is(err, db.ErrNotFound) {
949+
http.Error(w, "project not found", http.StatusNotFound)
950+
klog.Warningln("project not found:", projectId)
951+
return
952+
}
953+
klog.Errorln(err)
954+
http.Error(w, "", http.StatusInternalServerError)
955+
return
956+
}
957+
incidents, err := api.db.GetLatestIncidents(project.Id, limit)
958+
if err != nil {
959+
klog.Errorln(err)
960+
http.Error(w, "", http.StatusInternalServerError)
961+
return
962+
}
963+
world, project, cacheStatus, err := api.LoadWorldByRequest(r)
964+
if err != nil {
965+
klog.Errorln(err)
966+
http.Error(w, "", http.StatusInternalServerError)
967+
return
968+
}
969+
if project == nil || world == nil {
970+
utils.WriteJson(w, api.WithContext(project, cacheStatus, world, nil))
971+
return
972+
}
973+
utils.WriteJson(w, api.WithContext(project, cacheStatus, world, views.Incidents(world, incidents)))
974+
}
975+
932976
func (api *Api) Incident(w http.ResponseWriter, r *http.Request, u *db.User) {
933977
vars := mux.Vars(r)
934978
projectId := vars["project"]
@@ -1239,7 +1283,7 @@ func (api *Api) Profiling(w http.ResponseWriter, r *http.Request, u *db.User) {
12391283
return
12401284
}
12411285
var ch *clickhouse.Client
1242-
if ch, err = api.getClickhouseClient(project); err != nil {
1286+
if ch, err = api.GetClickhouseClient(project); err != nil {
12431287
klog.Warningln(err)
12441288
http.Error(w, "ClickHouse is not available", http.StatusInternalServerError)
12451289
return
@@ -1296,7 +1340,7 @@ func (api *Api) Tracing(w http.ResponseWriter, r *http.Request, u *db.User) {
12961340
}
12971341
q := r.URL.Query()
12981342
var ch *clickhouse.Client
1299-
if ch, err = api.getClickhouseClient(project); err != nil {
1343+
if ch, err = api.GetClickhouseClient(project); err != nil {
13001344
klog.Warningln(err)
13011345
http.Error(w, "ClickHouse is not available", http.StatusInternalServerError)
13021346
return
@@ -1350,7 +1394,7 @@ func (api *Api) Logs(w http.ResponseWriter, r *http.Request, u *db.User) {
13501394
http.Error(w, "Application not found", http.StatusNotFound)
13511395
return
13521396
}
1353-
ch, chErr := api.getClickhouseClient(project)
1397+
ch, chErr := api.GetClickhouseClient(project)
13541398
if chErr != nil {
13551399
klog.Warningln(chErr)
13561400
}
@@ -1535,12 +1579,11 @@ func (api *Api) getTimeContext(r *http.Request) (from timeseries.Time, to timese
15351579
if incident, err := api.db.GetIncidentByKey(projectId, incidentKey); err != nil {
15361580
klog.Warningln("failed to get incident:", err)
15371581
} else {
1538-
margin := model.MaxAlertRuleShortWindow + 15*timeseries.Minute
1539-
from = incident.OpenedAt.Add(-margin)
1582+
from = incident.OpenedAt.Add(-model.IncidentTimeOffset)
15401583
if incident.Resolved() {
1541-
if t := incident.ResolvedAt.Add(margin); t.Before(to) {
1542-
to = t
1543-
}
1584+
to = incident.ResolvedAt.Add(model.IncidentTimeOffset)
1585+
} else {
1586+
to = now
15441587
}
15451588
}
15461589
}
@@ -1571,7 +1614,7 @@ func maxDuration(d1, d2 timeseries.Duration) timeseries.Duration {
15711614
return d2
15721615
}
15731616

1574-
func (api *Api) getClickhouseClient(project *db.Project) (*clickhouse.Client, error) {
1617+
func (api *Api) GetClickhouseClient(project *db.Project) (*clickhouse.Client, error) {
15751618
cfg := project.ClickHouseConfig(api.globalClickHouse)
15761619
if cfg == nil {
15771620
return nil, nil

api/ctx.go

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,9 @@ type DataWithContext struct {
1515
}
1616

1717
type Context struct {
18-
Status Status `json:"status"`
19-
Search Search `json:"search"`
18+
Status Status `json:"status"`
19+
Search Search `json:"search"`
20+
Incidents map[model.ApplicationCategory]int `json:"incidents"`
2021
}
2122

2223
type Status struct {
@@ -62,13 +63,27 @@ type Node struct {
6263
func (api *Api) WithContext(p *db.Project, cacheStatus *cache.Status, w *model.World, data any) DataWithContext {
6364
return DataWithContext{
6465
Context: Context{
65-
Status: renderStatus(p, cacheStatus, w, api.globalPrometheus),
66-
Search: renderSearch(w),
66+
Status: renderStatus(p, cacheStatus, w, api.globalPrometheus),
67+
Search: renderSearch(w),
68+
Incidents: renderIncidents(w),
6769
},
6870
Data: data,
6971
}
7072
}
7173

74+
func renderIncidents(w *model.World) map[model.ApplicationCategory]int {
75+
res := map[model.ApplicationCategory]int{}
76+
for _, app := range w.Applications {
77+
if len(app.Incidents) == 0 {
78+
continue
79+
}
80+
if last := app.Incidents[len(app.Incidents)-1]; !last.Resolved() {
81+
res[app.Category]++
82+
}
83+
}
84+
return res
85+
}
86+
7287
func renderStatus(p *db.Project, cacheStatus *cache.Status, w *model.World, globalPrometheus *db.IntegrationPrometheus) Status {
7388
res := Status{
7489
Status: model.OK,

api/forms/forms.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -670,11 +670,11 @@ func testIncidentNotification(project *db.Project) *db.IncidentNotification {
670670
ProjectId: project.Id,
671671
ApplicationId: model.NewApplicationId("default", model.ApplicationKindDeployment, "fake-app"),
672672
IncidentKey: "123ab456",
673-
Status: model.WARNING,
673+
Status: model.CRITICAL,
674674
Details: &db.IncidentNotificationDetails{
675675
Reports: []db.IncidentNotificationDetailsReport{
676-
{Name: model.AuditReportSLO, Check: model.Checks.SLOLatency.Title, Message: "error budget burn rate is 20x within 1 hour"},
677676
{Name: model.AuditReportNetwork, Check: model.Checks.NetworkRTT.Title, Message: "high network latency to 2 upstream services"},
677+
{Name: model.AuditReportLogs, Check: model.Checks.LogErrors.Title, Message: "1206 errors occurred"},
678678
},
679679
},
680680
}

api/views/incident/incident.go

Lines changed: 117 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -2,36 +2,133 @@ package incident
22

33
import (
44
"github.com/coroot/coroot/model"
5+
"github.com/coroot/coroot/timeseries"
6+
"github.com/coroot/coroot/utils"
57
)
68

9+
type Incident struct {
10+
model.ApplicationIncident
11+
Impact float32 `json:"impact"`
12+
ShortDescription string `json:"short_description"`
13+
ApplicationCategory model.ApplicationCategory `json:"application_category"`
14+
Duration timeseries.Duration `json:"duration"`
15+
}
16+
17+
func RenderList(w *model.World, incidents []*model.ApplicationIncident) []Incident {
18+
res := make([]Incident, 0, len(incidents))
19+
20+
for _, i := range incidents {
21+
res = append(res, renderIncident(w, i))
22+
}
23+
return res
24+
}
25+
26+
func renderIncident(w *model.World, i *model.ApplicationIncident) Incident {
27+
category := model.ApplicationCategoryApplication
28+
if app := w.GetApplication(i.ApplicationId); app != nil {
29+
category = app.Category
30+
}
31+
to := timeseries.Now()
32+
if i.Resolved() {
33+
to = i.ResolvedAt
34+
}
35+
return Incident{
36+
ApplicationIncident: *i,
37+
ShortDescription: i.ShortDescription(),
38+
ApplicationCategory: category,
39+
Duration: to.Sub(i.OpenedAt),
40+
Impact: max(
41+
i.Details.LatencyImpact.AffectedRequestPercentage,
42+
i.Details.AvailabilityImpact.AffectedRequestPercentage,
43+
),
44+
}
45+
}
46+
47+
// SLODetails describes one service level objective of the application as
// shown on the incident page.
type SLODetails struct {
	// Objective is the human-readable statement of the objective.
	Objective string `json:"objective"`

	// Compliance is the formatted compliance percentage; Render starts it at
	// "100%" and lowers it when a violating burn rate is found.
	Compliance string `json:"compliance"`

	// Violated reports whether a burn rate with a severity above model.OK
	// was found for this objective.
	Violated bool `json:"violated"`

	// Threshold is the latency objective bucket; Render sets it only for the
	// latency SLO.
	Threshold float32 `json:"threshold"`
}
53+
754
type View struct {
8-
Summary
9-
HeatMap *model.Widget `json:"heatmap"`
55+
Incident
56+
AvailabilitySLO *SLODetails `json:"availability_slo,omitempty"`
57+
LatencySLO *SLODetails `json:"latency_slo,omitempty"`
58+
ActualFrom timeseries.Time `json:"actual_from"`
59+
ActualTo timeseries.Time `json:"actual_to"`
60+
61+
Widgets []*model.Widget `json:"widgets"`
1062
}
1163

1264
func Render(w *model.World, app *model.Application, incident *model.ApplicationIncident) *View {
13-
v := &View{Summary: CalcSummary(w, app, incident), HeatMap: getHeatMap(app)}
14-
if v.HeatMap == nil {
15-
return nil
65+
to := timeseries.Now()
66+
if incident.Resolved() {
67+
to = incident.ResolvedAt
68+
}
69+
v := &View{
70+
Incident: renderIncident(w, incident),
71+
ActualTo: to,
72+
Widgets: incidentWidgets(w, app),
73+
}
74+
if len(app.AvailabilitySLIs) > 0 {
75+
sli := app.AvailabilitySLIs[0]
76+
v.AvailabilitySLO = &SLODetails{
77+
Objective: utils.FormatPercentage(sli.Config.ObjectivePercentage) + " of requests should not fail",
78+
Compliance: "100%",
79+
}
80+
for _, br := range incident.Details.AvailabilityBurnRates {
81+
if br.Severity > model.OK {
82+
if t := incident.OpenedAt.Add(-br.ShortWindow); v.ActualFrom.IsZero() || t.After(v.ActualFrom) {
83+
v.ActualFrom = t
84+
}
85+
v.AvailabilitySLO.Violated = true
86+
v.AvailabilitySLO.Compliance = utils.FormatPercentage(100 - br.LongWindowPercentage)
87+
break
88+
}
89+
}
1690
}
17-
v.HeatMap.AddAnnotation(model.Annotation{Name: "incident", X1: v.ActualFrom, X2: v.ActualTo})
18-
return v
19-
}
2091

21-
func getHeatMap(app *model.Application) *model.Widget {
22-
var sloReport *model.AuditReport
23-
for _, r := range app.Reports {
24-
if r.Name == model.AuditReportSLO {
25-
sloReport = r
92+
if len(app.LatencySLIs) > 0 {
93+
sli := app.LatencySLIs[0]
94+
v.LatencySLO = &SLODetails{
95+
Objective: utils.FormatPercentage(sli.Config.ObjectivePercentage) + " of requests should be served faster than " + utils.FormatLatency(sli.Config.ObjectiveBucket),
96+
Compliance: "100%",
97+
Threshold: sli.Config.ObjectiveBucket,
98+
}
99+
for _, br := range incident.Details.LatencyBurnRates {
100+
if v.ActualFrom.IsZero() {
101+
v.ActualFrom = incident.OpenedAt.Add(-br.ShortWindow)
102+
}
103+
if br.Severity > model.OK {
104+
if t := incident.OpenedAt.Add(-br.ShortWindow); v.ActualFrom.IsZero() || t.After(v.ActualFrom) {
105+
v.ActualFrom = t
106+
}
107+
v.LatencySLO.Violated = true
108+
v.LatencySLO.Compliance = utils.FormatPercentage(100 - br.LongWindowPercentage)
109+
break
110+
}
26111
}
27112
}
28-
if sloReport == nil {
29-
return nil
113+
for _, widget := range v.Widgets {
114+
widget.AddAnnotation(model.Annotation{Name: "incident", X1: v.ActualFrom, X2: to})
30115
}
31-
for _, w := range sloReport.Widgets {
32-
if w.Heatmap != nil {
33-
return w
34-
}
116+
return v
117+
}
118+
119+
func incidentWidgets(w *model.World, app *model.Application) []*model.Widget {
120+
var res []*model.Widget
121+
if len(app.LatencySLIs) > 0 {
122+
ch := model.NewChart(w.Ctx, "Latency, seconds").
123+
PercentilesFrom(app.LatencySLIs[0].Histogram, 0.25, 0.5, 0.75, 0.95, 0.99)
124+
res = append(res, &model.Widget{Chart: ch})
125+
}
126+
if len(app.AvailabilitySLIs) > 0 {
127+
res = append(res, &model.Widget{
128+
Chart: model.NewChart(w.Ctx, "Errors, per second").
129+
AddSeries("errors", app.AvailabilitySLIs[0].FailedRequests.Map(timeseries.NanToZero), "black").
130+
Stacked(),
131+
})
35132
}
36-
return nil
133+
return res
37134
}

0 commit comments

Comments
 (0)