diff --git a/README.md b/README.md index bc2c826..8dc0bf4 100644 --- a/README.md +++ b/README.md @@ -3,9 +3,10 @@ [![License](https://img.shields.io/badge/LICENSE-Apache2.0-ff69b4.svg)](http://www.apache.org/licenses/LICENSE-2.0.html) -distributed monitoring system +​ ​ ​ ​ ​ ​ OWL 是由国内领先的第三方数据智能服务商 [TalkingData](https://www.talkingdata.com/) 开源的一款企业级分布式监控告警系统,目前由 Tech Operation Team 持续开发更新维护。 + +      OWL 后台组件全部使用 [Go](https://golang.org/) 语言开发,Go 语言是 Google 开发的一种静态强类型、编译型、并发型,并具有垃圾回收功能的编程语言,它的并发机制可以充分利用多核,同平台一次编译可以到处运行,运维成本极低,更多的信息可以参考[官方文档](https://golang.org/doc/)。前端页面使用 [iView](https://www.iviewui.com/) 开发,iView 同样是由 TalkingData 开源的一套基于 Vue.js 的 UI 组件库,主要服务于 PC 界面的中后台产品。 -OWL是TalkingData公司推出的一款开源分布式监控系统 ## Features @@ -13,15 +14,38 @@ OWL是TalkingData公司推出的一款开源分布式监控系统 - 分布式,支持多机房 - 多维的数据模型,类opentsdb - 支持多种报警算法,支持多条件组合、时间范围、报警模板等 -- 灵活的插件机制,支持任意语言编写,支持传参,自动同步 -- 丰富的报警渠道,邮件、微信、短信、电话、自定义 -- 原始数据永久存储,支持发送到opentsdb、kairosdb、kafka -- 自带web管理界面以及强大的自定义图表功能 +- 灵活的插件机制,支持任意语言编写,支持传参,自动同步到客户端 +- 丰富的报警渠道,邮件、企业微信、短信、电话以及自定义脚本 +- 原始数据永久存储,支持发送到 opentsdb、kairosdb、kafka +- 自带 web 管理界面以及强大的自定义图表功能 ## Architecture ![owl](./arch.png) +## Components + +**agent**:安装在每台被监控机器上,用于采集监控数据 + +**netcollect**:通过 SNMP V2 采集网络设备的接口数据 + +**repeater**:接收 `agent` 发送过来的监控数据,并写入后端存储 + +**cfc**:维护客户端需要执行的插件列表,主机名、IP 地址更新以及采集到的指标列表 + +**controller**:从数据库加载告警策略,生成任务发送给 `inspector`,并且根据执行结果进行告警 + +**inspector**:从 `controller` 获取监控任务,根据 `tsdb` 中的数据进行计算,并将结果返回 `controller` + +**api**:对外提供 HTTP REST API 接口,web 页面就是通过它来获取数据 + +**MySQL**:所有配置信息的持久化存储,包含主机信息,告警策略,主机组,人员等 + +**TSDB**:时序数据库(time series database),用于存储采集到的监控数据 + +**frontend**:web 管理页面,可以方便的进行系统管理维护工作 + + ## Demo http://54.223.127.87/ diff --git a/api/data.go b/api/data.go index 8f77f44..af47ebf 100644 --- a/api/data.go +++ b/api/data.go @@ -35,7 +35,7 @@ func queryTimeSeriesData(c *gin.Context) { metric := c.Query("metric") tags := c.Query("tags") tagMap := types.ParseTags(tags) - if groupName, exist := tagMap["host_group"]; exist { + if groupNames, exist := tagMap["host_group"]; exist { 
productIDStr, ok := c.GetQuery("product_id") if !ok { response["code"] = http.StatusNotFound
@@ -49,14 +49,21 @@ func queryTimeSeriesData(c *gin.Context) {
 			return
 		}
 		delete(tagMap, "host_group")
-		hostSet := getHostnameTagsFromProductGroup(productID, groupName)
+		var hostSet []string
+		for _, groupName := range strings.Split(groupNames, "|") {
+			hostSet = append(hostSet, getHostnameTagsFromProductGroup(productID, groupName)...)
+		}
 		if len(hostSet) == 0 {
 			response["code"] = http.StatusBadRequest
-			response["message"] = groupName + " has no host"
+			response["message"] = "all group has no host"
 			return
 		}
-
-		tagMap["host"] = strings.Join(hostSet, "|")
+		hosts := strings.Join(hostSet, "|")
+		// If the request already carries a host tag, merge it with the group hosts.
+		if host, ok := tagMap["host"]; ok {
+			hosts = hosts + "|" + host
+		}
+		tagMap["host"] = hosts
 		tags = Tags2String(tagMap)
 	}
 
diff --git a/api/host_groups.go b/api/host_groups.go
index d1f624c..6184ccf 100644
--- a/api/host_groups.go
+++ b/api/host_groups.go
@@ -61,10 +61,15 @@ func listNotInProductHostGroupHosts(c *gin.Context) {
 func listProductHostGroups(c *gin.Context) {
 	response := gin.H{"code": http.StatusOK}
 	defer c.JSON(http.StatusOK, response)
+	var username string
+	if c.DefaultQuery("my", "false") == "true" {
+		username = c.GetString("username")
+	}
 	total, hostGroups := mydb.getProductHostGroups(
 		c.GetInt("product_id"),
 		c.GetBool("paging"),
 		c.GetString("query"),
+		username,
 		c.GetString("order"),
 		c.GetInt("offset"),
 		c.GetInt("limit"),
diff --git a/api/mysql.go b/api/mysql.go
index 333786e..654c68b 100644
--- a/api/mysql.go
+++ b/api/mysql.go
@@ -450,7 +450,7 @@ func (d *db) GetAlarmRecords(eventID int64, order, limit string) (records []*Ala
 // GetTriggersRecords 获取报警事件下的表达式组
 func (d *db) GetTriggersRecords(eventID int64, count int) []*TriggerEventRecord {
 	triggers := []*TriggerEventRecord{}
-	rawSQL := "SELECT * FROM trigger_event_record WHERE strategy_event_id = ? AND count = ?"
+	rawSQL := "SELECT * FROM trigger_event_record WHERE strategy_event_id = ? AND count = ? AND triggered=TRUE"
 	if err := d.Select(&triggers, rawSQL, eventID, count); err != nil {
 		log.Println(err)
 		return nil
@@ -1551,13 +1551,14 @@ func (d *db) removeHostsFromProduct(productID int, ids []string) (err error) {
 }
 
 //获取产品线下的主机组
-func (d *db) getProductHostGroups(productID int, paging bool, query string, order string, offset, limit int) (int, []WarpHostGroup) {
+func (d *db) getProductHostGroups(productID int, paging bool, query string, user string, order string, offset, limit int) (int, []WarpHostGroup) {
 	var (
 		groups = make([]WarpHostGroup, 0)
 		err    error
 		cnt    int
+		rawSQL string
 	)
-	rawSQL := fmt.Sprintf("select hg.id, hg.name, hg.description, hg.creator, DATE_FORMAT(hg.create_at,'%s') as create_at,"+
+	rawSQL = fmt.Sprintf("select hg.id, hg.name, hg.description, hg.creator, DATE_FORMAT(hg.create_at,'%s') as create_at,"+
 		"DATE_FORMAT(hg.update_at,'%s') as update_at, count(distinct host_group_plugin.id) as plugin_cnt, "+
 		"count(distinct host_group_host.id) as host_cnt, count(distinct strategy_group.id) as strategy_cnt "+
 		" from host_group as hg left join host_group_plugin on hg.id = host_group_plugin.group_id left join host_group_host "+
@@ -1565,6 +1566,12 @@ func (d *db) getProductHostGroups(productID int, paging bool, query string, orde
 		" where hg.product_id=%d", dbDateFormat, dbDateFormat, productID)
 
 	cntSQL := fmt.Sprintf("select count(*) from host_group where product_id = %d", productID)
+	if len(user) > 0 {
+		// NOTE(review): user is spliced into the SQL text unescaped; switch to a
+		// bound parameter if it can ever carry untrusted input.
+		rawSQL = fmt.Sprintf("%s and hg.creator='%s'", rawSQL, user)
+		cntSQL = fmt.Sprintf("%s and creator='%s'", cntSQL, user)
+	}
 	if len(query) > 0 {
 		rawSQL = fmt.Sprintf("%s and hg.name like '%%%s%%'", rawSQL, query)
 		cntSQL = fmt.Sprintf("%s and name like '%%%s%%'", cntSQL, query)
diff --git a/client/builtin/fd.go b/client/builtin/fd.go
index 6093a61..f84ecd5 100644
--- a/client/builtin/fd.go
+++ b/client/builtin/fd.go
@@ -30,6 +30,7 @@ func fdMetrics(cycle int) []*types.TimeSeriesData {
 	if err != nil {
 		return nil
 	}
+	defer fd.Close()
 	ts := time.Now().Unix()
 	r := bufio.NewReader(fd)
 	line, err := r.ReadString('\n')
diff --git a/common/chanMonitor/monitor.go b/common/chanMonitor/monitor.go
new file mode 100755
index 0000000..b57f1b5
--- /dev/null
+++ b/common/chanMonitor/monitor.go
@@ -0,0 +1,94 @@
+// Package chanMonitor keeps a registry of named channels and reports their
+// current length and capacity.
+package chanMonitor
+
+import (
+	"fmt"
+	"reflect"
+	"sync"
+)
+
+var (
+	// chans maps a (name, instance) key to the registered channel value.
+	chans = make(map[key]interface{})
+	// chmu guards chans.
+	chmu sync.RWMutex
+)
+
+// AddNamed registers channel for monitoring under name and, optionally,
+// the instance of this named channel (there may be many).
+//
+// It returns an error when channel is nil or not a chan, or when the same
+// name/instance pair is already registered.
+func AddNamed(name, instance string, channel interface{}) error {
+	// reflect.TypeOf returns nil for a nil interface value, and calling Kind
+	// on that nil Type panics, so check for it first.
+	if t := reflect.TypeOf(channel); t == nil || t.Kind() != reflect.Chan {
+		return fmt.Errorf("invalid input type %T for input param channel, must be of type chan", channel)
+	}
+
+	chmu.Lock()
+	defer chmu.Unlock()
+
+	k := key{name: name, instance: instance}
+	if _, found := chans[k]; found {
+		return fmt.Errorf("channel with name: %s already being monitored.", name)
+	}
+	chans[k] = channel
+
+	return nil
+}
+
+// ChanState holds the length and capacity of a monitored channel together
+// with the instance it was registered under.
+type ChanState struct {
+	Len      int    `json:"length"`
+	Cap      int    `json:"capacity"`
+	Instance string `json:"instance"`
+}
+
+// key identifies a monitored channel by name and instance.
+type key struct {
+	name     string
+	instance string
+}
+
+// Get returns the state of the channel registered under name and instance,
+// or nil when no such channel is registered.
+func Get(name, instance string) *ChanState {
+	chmu.RLock()
+	defer chmu.RUnlock()
+
+	k := key{name: name, instance: instance}
+	ch, found := chans[k]
+	if !found {
+		return nil
+	}
+	return stateOf(k, ch)
+}
+
+// GetAll returns the states of all monitored channels, keyed by channel
+// name. Channels registered under the same name but different instances
+// share one map entry, so only one of them is reported.
+func GetAll() map[string]*ChanState {
+	chmu.RLock()
+	defer chmu.RUnlock()
+
+	results := make(map[string]*ChanState, len(chans))
+	// Build each state directly instead of calling Get: Get takes the read
+	// lock again, and sync.RWMutex prohibits recursive read locking.
+	for k, ch := range chans {
+		results[k.name] = stateOf(k, ch)
+	}
+	return results
+}
+
+// stateOf snapshots the length and capacity of ch. The caller must hold chmu.
+func stateOf(k key, ch interface{}) *ChanState {
+	v := reflect.ValueOf(ch)
+	return &ChanState{
+		Len:      v.Len(),
+		Cap:      v.Cap(),
+		Instance: k.instance,
+	}
+}
diff --git a/common/chanMonitor/service.go b/common/chanMonitor/service.go
new file mode 100755
index 0000000..100ee66
--- /dev/null
+++ b/common/chanMonitor/service.go
@@ -0,0 +1,71 @@
+package chanMonitor
+
+import (
+	"encoding/json"
+	"log"
+	"net/http"
+)
+
+// Service exposes the states of the monitored channels over HTTP.
+type Service struct {
+	url  string
+	name string
+}
+
+// New returns a Service that reports itself as serviceName and listens on
+// url, which is handed to http.ListenAndServe as the listen address.
+func New(serviceName string, url string) *Service {
+	return &Service{
+		url:  url,
+		name: serviceName,
+	}
+}
+
+// Start registers the /channels handler on the default serve mux and runs
+// the HTTP server in a background goroutine. That goroutine panics if the
+// server returns an error.
+func (s *Service) Start() {
+	http.HandleFunc("/channels", s.chanHandler)
+	go func() {
+		if err := s.start(); err != nil {
+			panic(err)
+		}
+	}()
+}
+
+// start serves the default mux on s.url and returns when the server fails.
+func (s *Service) start() error {
+	return http.ListenAndServe(s.url, nil)
+}
+
+// chanHandler writes the states of all monitored channels as JSON.
+func (s *Service) chanHandler(w http.ResponseWriter, r *http.Request) {
+	resp := &ServiceChannelsStatus{
+		Service:  s.name,
+		Channels: GetAll(),
+	}
+
+	jsonResp, err := json.Marshal(resp)
+	if err != nil {
+		log.Printf("Error: %#v", err)
+		w.WriteHeader(http.StatusInternalServerError)
+		// Stop here: falling through would write a second, empty body and
+		// set the Content-Type header after the status was already sent.
+		return
+	}
+
+	w.Header().Add("Content-Type", "application/json")
+	w.Write(jsonResp)
+}
+
+// ServiceChannelsStatus is the JSON document served by chanHandler.
+type ServiceChannelsStatus struct {
+	Service  string                `json:"service"`
+	Channels map[string]*ChanState `json:"channels"`
+}
+
+// Config carries a service name and URL. It is not referenced by the code in this file.
+type Config struct {
+	Name string
+	Url  string
+}
diff --git a/inspector/main.go b/inspector/main.go
index 35b33cc..792be23 100644
--- a/inspector/main.go
+++ b/inspector/main.go
@@ -6,6 +6,7 @@ package main
 import (
 	"fmt"
 	"os"
+	chm "owl/common/chanMonitor"
 	"path/filepath"
 	"runtime"
 )
@@ -33,5 +34,13 @@ func main() {
 		fmt.Println("failed to init inspector:", err)
 		return
 	}
+
+	if err := chm.AddNamed("inspector.resultPool.results", "owl-inspector", inspector.resultPool.results); err != nil {
+		fmt.Println("failed to monitor channel:", err)
+	}
+	if err := chm.AddNamed("inspector.taskPool.tasks", "owl-inspector", inspector.taskPool.tasks); err != nil {
+		fmt.Println("failed to monitor channel:", err)
+	}
+	chm.New("owl-inspector", ":20001").Start()
 	select {}
 }