From 6286e34af0276a15d7d015cfedbfddc6af410069 Mon Sep 17 00:00:00 2001 From: naiba Date: Tue, 20 Apr 2021 19:30:34 +0800 Subject: [PATCH] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20=E9=87=8D=E6=9E=84?= =?UTF-8?q?=E9=83=A8=E5=88=86=E4=BB=A3=E7=A0=81=EF=BC=8C=E6=89=93=E5=8D=B0?= =?UTF-8?q?=E8=B0=83=E8=AF=95=E4=BF=A1=E6=81=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cmd/agent/main.go | 14 +++--- cmd/dashboard/controller/oauth2.go | 2 - cmd/playground/main.go | 2 +- service/dao/alertsentinel.go | 4 +- service/dao/dao.go | 2 +- service/dao/servicesentinel.go | 79 ++++++++++++++++++++---------- service/rpc/nezha.go | 5 +- 7 files changed, 70 insertions(+), 38 deletions(-) diff --git a/cmd/agent/main.go b/cmd/agent/main.go index 2239204..e1fe27e 100644 --- a/cmd/agent/main.go +++ b/cmd/agent/main.go @@ -35,11 +35,10 @@ var ( ) var ( - reporting bool client pb.NezhaServiceClient ctx = context.Background() - delayWhenError = time.Second * 10 // Agent 重连间隔 - updateCh = make(chan struct{}, 0) // Agent 自动更新间隔 + delayWhenError = time.Second * 10 // Agent 重连间隔 + updateCh = make(chan struct{}) // Agent 自动更新间隔 httpClient = &http.Client{ Transport: &http.Transport{ TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, @@ -179,12 +178,14 @@ func doTask(task *pb.Task) { start := time.Now() resp, err := httpClient.Get(task.GetData()) if err == nil { - result.Delay = float32(time.Now().Sub(start).Microseconds()) / 1000.0 + // 检查 HTTP Response 状态 + result.Delay = float32(time.Since(start).Microseconds()) / 1000.0 if resp.StatusCode > 399 || resp.StatusCode < 200 { err = errors.New("\n应用错误:" + resp.Status) } } if err == nil { + // 检查 SSL 证书信息 if strings.HasPrefix(task.GetData(), "https://") { c := cert.NewCert(task.GetData()[8:]) if c.Error != "" { @@ -197,6 +198,7 @@ func doTask(task *pb.Task) { result.Successful = true } } else { + // HTTP 请求失败 result.Data = err.Error() } case model.TaskTypeICMPPing: @@ -219,7 +221,7 @@ func doTask(task *pb.Task) { if err == nil { conn.Write([]byte("ping\n")) conn.Close() - result.Delay = float32(time.Now().Sub(start).Microseconds()) / 1000.0 + result.Delay = float32(time.Since(start).Microseconds()) / 1000.0 result.Successful = true } else { result.Data = err.Error() @@ -260,7 +262,7 @@ func doTask(task *pb.Task) { result.Data = string(output) result.Successful = true } - result.Delay = float32(time.Now().Sub(startedAt).Seconds()) + result.Delay = float32(time.Since(startedAt).Seconds()) default: log.Printf("Unknown action: %v", task) } diff --git a/cmd/dashboard/controller/oauth2.go b/cmd/dashboard/controller/oauth2.go index 773df9c..ccac017 100644 --- a/cmd/dashboard/controller/oauth2.go +++ b/cmd/dashboard/controller/oauth2.go @@ -4,7 +4,6 @@ import ( "context" "errors" "fmt" - "log" "net/http" "strings" @@ -71,7 +70,6 @@ func (oa *oauth2controller) callback(c *gin.Context) { if err == nil { gu, _, err = client.Users.Get(ctx, "") } - log.Printf("%+v", gu) if err != nil { mygin.ShowErrorPage(c, mygin.ErrInfo{ Code: http.StatusBadRequest, diff --git a/cmd/playground/main.go b/cmd/playground/main.go index 81c80d4..1bb569e 100644 --- a/cmd/playground/main.go +++ b/cmd/playground/main.go @@ -34,7 +34,7 @@ func tcpping() { } conn.Write([]byte("ping\n")) conn.Close() - fmt.Println(time.Now().Sub(start).Microseconds(), float32(time.Now().Sub(start).Microseconds())/1000.0) + fmt.Println(time.Since(start).Microseconds(), float32(time.Since(start).Microseconds())/1000.0) } func sysinfo() { diff --git a/service/dao/alertsentinel.go b/service/dao/alertsentinel.go index 7b8e5f5..1ad0a77 100644 --- a/service/dao/alertsentinel.go +++ b/service/dao/alertsentinel.go @@ -43,7 +43,9 @@ func AlertSentinelStart() { checkStatus() checkCount++ if lastPrint.Before(startedAt.Add(-1 * time.Hour)) { - log.Println("报警规则检测每小时", checkCount, "次", startedAt, time.Now()) + if Conf.Debug { + log.Println("报警规则检测每小时", checkCount, "次", startedAt, time.Now()) + } checkCount = 0 lastPrint = startedAt } diff --git a/service/dao/dao.go b/service/dao/dao.go index dc4849f..149a20a 100644 --- a/service/dao/dao.go +++ b/service/dao/dao.go @@ -46,7 +46,7 @@ func ReSortServer() { sort.SliceStable(SortedServerList, func(i, j int) bool { if SortedServerList[i].DisplayIndex == SortedServerList[j].DisplayIndex { - return SortedServerList[i].ID < SortedServerList[i].ID + return SortedServerList[i].ID < SortedServerList[j].ID } return SortedServerList[i].DisplayIndex > SortedServerList[j].DisplayIndex }) diff --git a/service/dao/servicesentinel.go b/service/dao/servicesentinel.go index 8371135..ae1237d 100644 --- a/service/dao/servicesentinel.go +++ b/service/dao/servicesentinel.go @@ -15,7 +15,7 @@ var ServiceSentinelShared *ServiceSentinel func NewServiceSentinel() { ServiceSentinelShared = &ServiceSentinel{ - serviceResponseChannel: make(chan *pb.TaskResult, 200), + serviceResponseChannel: make(chan ReportData, 200), serviceResponseDataStoreTodaySavedIndex: make(map[uint64]int), serviceCurrentStatusIndex: make(map[uint64]int), serviceCurrentStatusData: make(map[uint64][]model.MonitorHistory), @@ -26,14 +26,35 @@ func NewServiceSentinel() { serviceResponseDataStoreCurrentDown: make(map[uint64]uint64), monitors: make(map[uint64]model.Monitor), serviceResponseDataStoreToday: make(map[uint64][]model.MonitorHistory), + sslCertCache: make(map[uint64]string), } ServiceSentinelShared.OnMonitorUpdate() + + year, month, day := time.Now().Date() + today := time.Date(year, month, day, 0, 0, 0, 0, time.Local) + var mhs []model.MonitorHistory + DB.Where("created_at >= ?", today).Find(&mhs) + + // 加载当日记录 + for i := 0; i < len(mhs); i++ { + ServiceSentinelShared.serviceResponseDataStoreToday[mhs[i].MonitorID] = + append(ServiceSentinelShared.serviceResponseDataStoreToday[mhs[i].MonitorID], mhs[i]) + } + + // 更新入库时间及当日数据入库游标 for k := range ServiceSentinelShared.monitors { ServiceSentinelShared.latestDate[k] = time.Now().Format("02-Jan-06") + ServiceSentinelShared.serviceResponseDataStoreTodaySavedIndex[k] = len(ServiceSentinelShared.serviceResponseDataStoreToday[k]) } + go ServiceSentinelShared.worker() } +type ReportData struct { + Data *pb.TaskResult + Reporter uint64 +} + /* 使用缓存 channel,处理上报的 Service 请求结果,然后判断是否需要报警 需要记录上一次的状态信息 @@ -41,7 +62,7 @@ func NewServiceSentinel() { type ServiceSentinel struct { serviceResponseDataStoreLock sync.RWMutex monitorsLock sync.RWMutex - serviceResponseChannel chan *pb.TaskResult + serviceResponseChannel chan ReportData serviceResponseDataStoreTodaySavedIndex map[uint64]int serviceCurrentStatusIndex map[uint64]int serviceCurrentStatusData map[uint64][]model.MonitorHistory @@ -52,9 +73,10 @@ type ServiceSentinel struct { serviceResponseDataStoreCurrentDown map[uint64]uint64 monitors map[uint64]model.Monitor serviceResponseDataStoreToday map[uint64][]model.MonitorHistory + sslCertCache map[uint64]string } -func (ss *ServiceSentinel) Dispatch(r *pb.TaskResult) { +func (ss *ServiceSentinel) Dispatch(r ReportData) { ss.serviceResponseChannel <- r } @@ -94,6 +116,7 @@ func (ss *ServiceSentinel) OnMonitorDelete(id uint64) { delete(ss.serviceResponseDataStoreCurrentUp, id) delete(ss.serviceResponseDataStoreCurrentDown, id) delete(ss.serviceResponseDataStoreToday, id) + delete(ss.sslCertCache, id) ss.monitorsLock.Lock() defer ss.monitorsLock.Unlock() delete(ss.monitors, id) @@ -186,10 +209,10 @@ func getStateStr(percent uint64) string { func (ss *ServiceSentinel) worker() { for r := range ss.serviceResponseChannel { - if ss.monitors[r.GetId()].ID == 0 { + if ss.monitors[r.Data.GetId()].ID == 0 { continue } - mh := model.PB2MonitorHistory(r) + mh := model.PB2MonitorHistory(r.Data) ss.serviceResponseDataStoreLock.Lock() // 先查看是否到下一天 nowDate := time.Now().Format("02-Jan-06") @@ -239,30 +262,34 @@ func (ss *ServiceSentinel) worker() { upPercent = ss.serviceResponseDataStoreCurrentUp[mh.MonitorID] * 100 / (ss.serviceResponseDataStoreCurrentDown[mh.MonitorID] + ss.serviceResponseDataStoreCurrentUp[mh.MonitorID]) } stateStr := getStateStr(upPercent) - log.Println(ss.monitors[mh.MonitorID].Target, stateStr) + if Conf.Debug { + log.Println(ss.monitors[mh.MonitorID].Target, stateStr, "Reporter:", r.Reporter, "Successful:", mh.Successful, "Data:", mh.Data) + } if stateStr == "故障" || stateStr != ss.lastStatus[mh.MonitorID] { ss.monitorsLock.RLock() isSendNotification := (ss.lastStatus[mh.MonitorID] != "" || stateStr == "故障") && ss.monitors[mh.MonitorID].Notify ss.lastStatus[mh.MonitorID] = stateStr if isSendNotification { - SendNotification(fmt.Sprintf("服务监控:%s 服务状态:%s", ss.monitors[mh.MonitorID].Name, stateStr), true) + go SendNotification(fmt.Sprintf("服务监控:%s 服务状态:%s", ss.monitors[mh.MonitorID].Name, stateStr), true) } ss.monitorsLock.RUnlock() } ss.serviceResponseDataStoreLock.Unlock() // SSL 证书报警 var errMsg string - if strings.HasPrefix(r.GetData(), "SSL证书错误:") { + if strings.HasPrefix(mh.Data, "SSL证书错误:") { // 排除 i/o timeont、connection timeout、EOF 错误 - if !strings.HasSuffix(r.GetData(), "timeout") && - !strings.HasSuffix(r.GetData(), "EOF") && - !strings.HasSuffix(r.GetData(), "timed out") { - errMsg = r.GetData() + if !strings.HasSuffix(mh.Data, "timeout") && + !strings.HasSuffix(mh.Data, "EOF") && + !strings.HasSuffix(mh.Data, "timed out") { + errMsg = mh.Data } } else { - var last model.MonitorHistory - var newCert = strings.Split(r.GetData(), "|") + var newCert = strings.Split(mh.Data, "|") if len(newCert) > 1 { + if ss.sslCertCache[mh.MonitorID] == "" { + ss.sslCertCache[mh.MonitorID] = mh.Data + } expiresNew, _ := time.Parse("2006-01-02 15:04:05 -0700 MST", newCert[1]) // 证书过期提醒 if expiresNew.Before(time.Now().AddDate(0, 0, 7)) { @@ -271,23 +298,23 @@ func (ss *ServiceSentinel) worker() { expiresNew.Format("2006-01-02 15:04:05")) } // 证书变更提醒 - if err := DB.Where("monitor_id = ? AND data LIKE ?", r.GetId(), "%|%").Order("id DESC").First(&last).Error; err == nil { - var oldCert = strings.Split(last.Data, "|") - var expiresOld time.Time - if len(oldCert) > 1 { - expiresOld, _ = time.Parse("2006-01-02 15:04:05 -0700 MST", oldCert[1]) - } - if last.Data != "" && oldCert[0] != newCert[0] && !expiresNew.Equal(expiresOld) { - errMsg = fmt.Sprintf( - "SSL证书变更,旧:%s, %s 过期;新:%s, %s 过期。", - oldCert[0], expiresOld.Format("2006-01-02 15:04:05"), newCert[0], expiresNew.Format("2006-01-02 15:04:05")) - } + var oldCert = strings.Split(ss.sslCertCache[mh.MonitorID], "|") + var expiresOld time.Time + if len(oldCert) > 1 { + expiresOld, _ = time.Parse("2006-01-02 15:04:05 -0700 MST", oldCert[1]) + } + if oldCert[0] != newCert[0] && !expiresNew.Equal(expiresOld) { + errMsg = fmt.Sprintf( + "SSL证书变更,旧:%s, %s 过期;新:%s, %s 过期。", + oldCert[0], expiresOld.Format("2006-01-02 15:04:05"), newCert[0], expiresNew.Format("2006-01-02 15:04:05")) } } } if errMsg != "" { ss.monitorsLock.RLock() - SendNotification(fmt.Sprintf("服务监控:%s %s", ss.monitors[mh.MonitorID].Name, errMsg), true) + if ss.monitors[mh.MonitorID].Notify { + go SendNotification(fmt.Sprintf("服务监控:%s %s", ss.monitors[mh.MonitorID].Name, errMsg), true) + } ss.monitorsLock.RUnlock() } } diff --git a/service/rpc/nezha.go b/service/rpc/nezha.go index a44991e..070062d 100644 --- a/service/rpc/nezha.go +++ b/service/rpc/nezha.go @@ -21,7 +21,10 @@ func (s *NezhaHandler) ReportTask(c context.Context, r *pb.TaskResult) (*pb.Rece return nil, err } if r.GetType() != model.TaskTypeCommand { - dao.ServiceSentinelShared.Dispatch(r) + dao.ServiceSentinelShared.Dispatch(dao.ReportData{ + Data: r, + Reporter: clientID, + }) } else { // 处理上报的计划任务 dao.CronLock.RLock()