Skip to content

Commit c169b4b

Browse files
darkkdiscordianfish
authored and committed Sep 19, 2017
Add metrics from SNTPv4 packet to ntp collector & add ntpd sanity check (prometheus#655)
* Add metrics from SNTPv4 packet to ntp collector & add ntpd sanity check 1. Checking local clock against remote NTP daemon is a bad idea, local ntpd acting as a client should do it better and avoid excessive load on remote NTP server so the collector is refactored to query local NTP server. 2. Checking local clock against remote one does not check local ntpd itself. Local ntpd may be down or out of sync due to network issues, but clock will be OK. 3. Checking NTP server using sanity of its response is tricky and depends on ntpd implementation, that's why common `node_ntp_sanity` variable is exported. * `govendor add golang.org/x/net/ipv4`, it is a dependency of github.com/beevik/ntp * Update github.com/beevik/ntp to include boring SNTP fix * Use variable name from RFC5905 * ntp: move code to make export of raw metrics more explicit * Move NTP math to `github.com/beevik/ntp` * Make `golint` happy * Add some brief docs explaining `ntp` prometheus#655 and `timex` prometheus#664 modules * ntp: drop XXX comment that got its decision * ntp: add `_seconds` suffix to relevant metrics * Better `node_ntp_leap` comment * s/node_ntp_reftime/node_ntp_reference_timestamp_seconds/ as requested by @discordianfish * Extract subsystem name to const as suggested by @SuperQ
1 parent b0d5c00 commit c169b4b

File tree

169 files changed

+10650
-64
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

169 files changed

+10650
-64
lines changed
 

‎README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ logind | Exposes session counts from [logind](http://www.freedesktop.org/wiki/So
6464
meminfo\_numa | Exposes memory statistics from `/proc/meminfo_numa`. | Linux
6565
mountstats | Exposes filesystem statistics from `/proc/self/mountstats`. Exposes detailed NFS client statistics. | Linux
6666
nfs | Exposes NFS client statistics from `/proc/net/rpc/nfs`. This is the same information as `nfsstat -c`. | Linux
67+
ntp | Exposes local NTP daemon health to check [time](./docs/TIME.md) | _any_
6768
qdisc | Exposes [queuing discipline](https://en.wikipedia.org/wiki/Network_scheduler#Linux_kernel) statistics | Linux
6869
runit | Exposes service status from [runit](http://smarden.org/runit/). | _any_
6970
supervisord | Exposes service status from [supervisord](http://supervisord.org/). | _any_
@@ -78,7 +79,6 @@ Name | Description | OS
7879
---------|-------------|----
7980
gmond | Exposes statistics from Ganglia. | _any_
8081
megacli | Exposes RAID statistics from MegaCLI. | Linux
81-
ntp | Exposes time drift from an NTP server. | _any_
8282

8383
### Textfile Collector
8484

‎collector/ntp.go

+106-22
Original file line numberDiff line numberDiff line change
@@ -17,62 +17,146 @@ package collector
1717

1818
import (
1919
"fmt"
20+
"net"
21+
"time"
2022

2123
"github.com/beevik/ntp"
2224
"github.com/prometheus/client_golang/prometheus"
23-
"github.com/prometheus/common/log"
2425
"gopkg.in/alecthomas/kingpin.v2"
2526
)
2627

28+
const (
29+
hour24 = 24 * time.Hour // `time` does not export `Day` as Day != 24h because of DST
30+
ntpSubsystem = "ntp"
31+
)
32+
2733
var (
28-
ntpServer = kingpin.Flag("collector.ntp.server", "NTP server to use for ntp collector.").Default("").String()
34+
ntpServer = kingpin.Flag("collector.ntp.server", "NTP server to use for ntp collector").Default("127.0.0.1").String()
2935
ntpProtocolVersion = kingpin.Flag("collector.ntp.protocol-version", "NTP protocol version").Default("4").Int()
36+
ntpServerIsLocal = kingpin.Flag("collector.ntp.server-is-local", "Certify that collector.ntp.server address is the same local host as this collector.").Default("false").Bool()
37+
ntpIPTTL = kingpin.Flag("collector.ntp.ip-ttl", "IP TTL to use while sending NTP query").Default("1").Int()
38+
// 3.46608s ~ 1.5s + PHI * (1 << maxPoll), where 1.5s is MAXDIST from ntp.org, it is 1.0 in RFC5905
39+
// max-distance option is used as-is without phi*(1<<poll)
40+
ntpMaxDistance = kingpin.Flag("collector.ntp.max-distance", "Max accumulated distance to the root").Default("3.46608s").Duration()
41+
ntpOffsetTolerance = kingpin.Flag("collector.ntp.local-offset-tolerance", "Offset between local clock and local ntpd time to tolerate").Default("1ms").Duration()
42+
43+
leapMidnight time.Time
3044
)
3145

3246
type ntpCollector struct {
33-
drift, stratum typedDesc
47+
stratum, leap, rtt, offset, reftime, rootDelay, rootDispersion, sanity typedDesc
3448
}
3549

3650
func init() {
3751
Factories["ntp"] = NewNtpCollector
3852
}
3953

40-
// NewNtpCollector returns a new Collector exposing the offset between ntp and
41-
// the current system time.
54+
// NewNtpCollector returns a new Collector exposing sanity of local NTP server.
55+
// Default definition of "local" is:
56+
// - collector.ntp.server address is a loopback address (or collector.ntp.server-is-local flag is turned on)
57+
// - the server is reachable with outgoing IP_TTL = 1
4258
func NewNtpCollector() (Collector, error) {
43-
warnDeprecated("ntp")
44-
if *ntpServer == "" {
45-
return nil, fmt.Errorf("no NTP server specified, see -collector.ntp.server")
59+
ipaddr := net.ParseIP(*ntpServer)
60+
if !*ntpServerIsLocal && (ipaddr == nil || !ipaddr.IsLoopback()) {
61+
return nil, fmt.Errorf("only IP address of local NTP server is valid for -collector.ntp.server")
4662
}
63+
4764
if *ntpProtocolVersion < 2 || *ntpProtocolVersion > 4 {
4865
return nil, fmt.Errorf("invalid NTP protocol version %d; must be 2, 3, or 4", *ntpProtocolVersion)
4966
}
5067

68+
if *ntpOffsetTolerance < 0 {
69+
return nil, fmt.Errorf("Offset tolerance must be non-negative")
70+
}
71+
5172
return &ntpCollector{
52-
drift: typedDesc{prometheus.NewDesc(
53-
prometheus.BuildFQName(Namespace, "ntp", "drift_seconds"),
54-
"Time between system time and ntp time.",
73+
stratum: typedDesc{prometheus.NewDesc(
74+
prometheus.BuildFQName(Namespace, ntpSubsystem, "stratum"),
75+
"NTPD stratum.",
5576
nil, nil,
5677
), prometheus.GaugeValue},
57-
stratum: typedDesc{prometheus.NewDesc(
58-
prometheus.BuildFQName(Namespace, "ntp", "stratum"),
59-
"NTP server stratum.",
78+
leap: typedDesc{prometheus.NewDesc(
79+
prometheus.BuildFQName(Namespace, ntpSubsystem, "leap"),
80+
"NTPD leap second indicator, 2 bits.",
81+
nil, nil,
82+
), prometheus.GaugeValue},
83+
rtt: typedDesc{prometheus.NewDesc(
84+
prometheus.BuildFQName(Namespace, ntpSubsystem, "rtt_seconds"),
85+
"RTT to NTPD.",
86+
nil, nil,
87+
), prometheus.GaugeValue},
88+
offset: typedDesc{prometheus.NewDesc(
89+
prometheus.BuildFQName(Namespace, ntpSubsystem, "offset_seconds"),
90+
"ClockOffset between NTP and local clock.",
91+
nil, nil,
92+
), prometheus.GaugeValue},
93+
reftime: typedDesc{prometheus.NewDesc(
94+
prometheus.BuildFQName(Namespace, ntpSubsystem, "reference_timestamp_seconds"),
95+
"NTPD ReferenceTime, UNIX timestamp.",
96+
nil, nil,
97+
), prometheus.GaugeValue},
98+
rootDelay: typedDesc{prometheus.NewDesc(
99+
prometheus.BuildFQName(Namespace, ntpSubsystem, "root_delay_seconds"),
100+
"NTPD RootDelay.",
101+
nil, nil,
102+
), prometheus.GaugeValue},
103+
rootDispersion: typedDesc{prometheus.NewDesc(
104+
prometheus.BuildFQName(Namespace, ntpSubsystem, "root_dispersion_seconds"),
105+
"NTPD RootDispersion.",
106+
nil, nil,
107+
), prometheus.GaugeValue},
108+
sanity: typedDesc{prometheus.NewDesc(
109+
prometheus.BuildFQName(Namespace, ntpSubsystem, "sanity"),
110+
"NTPD sanity according to RFC5905 heuristics and configured limits.",
60111
nil, nil,
61112
), prometheus.GaugeValue},
62113
}, nil
63114
}
64115

65116
func (c *ntpCollector) Update(ch chan<- prometheus.Metric) error {
66-
resp, err := ntp.Query(*ntpServer, *ntpProtocolVersion)
117+
resp, err := ntp.QueryWithOptions(*ntpServer, ntp.QueryOptions{
118+
Version: *ntpProtocolVersion,
119+
TTL: *ntpIPTTL,
120+
Timeout: time.Second, // default `ntpdate` timeout
121+
})
67122
if err != nil {
68-
return fmt.Errorf("couldn't get NTP drift: %s", err)
123+
return fmt.Errorf("couldn't get SNTP reply: %s", err)
124+
}
125+
126+
ch <- c.stratum.mustNewConstMetric(float64(resp.Stratum))
127+
ch <- c.leap.mustNewConstMetric(float64(resp.Leap))
128+
ch <- c.rtt.mustNewConstMetric(resp.RTT.Seconds())
129+
ch <- c.offset.mustNewConstMetric(resp.ClockOffset.Seconds())
130+
if resp.ReferenceTime.Unix() > 0 {
131+
// Go Zero is 0001-01-01 00:00:00 UTC
132+
// NTP Zero is 1900-01-01 00:00:00 UTC
133+
// UNIX Zero is 1970-01-01 00:00:00 UTC
134+
// so let's keep ALL ancient `reftime` values as zero
135+
ch <- c.reftime.mustNewConstMetric(float64(resp.ReferenceTime.UnixNano()) / 1e9)
136+
} else {
137+
ch <- c.reftime.mustNewConstMetric(0)
138+
}
139+
ch <- c.rootDelay.mustNewConstMetric(resp.RootDelay.Seconds())
140+
ch <- c.rootDispersion.mustNewConstMetric(resp.RootDispersion.Seconds())
141+
142+
// Here is SNTP packet sanity check that is exposed to move burden of
143+
// configuration from node_exporter user to the developer.
144+
145+
maxerr := *ntpOffsetTolerance
146+
if resp.Leap == ntp.LeapAddSecond || resp.Leap == ntp.LeapDelSecond {
147+
// state of leapMidnight is cached as leap flag is dropped right after midnight
148+
leapMidnight = resp.Time.Truncate(hour24).Add(hour24)
149+
}
150+
if leapMidnight.Add(-hour24).Before(resp.Time) && resp.Time.Before(leapMidnight.Add(hour24)) {
151+
// tolerate leap smearing
152+
maxerr += time.Second
153+
}
154+
155+
if resp.Validate() && resp.RootDistance <= *ntpMaxDistance && resp.CausalityViolation <= maxerr {
156+
ch <- c.sanity.mustNewConstMetric(1)
157+
} else {
158+
ch <- c.sanity.mustNewConstMetric(0)
69159
}
70-
driftSeconds := resp.ClockOffset.Seconds()
71-
log.Debugf("Set ntp_drift_seconds: %f", driftSeconds)
72-
ch <- c.drift.mustNewConstMetric(driftSeconds)
73160

74-
stratum := float64(resp.Stratum)
75-
log.Debugf("Set ntp_stratum: %f", stratum)
76-
ch <- c.stratum.mustNewConstMetric(stratum)
77161
return nil
78162
}

0 commit comments

Comments
 (0)