Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1"""A `Nagios plugin <https://nagios-plugins.org/doc/guidelines.html>`_ 

2for monitoring GWCelery. 

3""" 

4from enum import IntEnum 

5from sys import exit 

6from traceback import format_exc, format_exception 

7 

8from celery.bin.base import Command 

9import kombu.exceptions 

10 

11# Make sure that all tasks are registered 

12from .. import tasks # noqa: F401 

13 

14 

class NagiosPluginStatus(IntEnum):
    """Exit statuses that a Nagios plugin can report.

    Members are integers, so a member can be used directly as a process
    exit code, and ``member.name`` gives the matching status label.
    """

    OK = 0        # the check passed
    WARNING = 1   # defined for completeness of the status set
    CRITICAL = 2  # a check failed
    UNKNOWN = 3   # the check itself could not be completed

22 

23 

class NagiosCriticalError(Exception):
    """Signal a failed check that must be reported to Nagios as `CRITICAL`.

    The single positional argument is the short, human-readable summary of
    the failure.
    """

26 

27 

def get_active_queues(inspector):
    """Return the set of queue names reported by ``inspector``.

    ``inspector.active_queues()`` is expected to map each worker to a list
    of queue descriptions, each carrying a ``'name'`` key. A falsy reply
    (``None`` or an empty mapping) yields an empty set.
    """
    replies = inspector.active_queues()
    if not replies:
        return set()

    names = set()
    for worker_queues in replies.values():
        for description in worker_queues:
            names.add(description['name'])
    return names

32 

33 

def get_active_lvalert_nodes(inspector):
    """Return the set of lvalert nodes reported in the workers' stats.

    Each worker's stats entry may carry an ``'lvalert-nodes'`` iterable;
    workers without that key contribute nothing.
    """
    # Guard against a falsy reply from ``stats()`` (e.g. ``None`` when no
    # worker answers), exactly as get_active_queues() does for
    # ``active_queues()``. Without it the lookup fails with AttributeError
    # instead of simply reporting that no nodes are active.
    stats = inspector.stats() or {}
    return {node
            for stat in stats.values()
            for node in stat.get('lvalert-nodes', ())}

37 

38 

def get_expected_queues(app):
    """Return the set of queue names that the registered tasks require.

    The result always contains ``'celery'``, which is used for every task
    that does not explicitly specify a queue.
    """
    # Start from the default queue, then add each explicitly named one.
    queues = {'celery'}
    for task in app.tasks.values():
        name = getattr(task, 'queue', None)
        if name is not None:
            queues.add(name)
    return queues

47 

48 

def get_expected_lvalert_nodes(app):
    """Return the ``'lvalert_nodes'`` entry of the application's config."""
    configuration = app.conf
    return configuration['lvalert_nodes']

51 

52 

def get_active_voevent_peers(inspector):
    """Return the VOEvent peers reported in the workers' stats.

    Returns a 2-tuple ``(broker_peers, receiver_peers)`` of sets, gathered
    across all workers from the ``'voevent-broker-peers'`` and
    ``'voevent-receiver-peers'`` stats entries respectively. Workers that
    lack a key contribute nothing to the corresponding set.
    """
    # Guard against a falsy reply from ``stats()`` (e.g. ``None`` when no
    # worker answers), as get_active_queues() does for ``active_queues()``,
    # so that the result is two empty sets rather than an AttributeError.
    stats = inspector.stats() or {}
    broker_peers, receiver_peers = (
        {peer for stat in stats.values() for peer in stat.get(key, ())}
        for key in ('voevent-broker-peers', 'voevent-receiver-peers'))
    return broker_peers, receiver_peers

59 

60 

def check_status(app):
    """Check that the application is running as configured.

    The checks run in this order, stopping at the first failure:

    1. A connection to the broker can be established.
    2. Every expected queue is active on some worker.
    3. The set of subscribed lvalert nodes matches the configured set
       exactly (none missing, none extra).
    4. If ``voevent_broadcaster_whitelist`` is configured, the VOEvent
       broker has at least one peer.
    5. If ``voevent_receiver_address`` is configured, the VOEvent receiver
       has at least one peer.

    Parameters
    ----------
    app
        The Celery application to inspect.

    Raises
    ------
    NagiosCriticalError
        If any check fails. Its single argument is a short summary, and its
        ``__cause__`` carries the detail (the broker error, or an
        ``AssertionError`` describing what is missing or extra).
    """
    # Probe the broker with a dedicated connection; the ``with`` block
    # releases it once the probe has finished, whether or not it succeeded.
    with app.connection() as connection:
        try:
            connection.ensure_connection(max_retries=1)
        except kombu.exceptions.OperationalError as e:
            raise NagiosCriticalError('No connection to broker') from e

    inspector = app.control.inspect()

    # Names are sorted before joining so that the reported detail is
    # deterministic from one run to the next.
    active = get_active_queues(inspector)
    expected = get_expected_queues(app)
    missing = expected - active
    if missing:
        raise NagiosCriticalError('Not all expected queues are active') from \
            AssertionError('Missing queues: ' + ', '.join(sorted(missing)))

    active = get_active_lvalert_nodes(inspector)
    expected = get_expected_lvalert_nodes(app)
    missing = expected - active
    extra = active - expected
    if missing:
        raise NagiosCriticalError('Not all lvalert nodes are subscribed') \
            from AssertionError('Missing nodes: ' + ', '.join(sorted(missing)))
    if extra:
        raise NagiosCriticalError('Too many lvalert nodes are subscribed') \
            from AssertionError('Extra nodes: ' + ', '.join(sorted(extra)))

    # The VOEvent checks only apply when the corresponding setting is truthy.
    broker_peers, receiver_peers = get_active_voevent_peers(inspector)
    if app.conf['voevent_broadcaster_whitelist'] and not broker_peers:
        raise NagiosCriticalError(
            'The VOEvent broker has no active connections') \
            from AssertionError('voevent_broadcaster_whitelist: {}'.format(
                app.conf['voevent_broadcaster_whitelist']))
    if app.conf['voevent_receiver_address'] and not receiver_peers:
        raise NagiosCriticalError(
            'The VOEvent receiver has no active connections') \
            from AssertionError('voevent_receiver_address: {}'.format(
                app.conf['voevent_receiver_address']))

99 

100 

class NagiosCommand(Command):
    # No docstring is written here: ``__doc__`` is assigned from the module
    # docstring immediately after the class body.

    def run(self, **kwargs):
        """Run the status checks, print the result, and exit.

        Prints one line of the form ``<STATUS NAME>: <summary>``, followed
        by a traceback-style detail block when one is available, then exits
        the process with the status as the exit code:

        * ``OK`` if ``check_status`` returns normally;
        * ``CRITICAL`` if it raises ``NagiosCriticalError``;
        * ``UNKNOWN`` if anything else is raised.

        ``kwargs`` are accepted but not used.
        """
        try:
            check_status(self.app)
        except NagiosCriticalError as e:
            status = NagiosPluginStatus.CRITICAL
            # ``str(e)`` equals the summary for the usual single-argument
            # case and, unlike tuple unpacking, cannot fail for other arities.
            output = str(e)
            # Report the chained cause as the detail. An error raised without
            # ``from`` has no cause, so report the summary line alone rather
            # than failing inside this handler.
            cause = e.__cause__
            if cause is None:
                detail = None
            else:
                detail = ''.join(
                    format_exception(type(cause), cause, cause.__traceback__))
        except:  # noqa: E722
            # Deliberately bare: any other failure at all is reported to
            # Nagios as UNKNOWN, with the full traceback as the detail.
            status = NagiosPluginStatus.UNKNOWN
            output = 'Unexpected error'
            detail = format_exc()
        else:
            status = NagiosPluginStatus.OK
            output = 'Running normally'
            detail = None
        print('{}: {}'.format(status.name, output))
        if detail:
            print(detail)
        exit(status)


# Give the command class the same docstring as this module.
NagiosCommand.__doc__ = __doc__