parser.py

#!/usr/bin/python

from __future__ import unicode_literals

import math
import re

from ..metrics_core import Metric, METRIC_LABEL_NAME_RE
from ..samples import Exemplar, Sample, Timestamp
from ..utils import floatToGoString

try:
    import StringIO
except ImportError:
    # Python 3
    import io as StringIO


def text_string_to_metric_families(text):
    """Parse Openmetrics text format from a unicode string.

    See text_fd_to_metric_families.
    """
    for metric_family in text_fd_to_metric_families(StringIO.StringIO(text)):
        yield metric_family
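
# Illustrative usage (not part of the module): parsing a small exposition
# end-to-end. The metric name and value below are made up for the example;
# note that OpenMetrics input must end with a '# EOF' line.
#
#     text = ('# TYPE example counter\n'
#             '# HELP example An example counter.\n'
#             'example_total 1.0\n'
#             '# EOF\n')
#     for family in text_string_to_metric_families(text):
#         for sample in family.samples:
#             print(sample.name, sample.labels, sample.value)
#     # -> example_total {} 1.0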


_CANONICAL_NUMBERS = set([i / 1000.0 for i in range(10000)] + [10.0**i for i in range(-10, 11)] + [float("inf")])


def _isUncanonicalNumber(s):
    f = float(s)
    if f not in _CANONICAL_NUMBERS:
        return False  # Only the canonical numbers are required to be canonical.
    return s != floatToGoString(f)


ESCAPE_SEQUENCES = {
    '\\\\': '\\',
    '\\n': '\n',
    '\\"': '"',
}


def _replace_escape_sequence(match):
    return ESCAPE_SEQUENCES[match.group(0)]


ESCAPING_RE = re.compile(r'\\[\\n"]')


def _replace_escaping(s):
    return ESCAPING_RE.sub(_replace_escape_sequence, s)


def _unescape_help(text):
    result = []
    slash = False

    for char in text:
        if slash:
            if char == '\\':
                result.append('\\')
            elif char == '"':
                result.append('"')
            elif char == 'n':
                result.append('\n')
            else:
                result.append('\\' + char)
            slash = False
        else:
            if char == '\\':
                slash = True
            else:
                result.append(char)

    if slash:
        result.append('\\')

    return ''.join(result)
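
# Example (illustrative): HELP text uses the '\\', '\n' and '\"' escape
# sequences, which this helper expands; unknown escapes are kept verbatim.
#
#     _unescape_help(r'first line\nsecond \\ line')
#     # -> 'first line\nsecond \ line' (with a real newline)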


def _parse_value(value):
    value = ''.join(value)
    if value != value.strip() or '_' in value:
        raise ValueError("Invalid value: {0!r}".format(value))
    try:
        return int(value)
    except ValueError:
        return float(value)


def _parse_timestamp(timestamp):
    timestamp = ''.join(timestamp)
    if not timestamp:
        return None
    if timestamp != timestamp.strip() or '_' in timestamp:
        raise ValueError("Invalid timestamp: {0!r}".format(timestamp))
    try:
        # Simple int.
        return Timestamp(int(timestamp), 0)
    except ValueError:
        try:
            # aaaa.bbbb. Nanosecond resolution supported.
            parts = timestamp.split('.', 1)
            return Timestamp(int(parts[0]), int(parts[1][:9].ljust(9, "0")))
        except ValueError:
            # Float.
            ts = float(timestamp)
            if math.isnan(ts) or math.isinf(ts):
                raise ValueError("Invalid timestamp: {0!r}".format(timestamp))
            return ts
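
# Example (illustrative): timestamps keep nanosecond precision when given as
# 'seconds.fraction', and an empty string means no timestamp was provided.
#
#     _parse_timestamp('123')        # -> Timestamp(123, 0)
#     _parse_timestamp('123.456')    # -> Timestamp(123, 456000000)
#     _parse_timestamp('')           # -> None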


def _is_character_escaped(s, charpos):
    num_bslashes = 0
    while (charpos > num_bslashes and
           s[charpos - 1 - num_bslashes] == '\\'):
        num_bslashes += 1
    return num_bslashes % 2 == 1
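
# Example (illustrative): a character is escaped when it is preceded by an odd
# number of backslashes.
#
#     _is_character_escaped('a\\"', 2)   # quote preceded by one backslash -> True
#     _is_character_escaped('ab"', 2)    # no preceding backslash -> False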


def _parse_labels_with_state_machine(text):
    # The { has already been parsed.
    state = 'startoflabelname'
    labelname = []
    labelvalue = []
    labels = {}
    labels_len = 0

    for char in text:
        if state == 'startoflabelname':
            if char == '}':
                state = 'endoflabels'
            else:
                state = 'labelname'
                labelname.append(char)
        elif state == 'labelname':
            if char == '=':
                state = 'labelvaluequote'
            else:
                labelname.append(char)
        elif state == 'labelvaluequote':
            if char == '"':
                state = 'labelvalue'
            else:
                raise ValueError("Invalid line: " + text)
        elif state == 'labelvalue':
            if char == '\\':
                state = 'labelvalueslash'
            elif char == '"':
                ln = ''.join(labelname)
                if not METRIC_LABEL_NAME_RE.match(ln):
                    raise ValueError("Invalid line, bad label name: " + text)
                if ln in labels:
                    raise ValueError("Invalid line, duplicate label name: " + text)
                labels[ln] = ''.join(labelvalue)
                labelname = []
                labelvalue = []
                state = 'endoflabelvalue'
            else:
                labelvalue.append(char)
        elif state == 'endoflabelvalue':
            if char == ',':
                state = 'labelname'
            elif char == '}':
                state = 'endoflabels'
            else:
                raise ValueError("Invalid line: " + text)
        elif state == 'labelvalueslash':
            state = 'labelvalue'
            if char == '\\':
                labelvalue.append('\\')
            elif char == 'n':
                labelvalue.append('\n')
            elif char == '"':
                labelvalue.append('"')
            else:
                labelvalue.append('\\' + char)
        elif state == 'endoflabels':
            if char == ' ':
                break
            else:
                raise ValueError("Invalid line: " + text)
        labels_len += 1
    return labels, labels_len
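
# Example (illustrative): given the text that follows an opening '{', the state
# machine returns the parsed labels and the number of characters it consumed,
# up to and including the closing '}'.
#
#     _parse_labels_with_state_machine('le="1.0"} 3 # {trace_id="abc"} 0.5')
#     # -> ({'le': '1.0'}, 9)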


def _parse_labels(text):
    labels = {}

    # Raise error if we don't have valid labels
    if text and "=" not in text:
        raise ValueError

    # Copy original labels
    sub_labels = text
    try:
        # Process one label at a time
        while sub_labels:
            # The label name is before the equal
            value_start = sub_labels.index("=")
            label_name = sub_labels[:value_start]
            sub_labels = sub_labels[value_start + 1:]

            # Check for missing quotes
            if not sub_labels or sub_labels[0] != '"':
                raise ValueError

            # The first quote is guaranteed to be after the equal
            value_substr = sub_labels[1:]

            # Check for extra commas
            if not label_name or label_name[0] == ',':
                raise ValueError
            if not value_substr or value_substr[-1] == ',':
                raise ValueError

            # Find the last unescaped quote
            i = 0
            while i < len(value_substr):
                i = value_substr.index('"', i)
                if not _is_character_escaped(value_substr[:i], i):
                    break
                i += 1

            # The label value is between the first and last quote
            quote_end = i + 1
            label_value = sub_labels[1:quote_end]

            # Replace escaping if needed
            if "\\" in label_value:
                label_value = _replace_escaping(label_value)

            if not METRIC_LABEL_NAME_RE.match(label_name):
                raise ValueError("invalid line, bad label name: " + text)
            if label_name in labels:
                raise ValueError("invalid line, duplicate label name: " + text)
            labels[label_name] = label_value

            # Remove the processed label from the sub-slice for next iteration
            sub_labels = sub_labels[quote_end + 1:]
            if sub_labels.startswith(","):
                next_comma = 1
            else:
                next_comma = 0
            sub_labels = sub_labels[next_comma:]

            # Check for missing commas
            if sub_labels and next_comma == 0:
                raise ValueError

        return labels

    except ValueError:
        raise ValueError("Invalid labels: " + text)
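
# Example (illustrative): the fast path used for lines without exemplars.
#
#     _parse_labels('method="get",code="200"')
#     # -> {'method': 'get', 'code': '200'}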


def _parse_sample(text):
    separator = " # "
    # Detect the labels in the text
    label_start = text.find("{")
    if label_start == -1 or separator in text[:label_start]:
        # We don't have labels, but there could be an exemplar.
        name_end = text.index(" ")
        name = text[:name_end]
        # Parse the remaining text after the name
        remaining_text = text[name_end + 1:]
        value, timestamp, exemplar = _parse_remaining_text(remaining_text)
        return Sample(name, {}, value, timestamp, exemplar)
    # The name is before the labels
    name = text[:label_start]
    if separator not in text:
        # Line doesn't contain an exemplar
        # We can use `rindex` to find `label_end`
        label_end = text.rindex("}")
        label = text[label_start + 1:label_end]
        labels = _parse_labels(label)
    else:
        # Line potentially contains an exemplar
        # Fallback to parsing labels with a state machine
        labels, labels_len = _parse_labels_with_state_machine(text[label_start + 1:])
        label_end = labels_len + len(name)
    # Parsing labels succeeded, continue parsing the remaining text
    remaining_text = text[label_end + 2:]
    value, timestamp, exemplar = _parse_remaining_text(remaining_text)
    return Sample(name, labels, value, timestamp, exemplar)
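
# Example (illustrative): a histogram bucket line carrying an exemplar. The
# metric and label names are made up for the example.
#
#     _parse_sample('foo_bucket{le="1.0"} 3 # {trace_id="abc"} 0.5')
#     # -> Sample('foo_bucket', {'le': '1.0'}, 3, None,
#     #           Exemplar({'trace_id': 'abc'}, 0.5, None))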


def _parse_remaining_text(text):
    split_text = text.split(" ", 1)
    val = _parse_value(split_text[0])
    if len(split_text) == 1:
        # We don't have timestamp or exemplar
        return val, None, None

    timestamp = []
    exemplar_value = []
    exemplar_timestamp = []
    exemplar_labels = None

    state = 'timestamp'
    text = split_text[1]

    it = iter(text)
    for char in it:
        if state == 'timestamp':
            if char == '#' and not timestamp:
                state = 'exemplarspace'
            elif char == ' ':
                state = 'exemplarhash'
            else:
                timestamp.append(char)
        elif state == 'exemplarhash':
            if char == '#':
                state = 'exemplarspace'
            else:
                raise ValueError("Invalid line: " + text)
        elif state == 'exemplarspace':
            if char == ' ':
                state = 'exemplarstartoflabels'
            else:
                raise ValueError("Invalid line: " + text)
        elif state == 'exemplarstartoflabels':
            if char == '{':
                label_start, label_end = text.index("{"), text.rindex("}")
                exemplar_labels = _parse_labels(text[label_start + 1:label_end])
                state = 'exemplarparsedlabels'
            else:
                raise ValueError("Invalid line: " + text)
        elif state == 'exemplarparsedlabels':
            if char == '}':
                state = 'exemplarvaluespace'
        elif state == 'exemplarvaluespace':
            if char == ' ':
                state = 'exemplarvalue'
            else:
                raise ValueError("Invalid line: " + text)
        elif state == 'exemplarvalue':
            if char == ' ' and not exemplar_value:
                raise ValueError("Invalid line: " + text)
            elif char == ' ':
                state = 'exemplartimestamp'
            else:
                exemplar_value.append(char)
        elif state == 'exemplartimestamp':
            exemplar_timestamp.append(char)

    # Trailing space after value.
    if state == 'timestamp' and not timestamp:
        raise ValueError("Invalid line: " + text)

    # Trailing space after value.
    if state == 'exemplartimestamp' and not exemplar_timestamp:
        raise ValueError("Invalid line: " + text)

    # Incomplete exemplar.
    if state in ['exemplarhash', 'exemplarspace', 'exemplarstartoflabels', 'exemplarparsedlabels']:
        raise ValueError("Invalid line: " + text)

    ts = _parse_timestamp(timestamp)
    exemplar = None
    if exemplar_labels is not None:
        exemplar_length = sum([len(k) + len(v) for k, v in exemplar_labels.items()])
        if exemplar_length > 64:
            raise ValueError("Exemplar labels are too long: " + text)
        exemplar = Exemplar(
            exemplar_labels,
            _parse_value(exemplar_value),
            _parse_timestamp(exemplar_timestamp),
        )

    return val, ts, exemplar


def _group_for_sample(sample, name, typ):
    if typ == 'info':
        # We can't distinguish between groups for info metrics.
        return {}
    if typ == 'summary' and sample.name == name:
        d = sample.labels.copy()
        del d['quantile']
        return d
    if typ == 'stateset':
        d = sample.labels.copy()
        del d[name]
        return d
    if typ in ['histogram', 'gaugehistogram'] and sample.name == name + '_bucket':
        d = sample.labels.copy()
        del d['le']
        return d
    return sample.labels
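
# Example (illustrative): histogram buckets are grouped by their labels minus
# 'le', so all buckets of one series fall into the same group.
#
#     _group_for_sample(Sample('req_bucket', {'le': '1.0', 'path': '/'}, 3),
#                       'req', 'histogram')
#     # -> {'path': '/'}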


def _check_histogram(samples, name):
    group = None
    timestamp = None

    def do_checks():
        if bucket != float('+Inf'):
            raise ValueError("+Inf bucket missing: " + name)
        if count is not None and value != count:
            raise ValueError("Count does not match +Inf value: " + name)
        if has_negative_buckets and has_sum:
            raise ValueError("Cannot have _sum with negative buckets: " + name)
        if not has_negative_buckets and has_negative_gsum:
            raise ValueError("Cannot have negative _gsum with non-negative buckets: " + name)

    for s in samples:
        suffix = s.name[len(name):]
        g = _group_for_sample(s, name, 'histogram')
        if g != group or s.timestamp != timestamp:
            if group is not None:
                do_checks()
            count = None
            bucket = None
            has_negative_buckets = False
            has_sum = False
            has_negative_gsum = False
            value = 0
        group = g
        timestamp = s.timestamp

        if suffix == '_bucket':
            b = float(s.labels['le'])
            if b < 0:
                has_negative_buckets = True
            if bucket is not None and b <= bucket:
                raise ValueError("Buckets out of order: " + name)
            if s.value < value:
                raise ValueError("Bucket values out of order: " + name)
            bucket = b
            value = s.value
        elif suffix in ['_count', '_gcount']:
            count = s.value
        elif suffix in ['_sum']:
            has_sum = True
        elif suffix in ['_gsum'] and s.value < 0:
            has_negative_gsum = True

    if group is not None:
        do_checks()


def text_fd_to_metric_families(fd):
    """Parse Prometheus text format from a file descriptor.

    This is a laxer parser than the main Go parser,
    so successful parsing does not imply that the parsed
    text meets the specification.

    Yields Metric's.
    """
    name = None
    allowed_names = []
    eof = False

    seen_metrics = set()

    def build_metric(name, documentation, typ, unit, samples):
        if name in seen_metrics:
            raise ValueError("Duplicate metric: " + name)
        seen_metrics.add(name)
        if typ is None:
            typ = 'unknown'
        if documentation is None:
            documentation = ''
        if unit is None:
            unit = ''
        if unit and not name.endswith("_" + unit):
            raise ValueError("Unit does not match metric name: " + name)
        if unit and typ in ['info', 'stateset']:
            raise ValueError("Units not allowed for this metric type: " + name)
        if typ in ['histogram', 'gaugehistogram']:
            _check_histogram(samples, name)
        metric = Metric(name, documentation, typ, unit)
        # TODO: check labelvalues are valid utf8
        metric.samples = samples
        return metric

    for line in fd:
        if line[-1] == '\n':
            line = line[:-1]

        if eof:
            raise ValueError("Received line after # EOF: " + line)

        if line == '# EOF':
            eof = True
        elif line.startswith('#'):
            parts = line.split(' ', 3)
            if len(parts) < 4:
                raise ValueError("Invalid line: " + line)
            if parts[2] == name and samples:
                raise ValueError("Received metadata after samples: " + line)
            if parts[2] != name:
                if name is not None:
                    yield build_metric(name, documentation, typ, unit, samples)
                # New metric
                name = parts[2]
                unit = None
                typ = None
                documentation = None
                group = None
                seen_groups = set()
                group_timestamp = None
                group_timestamp_samples = set()
                samples = []
                allowed_names = [parts[2]]

            if parts[1] == 'HELP':
                if documentation is not None:
                    raise ValueError("More than one HELP for metric: " + line)
                if len(parts) == 4:
                    documentation = _unescape_help(parts[3])
                elif len(parts) == 3:
                    raise ValueError("Invalid line: " + line)
            elif parts[1] == 'TYPE':
                if typ is not None:
                    raise ValueError("More than one TYPE for metric: " + line)
                typ = parts[3]
                if typ == 'untyped':
                    raise ValueError("Invalid TYPE for metric: " + line)
                allowed_names = {
                    'counter': ['_total', '_created'],
                    'summary': ['_count', '_sum', '', '_created'],
                    'histogram': ['_count', '_sum', '_bucket', '_created'],
                    'gaugehistogram': ['_gcount', '_gsum', '_bucket'],
                    'info': ['_info'],
                }.get(typ, [''])
                allowed_names = [name + n for n in allowed_names]
            elif parts[1] == 'UNIT':
                if unit is not None:
                    raise ValueError("More than one UNIT for metric: " + line)
                unit = parts[3]
            else:
                raise ValueError("Invalid line: " + line)
        else:
            sample = _parse_sample(line)
            if sample.name not in allowed_names:
                if name is not None:
                    yield build_metric(name, documentation, typ, unit, samples)
                # Start an unknown metric.
                name = sample.name
                documentation = None
                unit = None
                typ = 'unknown'
                samples = []
                group = None
                group_timestamp = None
                group_timestamp_samples = set()
                seen_groups = set()
                allowed_names = [sample.name]

            if typ == 'stateset' and name not in sample.labels:
                raise ValueError("Stateset missing label: " + line)
            if (typ in ['histogram', 'gaugehistogram'] and name + '_bucket' == sample.name
                    and (sample.labels.get('le', "NaN") == "NaN"
                         or _isUncanonicalNumber(sample.labels['le']))):
                raise ValueError("Invalid le label: " + line)
            if (typ == 'summary' and name == sample.name
                    and (not (0 <= float(sample.labels.get('quantile', -1)) <= 1)
                         or _isUncanonicalNumber(sample.labels['quantile']))):
                raise ValueError("Invalid quantile label: " + line)

            g = tuple(sorted(_group_for_sample(sample, name, typ).items()))
            if group is not None and g != group and g in seen_groups:
                raise ValueError("Invalid metric grouping: " + line)
            if group is not None and g == group:
                if (sample.timestamp is None) != (group_timestamp is None):
                    raise ValueError("Mix of timestamp presence within a group: " + line)
                if group_timestamp is not None and group_timestamp > sample.timestamp and typ != 'info':
                    raise ValueError("Timestamps went backwards within a group: " + line)
            else:
                group_timestamp_samples = set()

            series_id = (sample.name, tuple(sorted(sample.labels.items())))
            if sample.timestamp != group_timestamp or series_id not in group_timestamp_samples:
                # Not a duplicate due to timestamp truncation.
                samples.append(sample)
            group_timestamp_samples.add(series_id)

            group = g
            group_timestamp = sample.timestamp
            seen_groups.add(g)

            if typ == 'stateset' and sample.value not in [0, 1]:
                raise ValueError("Stateset samples can only have values zero and one: " + line)
            if typ == 'info' and sample.value != 1:
                raise ValueError("Info samples can only have value one: " + line)
            if typ == 'summary' and name == sample.name and sample.value < 0:
                raise ValueError("Quantile values cannot be negative: " + line)
            if sample.name[len(name):] in ['_total', '_sum', '_count', '_bucket', '_gcount', '_gsum'] and math.isnan(
                    sample.value):
                raise ValueError("Counter-like samples cannot be NaN: " + line)
            if sample.name[len(name):] in ['_total', '_sum', '_count', '_bucket', '_gcount'] and sample.value < 0:
                raise ValueError("Counter-like samples cannot be negative: " + line)
            if sample.exemplar and not (
                    (typ in ['histogram', 'gaugehistogram'] and sample.name.endswith('_bucket'))
                    or (typ in ['counter'] and sample.name.endswith('_total'))):
                raise ValueError("Invalid line only histogram/gaugehistogram buckets and counters can have exemplars: " + line)

    if name is not None:
        yield build_metric(name, documentation, typ, unit, samples)

    if not eof:
        raise ValueError("Missing # EOF at end")