variety.js 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424
  1. /* Variety: A MongoDB Schema Analyzer
  2. This tool helps you get a sense of your application's schema, as well as any
  3. outliers to that schema. Particularly useful when you inherit a codebase with
  4. data dump and want to quickly learn how the data's structured. Also useful for
  5. finding rare keys.
  6. Please see https://github.com/variety/variety for details.
  7. Released by Maypop Inc, © 2012-2018, under the MIT License. */
  8. (function () {
  9. 'use strict'; // wraps everything for which we can use strict mode -JC
  10. var log = function(message) {
  11. if(!__quiet) { // mongo shell param, coming from https://github.com/mongodb/mongo/blob/5fc306543cd3ba2637e5cb0662cc375f36868b28/src/mongo/shell/dbshell.cpp#L624
  12. print(message);
  13. }
  14. };
  15. log('Variety: A MongoDB Schema Analyzer');
  16. log('Version 1.5.1, released 02 October 2017');
  17. var dbs = [];
  18. var emptyDbs = [];
  19. if (typeof slaveOk !== 'undefined') {
  20. if (slaveOk === true) {
  21. db.getMongo().setSlaveOk();
  22. }
  23. }
  24. var knownDatabases = db.adminCommand('listDatabases').databases;
  25. if(typeof knownDatabases !== 'undefined') { // not authorized user receives error response (json) without databases key
  26. knownDatabases.forEach(function(d){
  27. if(db.getSisterDB(d.name).getCollectionNames().length > 0) {
  28. dbs.push(d.name);
  29. }
  30. if(db.getSisterDB(d.name).getCollectionNames().length === 0) {
  31. emptyDbs.push(d.name);
  32. }
  33. });
  34. if (emptyDbs.indexOf(db.getName()) !== -1) {
  35. throw 'The database specified ('+ db +') is empty.\n'+
  36. 'Possible database options are: ' + dbs.join(', ') + '.';
  37. }
  38. if (dbs.indexOf(db.getName()) === -1) {
  39. throw 'The database specified ('+ db +') does not exist.\n'+
  40. 'Possible database options are: ' + dbs.join(', ') + '.';
  41. }
  42. }
  43. var collNames = db.getCollectionNames().join(', ');
  44. if (typeof collection === 'undefined') {
  45. throw 'You have to supply a \'collection\' variable, à la --eval \'var collection = "animals"\'.\n'+
  46. 'Possible collection options for database specified: ' + collNames + '.\n'+
  47. 'Please see https://github.com/variety/variety for details.';
  48. }
  49. if (db.getCollection(collection).count() === 0) {
  50. throw 'The collection specified (' + collection + ') in the database specified ('+ db +') does not exist or is empty.\n'+
  51. 'Possible collection options for database specified: ' + collNames + '.';
  52. }
  53. var readConfig = function(configProvider) {
  54. var config = {};
  55. var read = function(name, defaultValue) {
  56. var value = typeof configProvider[name] !== 'undefined' ? configProvider[name] : defaultValue;
  57. config[name] = value;
  58. log('Using '+name+' of ' + tojson(value));
  59. };
  60. read('collection', null);
  61. read('query', {});
  62. read('limit', db.getCollection(config.collection).find(config.query).count());
  63. read('maxDepth', 99);
  64. read('sort', {_id: -1});
  65. read('outputFormat', 'ascii');
  66. read('persistResults', false);
  67. read('resultsDatabase', 'varietyResults');
  68. read('resultsCollection', collection + 'Keys');
  69. read('resultsUser', null);
  70. read('resultsPass', null);
  71. read('logKeysContinuously', false);
  72. read('excludeSubkeys', []);
  73. read('arrayEscape', 'XX');
  74. read('lastValue', false);
  75. //Translate excludeSubkeys to set like object... using an object for compatibility...
  76. config.excludeSubkeys = config.excludeSubkeys.reduce(function (result, item) { result[item+'.'] = true; return result; }, {});
  77. return config;
  78. };
  79. var config = readConfig(this);
  80. var PluginsClass = function(context) {
  81. var parsePath = function(val) { return val.slice(-3) !== '.js' ? val + '.js' : val;};
  82. var parseConfig = function(val) {
  83. var config = {};
  84. val.split('&').reduce(function(acc, val) {
  85. var parts = val.split('=');
  86. acc[parts[0]] = parts[1];
  87. return acc;
  88. }, config);
  89. return config;
  90. };
  91. if(typeof context.plugins !== 'undefined') {
  92. this.plugins = context.plugins.split(',')
  93. .map(function(path){return path.trim();})
  94. .map(function(definition){
  95. var path = parsePath(definition.split('|')[0]);
  96. var config = parseConfig(definition.split('|')[1] || '');
  97. context.module = context.module || {};
  98. load(path);
  99. var plugin = context.module.exports;
  100. plugin.path = path;
  101. if(typeof plugin.init === 'function') {
  102. plugin.init(config);
  103. }
  104. return plugin;
  105. }, this);
  106. } else {
  107. this.plugins = [];
  108. }
  109. this.execute = function(methodName) {
  110. var args = Array.prototype.slice.call(arguments, 1);
  111. var applicablePlugins = this.plugins.filter(function(plugin){return typeof plugin[methodName] === 'function';});
  112. return applicablePlugins.map(function(plugin) {
  113. return plugin[methodName].apply(plugin, args);
  114. });
  115. };
  116. log('Using plugins of ' + tojson(this.plugins.map(function(plugin){return plugin.path;})));
  117. };
  118. var $plugins = new PluginsClass(this);
  119. $plugins.execute('onConfig', config);
  120. var varietyTypeOf = function(thing) {
  121. if (!arguments.length) { throw 'varietyTypeOf() requires an argument'; }
  122. if (typeof thing === 'undefined') {
  123. return 'undefined';
  124. } else if (typeof thing !== 'object') {
  125. // the messiness below capitalizes the first letter, so the output matches
  126. // the other return values below. -JC
  127. var typeofThing = typeof thing; // edgecase of JSHint's "singleGroups"
  128. return typeofThing[0].toUpperCase() + typeofThing.slice(1);
  129. } else {
  130. if (thing && thing.constructor === Array) {
  131. return 'Array';
  132. } else if (thing === null) {
  133. return 'null';
  134. } else if (thing instanceof Date) {
  135. return 'Date';
  136. } else if(thing instanceof NumberLong) {
  137. return 'NumberLong';
  138. } else if (thing instanceof ObjectId) {
  139. return 'ObjectId';
  140. } else if (thing instanceof BinData) {
  141. var binDataTypes = {};
  142. binDataTypes[0x00] = 'generic';
  143. binDataTypes[0x01] = 'function';
  144. binDataTypes[0x02] = 'old';
  145. binDataTypes[0x03] = 'UUID';
  146. binDataTypes[0x04] = 'UUID';
  147. binDataTypes[0x05] = 'MD5';
  148. binDataTypes[0x80] = 'user';
  149. return 'BinData-' + binDataTypes[thing.subtype()];
  150. } else {
  151. return 'Object';
  152. }
  153. }
  154. };
  155. //flattens object keys to 1D. i.e. {'key1':1,{'key2':{'key3':2}}} becomes {'key1':1,'key2.key3':2}
  156. //we assume no '.' characters in the keys, which is an OK assumption for MongoDB
  157. var serializeDoc = function(doc, maxDepth, excludeSubkeys) {
  158. var result = {};
  159. //determining if an object is a Hash vs Array vs something else is hard
  160. //returns true, if object in argument may have nested objects and makes sense to analyse its content
  161. function isHash(v) {
  162. var isArray = Array.isArray(v);
  163. var isObject = typeof v === 'object';
  164. var specialObject = v instanceof Date ||
  165. v instanceof ObjectId ||
  166. v instanceof BinData ||
  167. v instanceof NumberLong;
  168. return !specialObject && (isArray || isObject);
  169. }
  170. var arrayRegex = new RegExp('\\.' + config.arrayEscape + '\\d+' + config.arrayEscape + '\\.', 'g');
  171. function serialize(document, parentKey, maxDepth) {
  172. if(Object.prototype.hasOwnProperty.call(excludeSubkeys, parentKey.replace(arrayRegex, '.')))
  173. return;
  174. for(var key in document) {
  175. //skip over inherited properties such as string, length, etch
  176. if(!document.hasOwnProperty(key)) {
  177. continue;
  178. }
  179. var value = document[key];
  180. if(Array.isArray(document))
  181. key = config.arrayEscape + key + config.arrayEscape; //translate unnamed object key from {_parent_name_}.{_index_} to {_parent_name_}.arrayEscape{_index_}arrayEscape.
  182. result[parentKey+key] = value;
  183. //it's an object, recurse...only if we haven't reached max depth
  184. if(isHash(value) && maxDepth > 1) {
  185. serialize(value, parentKey+key+'.', maxDepth-1);
  186. }
  187. }
  188. }
  189. serialize(doc, '', maxDepth);
  190. return result;
  191. };
  192. // convert document to key-value map, where value is always an array with types as plain strings
  193. var analyseDocument = function(document) {
  194. var result = {};
  195. var arrayRegex = new RegExp('\\.' + config.arrayEscape + '\\d+' + config.arrayEscape, 'g');
  196. for (var key in document) {
  197. var value = document[key];
  198. key = key.replace(arrayRegex, '.' + config.arrayEscape);
  199. if(typeof result[key] === 'undefined') {
  200. result[key] = {};
  201. }
  202. var type = varietyTypeOf(value);
  203. result[key][type] = null;
  204. if(config.lastValue){
  205. if (type in {'String': true, 'Boolean': true}) {
  206. result[key][type] = value.toString();
  207. }else if (type in {'Number': true, 'NumberLong': true}) {
  208. result[key][type] = value.valueOf();
  209. }else if(type == 'ObjectId'){
  210. result[key][type] = value.str;
  211. }else if(type == 'Date'){
  212. result[key][type] = new Date(value).getTime();
  213. }else if(type.startsWith('BinData')){
  214. result[key][type] = value.hex();
  215. }
  216. }
  217. }
  218. return result;
  219. };
  220. var mergeDocument = function(docResult, interimResults) {
  221. for (var key in docResult) {
  222. if(key in interimResults) {
  223. var existing = interimResults[key];
  224. for(var type in docResult[key]) {
  225. if (type in existing.types) {
  226. existing.types[type] = existing.types[type] + 1;
  227. } else {
  228. existing.types[type] = 1;
  229. if (config.logKeysContinuously) {
  230. log('Found new key type "' + key + '" type "' + type + '"');
  231. }
  232. }
  233. }
  234. existing.totalOccurrences = existing.totalOccurrences + 1;
  235. } else {
  236. var lastValue = null;
  237. var types = {};
  238. for (var newType in docResult[key]) {
  239. types[newType] = 1;
  240. lastValue = docResult[key][newType];
  241. if (config.logKeysContinuously) {
  242. log('Found new key type "' + key + '" type "' + newType + '"');
  243. }
  244. }
  245. interimResults[key] = {'types': types,'totalOccurrences':1};
  246. if (config.lastValue) {
  247. interimResults[key]['lastValue'] = lastValue ? lastValue : '['+newType+']';
  248. }
  249. }
  250. }
  251. };
  252. var convertResults = function(interimResults, documentsCount) {
  253. var getKeys = function(obj) {
  254. var keys = {};
  255. for(var key in obj) {
  256. keys[key] = obj[key];
  257. }
  258. return keys;
  259. //return keys.sort();
  260. };
  261. var varietyResults = [];
  262. //now convert the interimResults into the proper format
  263. for(var key in interimResults) {
  264. var entry = interimResults[key];
  265. var obj = {
  266. '_id': {'key':key},
  267. 'value': {'types':getKeys(entry.types)},
  268. 'totalOccurrences': entry.totalOccurrences,
  269. 'percentContaining': entry.totalOccurrences * 100 / documentsCount
  270. };
  271. if(config.lastValue){
  272. obj.lastValue = entry.lastValue;
  273. }
  274. varietyResults.push(obj);
  275. }
  276. return varietyResults;
  277. };
  278. // Merge the keys and types of current object into accumulator object
  279. var reduceDocuments = function(accumulator, object) {
  280. var docResult = analyseDocument(serializeDoc(object, config.maxDepth, config.excludeSubkeys));
  281. mergeDocument(docResult, accumulator);
  282. return accumulator;
  283. };
  284. // We throw away keys which end in an array index, since they are not useful
  285. // for our analysis. (We still keep the key of their parent array, though.) -JC
  286. var arrayRegex = new RegExp('\\.' + config.arrayEscape + '$', 'g');
  287. var filter = function(item) {
  288. return !item._id.key.match(arrayRegex);
  289. };
  290. // sort desc by totalOccurrences or by key asc if occurrences equal
  291. var comparator = function(a, b) {
  292. var countsDiff = b.totalOccurrences - a.totalOccurrences;
  293. return countsDiff !== 0 ? countsDiff : a._id.key.localeCompare(b._id.key);
  294. };
  295. // extend standard MongoDB cursor of reduce method - call forEach and combine the results
  296. DBQuery.prototype.reduce = function(callback, initialValue) {
  297. var result = initialValue;
  298. this.forEach(function(obj){
  299. result = callback(result, obj);
  300. });
  301. return result;
  302. };
  303. var cursor = db.getCollection(config.collection).find(config.query).sort(config.sort).limit(config.limit);
  304. var interimResults = cursor.reduce(reduceDocuments, {});
  305. var varietyResults = convertResults(interimResults, cursor.size())
  306. .filter(filter)
  307. .sort(comparator);
  308. if(config.persistResults) {
  309. var resultsDB;
  310. var resultsCollectionName = config.resultsCollection;
  311. if (config.resultsDatabase.indexOf('/') === -1) {
  312. // Local database; don't reconnect
  313. resultsDB = db.getMongo().getDB(config.resultsDatabase);
  314. } else {
  315. // Remote database, establish new connection
  316. resultsDB = connect(config.resultsDatabase);
  317. }
  318. if (config.resultsUser !== null && config.resultsPass !== null) {
  319. resultsDB.auth(config.resultsUser, config.resultsPass);
  320. }
  321. // replace results collection
  322. log('replacing results collection: '+ resultsCollectionName);
  323. resultsDB.getCollection(resultsCollectionName).drop();
  324. resultsDB.getCollection(resultsCollectionName).insert(varietyResults);
  325. }
  326. var createAsciiTable = function(results) {
  327. var headers = ['key', 'types', 'occurrences', 'percents'];
  328. if (config.lastValue) {
  329. headers.push('lastValue');
  330. }
  331. // return the number of decimal places or 1, if the number is int (1.23=>2, 100=>1, 0.1415=>4)
  332. var significantDigits = function(value) {
  333. var res = value.toString().match(/^[0-9]+\.([0-9]+)$/);
  334. return res !== null ? res[1].length : 1;
  335. };
  336. var maxDigits = varietyResults.map(function(value){return significantDigits(value.percentContaining);}).reduce(function(acc,val){return acc>val?acc:val;});
  337. var rows = results.map(function(row) {
  338. var types = [];
  339. var typeKeys = Object.keys(row.value.types);
  340. if (typeKeys.length > 1) {
  341. for (var type in row.value.types) {
  342. var typestring = type + ' (' + row.value.types[type] + ')';
  343. types.push(typestring);
  344. }
  345. } else {
  346. types = typeKeys;
  347. }
  348. var rawArray = [row._id.key, types, row.totalOccurrences, row.percentContaining.toFixed(Math.min(maxDigits, 20))];
  349. if (config.lastValue && row['lastValue']) {
  350. rawArray.push(row['lastValue']);
  351. }
  352. return rawArray;
  353. });
  354. var table = [headers, headers.map(function(){return '';})].concat(rows);
  355. var colMaxWidth = function(arr, index) {return Math.max.apply(null, arr.map(function(row){return row[index] ? row[index].toString().length : 0;}));};
  356. var pad = function(width, string, symbol) { return width <= string.length ? string : pad(width, isNaN(string) ? string + symbol : symbol + string, symbol); };
  357. table = table.map(function(row, ri){
  358. return '| ' + row.map(function(cell, i) {return pad(colMaxWidth(table, i), cell.toString(), ri === 1 ? '-' : ' ');}).join(' | ') + ' |';
  359. });
  360. var border = '+' + pad(table[0].length - 2, '', '-') + '+';
  361. return [border].concat(table).concat(border).join('\n');
  362. };
  363. var pluginsOutput = $plugins.execute('formatResults', varietyResults);
  364. if (pluginsOutput.length > 0) {
  365. pluginsOutput.forEach(function(i){print(i);});
  366. } else if(config.outputFormat === 'json') {
  367. printjson(varietyResults); // valid formatted json output, compressed variant is printjsononeline()
  368. } else {
  369. print(createAsciiTable(varietyResults)); // output nice ascii table with results
  370. }
  371. }.bind(this)()); // end strict mode