remove_old_data_by_dt.py 3.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130
# -*- coding: utf-8 -*-
#!/usr/bin/env python
"""
Delete data that has already been migrated to the history database.

Records are selected by their dateTimeAdded field.
"""
  6. import calendar
  7. import datetime
  8. import getopt
  9. import os
  10. import sys
  11. try:
  12. options, args = getopt.getopt(sys.argv[1:], 'l:e:y:m:o:c:',
  13. ['log=', 'env=', 'year=', 'month=', 'model=', 'check='])
  14. except getopt.GetoptError as e:
  15. print(str(e))
  16. sys.exit()
  17. log_file = None
  18. platform_env = 'testing'
  19. year = None
  20. month = None
  21. model_name = None
  22. check = True
  23. for name, value in options:
  24. if name in ('-l', '--log'):
  25. log_file = value
  26. if name in ('-e', '--env'):
  27. platform_env = value
  28. if name in ('-y', '--year'):
  29. year = int(value)
  30. if name in ('-m', '--month'):
  31. month = int(value)
  32. if name in ('-o', '--model'):
  33. model_name = value
  34. if name in ('-c', '--check'):
  35. check = True if value in ['y', 'Y'] else False
  36. print 'check is {}'.format(check)
  37. os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'configs.{env}'.format(env = platform_env))
  38. PROJECT_ROOT = os.path.join(os.path.abspath(os.path.split(os.path.realpath(__file__))[0] + "/.."), '..')
  39. sys.path.insert(0, PROJECT_ROOT)
  40. from script.base import init_env, setup_logger, get_logger
  41. init_env(interactive = False)
  42. if log_file:
  43. logger = setup_logger(filename = log_file, namespace = __name__)
  44. else:
  45. logger = get_logger(__name__)
  46. from apps.web.models import ArchivedModelProxyConfig
  47. from apilib.utils_mongo import BulkHandlerEx
  48. item = ArchivedModelProxyConfig.objects(model = model_name).first() # type: ArchivedModelProxyConfig
  49. if not item:
  50. item = ArchivedModelProxyConfig.objects(model = 'default').first() # type: ArchivedModelProxyConfig
  51. his_data_line = '-'.join(item.startDay.split('-')[0:2])
  52. print his_data_line
  53. if not year or not month or not model_name:
  54. logger.error('year or month or model is not valid.')
  55. sys.exit(1)
  56. start_day = '%04d-%02d-%02d' % (year, month, 1)
  57. if month < 12:
  58. next_year = year
  59. next_month = month + 1
  60. else:
  61. next_year = (year + 1)
  62. next_month = 1
  63. end_day = '%04d-%02d-%02d' % (next_year, next_month, 1)
  64. print start_day
  65. print end_day
  66. if check and '%04d-%02d' % (year, month) >= his_data_line:
  67. logger.error('{} >= {}'.format('%04d-%02d' % (year, month), his_data_line))
  68. sys.exit(1)
  69. _, lastDay = calendar.monthrange(year, month)
  70. start_day = datetime.datetime.strptime('%04d-%02d-%02d' % (year, month, 1), "%Y-%m-%d")
  71. end_day = datetime.datetime.strptime('%04d-%02d-%02d' % (year, month, lastDay), "%Y-%m-%d") + datetime.timedelta(
  72. days = 1)
  73. print start_day
  74. print end_day
  75. delete_id_list = []
  76. from script.db import MODEL_MAP
  77. model_class = MODEL_MAP.get(model_name)
  78. items = model_class.get_collection().find({
  79. 'dateTimeAdded': {'$gte': start_day, '$lt': end_day}
  80. }, {'_id': 1}).batch_size(50000).limit(1000000)
  81. for item in items:
  82. # print item['_id']
  83. delete_id_list.append(item['_id'])
  84. logger.debug('try to delete {} items.'.format(len(delete_id_list)))
  85. bulker = BulkHandlerEx(model_class.get_collection()) # type: BulkHandlerEx
  86. for _id in delete_id_list:
  87. bulker.delete(query_dict = {'_id': _id})
  88. if len(bulker.requests) >= 2000:
  89. bulker.execute()
  90. bulker = BulkHandlerEx(model_class.get_collection()) # type: BulkHandlerEx
  91. if len(bulker.requests) > 0:
  92. bulker.execute()
  93. bulker = None