memory_control.py

import datetime
import gc
import logging
import os
import re
import subprocess
from collections import OrderedDict
from typing import Union
from time import sleep

import psutil

try:
    from config import USE_MEMORY_CONTROL
except ImportError:
    USE_MEMORY_CONTROL = False
try:
    from config import GPU_MEMORY_USAGE
except ImportError:
    GPU_MEMORY_USAGE: Union[float, str] = 'growth'
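
# Both settings come from an optional local config.py. Illustrative values
# (assumptions, not part of this file): USE_MEMORY_CONTROL toggles the limiter,
# and GPU_MEMORY_USAGE is either the string 'growth' or a numeric per-process
# GPU memory limit passed to TensorFlow's VirtualDeviceConfiguration below.
#     USE_MEMORY_CONTROL = True
#     GPU_MEMORY_USAGE = 'growth'   # or e.g. 3584 for a fixed limit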


class NVLog(dict):
    """Parses `nvidia-smi -q` output into a nested dict keyed by section name."""

    __indent_re__ = re.compile('^ *')
    __version_re__ = re.compile(r'v([0-9.]+)$')

    def __init__(self):
        super().__init__()

        lines = run_cmd(['nvidia-smi', '-q'])
        lines = lines.splitlines()
        while '' in lines:
            lines.remove('')

        # Walk the indented key/value output, keeping a stack of nested containers.
        path = [self]
        self['version'] = self.__version__()
        for line in lines[1:]:
            indent = NVLog.__get_indent__(line)
            line = NVLog.__parse_key_value_pair__(line)
            while indent < len(path) * 4 - 4:
                path.pop()
            cursor = path[-1]
            if len(line) == 1:
                # Section header: open a new nested container.
                if line[0] == 'Processes':
                    cursor[line[0]] = []
                else:
                    cursor[line[0]] = {}
                cursor = cursor[line[0]]
                path.append(cursor)
            elif len(line) == 2:
                if line[0] in ['GPU instance ID', 'Compute instance ID']:
                    continue
                if line[0] == 'Process ID':
                    # Each process entry starts a new dict in the Processes list.
                    cursor.append({})
                    cursor = cursor[-1]
                    path.append(cursor)
                cursor[line[0]] = line[1]

        # Collect the per-GPU sections under a single 'Attached GPUs' mapping.
        self['Attached GPUs'] = OrderedDict()
        keys = list(self.keys())
        for i in keys:
            if i.startswith('GPU '):
                self['Attached GPUs'][i] = self[i]
                del self[i]

    @staticmethod
    def __get_indent__(line):
        return len(NVLog.__indent_re__.match(line).group())

    @staticmethod
    def __parse_key_value_pair__(line):
        result = line.split(' : ')
        result[0] = result[0].strip()
        if len(result) > 1:
            try:
                result[1] = int(result[1])
            except ValueError:
                pass
            if result[1] in ['N/A', 'None']:
                result[1] = None
            if result[1] in ['Disabled', 'No']:
                result[1] = False
        return result

    def __get_processes__(self):
        processes = []
        for i, gpu in enumerate(self['Attached GPUs']):
            gpu = self['Attached GPUs'][gpu]
            if gpu['Processes']:
                for j in gpu['Processes']:
                    processes.append((i, j))
        return processes

    @staticmethod
    def __version__():
        lines = run_cmd(['nvidia-smi', '-h'])
        lines = lines.splitlines()
        result = NVLog.__version_re__.search(lines[0]).group(1)
        return result

    def gpu_table(self):
        # Re-render the parsed data in an nvidia-smi-like summary layout.
        output = []
        output.append(self['Timestamp'])
        output.append('+-----------------------------------------------------------------------------+')
        values = []
        values.append(self['version'])
        values.append(self['Driver Version'])
        if 'CUDA Version' in self:
            values.append(self['CUDA Version'])
        else:
            values.append('N/A')
        output.append('| NVIDIA-SMI %s Driver Version: %s CUDA Version: %-5s |' % tuple(values))
        output.append('|-------------------------------+----------------------+----------------------+')
        output.append('| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |')
        output.append('| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |')
        output.append('|===============================+======================+======================|')
        for i, gpu in enumerate(self['Attached GPUs']):
            gpu = self['Attached GPUs'][gpu]
            values = []
            values.append(i)
            values.append(gpu['Product Name'])
            values.append('On' if gpu['Persistence Mode'] else 'Off')
            values.append(gpu['PCI']['Bus Id'])
            values.append('On' if gpu['Display Active'] else 'Off')
            output.append('| %d %-19s %3s | %s %3s | N/A |' % tuple(values))
            values = []
            values.append(gpu['Fan Speed'].replace(' ', ''))
            values.append(gpu['Temperature']['GPU Current Temp'].replace(' ', ''))
            values.append(gpu['Performance State'])
            values.append(int(float(gpu['Power Readings']['Power Draw'][:-2])))
            values.append(int(float(gpu['Power Readings']['Power Limit'][:-2])))
            values.append(gpu['FB Memory Usage']['Used'].replace(' ', ''))
            values.append(gpu['FB Memory Usage']['Total'].replace(' ', ''))
            values.append(gpu['Utilization']['Gpu'].replace(' ', ''))
            values.append(gpu['Compute Mode'])
            output.append('| %3s %3s %s %3dW / %3dW | %8s / %8s | %4s %8s |' % tuple(values))
        output.append('+-----------------------------------------------------------------------------+')
        return '\n'.join(output)

    def processes_table(self):
        output = []
        output.append('+-----------------------------------------------------------------------------+')
        output.append('| Processes: GPU Memory |')
        output.append('| GPU PID Type Process name Usage |')
        output.append('|=============================================================================|')
        processes = self.__get_processes__()
        if len(processes) == 0:
            output.append('| No running processes found |')
        for i, process in processes:
            values = []
            values.append(i)
            values.append(process['Process ID'])
            values.append(process['Type'])
            if len(process['Name']) > 42:
                values.append(process['Name'][:39] + '...')
            else:
                values.append(process['Name'])
            values.append(process['Used GPU Memory'].replace(' ', ''))
            output.append('| %2d %5d %6s %-42s %8s |' % tuple(values))
        output.append('+-----------------------------------------------------------------------------+')
        return '\n'.join(output)

    def as_table(self):
        output = []
        output.append(self.gpu_table())
        output.append('')
        output.append(self.processes_table())
        return '\n'.join(output)
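
# Illustrative usage, not part of the module's own flow (assumes `nvidia-smi`
# is on PATH, otherwise the constructor raises FileNotFoundError):
#     log = NVLog()
#     print(log.as_table())
#     for name, gpu in log['Attached GPUs'].items():
#         print(name, gpu['FB Memory Usage']['Used'])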


class NVLogPlus(NVLog):
    """Same tables as NVLog, but the process list shows the owning user and
    the full command line (resolved through psutil)."""

    def processes_table(self):
        output = ['+-----------------------------------------------------------------------------+',
                  '| Processes: GPU Memory |',
                  '| GPU PID User Process name Usage |',
                  '|=============================================================================|']
        processes = self.__get_processes__()
        if len(processes) == 0:
            output.append('| No running processes found |')
        for i, process in processes:
            values = []
            values.append(i)
            values.append(process['Process ID'])
            p = psutil.Process(process['Process ID'])
            with p.oneshot():
                values.append(p.username()[:8].center(8))
                command = p.cmdline()
                command[0] = os.path.basename(command[0])
                command = ' '.join(command)
                if len(command) > 42:
                    values.append(command[:39] + '...')
                else:
                    values.append(command)
            values.append(process['Used GPU Memory'].replace(' ', ''))
            output.append('| %2d %5d %8s %-42s %8s |' % tuple(values))
        output.append('+-----------------------------------------------------------------------------+')
        return '\n'.join(output)
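
# Illustrative usage (an assumption, not invoked anywhere in this module);
# psutil must be allowed to inspect the listed PIDs, otherwise it raises
# psutil.AccessDenied or psutil.NoSuchProcess:
#     print(NVLogPlus().processes_table())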


def run_cmd(cmd):
    return subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE).stdout.decode('utf-8')


class MemoryLimiter:
    limited = False

    @classmethod
    def limit_memory_usage(cls, verbose=1):
        if not USE_MEMORY_CONTROL:
            print('memory_control module disabled.')
            return
        if cls.limited:
            if verbose:
                print('Already limited memory usage. Skipping...')
            return
        use_gpu_idx = None
        printed_compute_mode = False
        while True:  # until a GPU is free
            try:
                data = NVLog()
            except FileNotFoundError:
                print('WARNING: nvidia-smi is not available')
                break
            if 'use_specific_gpu' in os.environ:
                print(f'use_specific_gpu = {os.environ["use_specific_gpu"]}')
                use_gpu_idx = int(os.environ['use_specific_gpu'])
            else:
                # Scan GPUs from the highest index down and pick the first suitable one.
                for idx, gpu_data in reversed(list(enumerate(data['Attached GPUs'].values()))):
                    any_processes = (
                        gpu_data['Processes'] is not None
                        or gpu_data['Utilization']['Memory'] != '0 %'
                        or gpu_data['FB Memory Usage']['Used'] != '0 MiB'
                    )
                    compute_mode = gpu_data['Compute Mode']
                    if not printed_compute_mode:
                        print('GPU Compute Mode:', compute_mode)
                        printed_compute_mode = True
                    if compute_mode in ['Exclusive_Process', 'Exclusive_Thread'] or os.environ.get('use_empty_gpu'):
                        if not any_processes:
                            use_gpu_idx = idx
                            break
                    elif compute_mode == 'Default':
                        if GPU_MEMORY_USAGE != 'growth':
                            free_memory = int(re.search(r'(\d+) MiB', gpu_data['FB Memory Usage']['Free']).group(1))
                            if free_memory > 2.5 * GPU_MEMORY_USAGE:
                                use_gpu_idx = idx
                                break
                        else:
                            use_gpu_idx = idx
                            break
                    elif compute_mode == 'Prohibited':
                        continue
                    else:
                        raise NotImplementedError(f'Unknown compute mode: {compute_mode}.')
                else:
                    # for-else: no suitable GPU found; wait and retry.
                    print(datetime.datetime.now().strftime("%H:%M") + ': All GPUs are currently in use.')
                    sleep(300)
                    continue
            os.environ["CUDA_VISIBLE_DEVICES"] = str(use_gpu_idx)
            print('Using GPU', f'{use_gpu_idx}:', list(data['Attached GPUs'].values())[use_gpu_idx]['Product Name'])
            break

        import tensorflow as tf
        # Limit GPU memory usage: either grow allocations on demand or pin a
        # fixed-size virtual device, depending on GPU_MEMORY_USAGE.
        # gpu_memory_limit = 3.5 * 1024
        for gpu in tf.config.experimental.list_physical_devices('GPU'):
            if GPU_MEMORY_USAGE == 'growth':
                tf.config.experimental.set_memory_growth(gpu, True)
            else:
                tf.config.experimental.set_virtual_device_configuration(
                    gpu,
                    [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=GPU_MEMORY_USAGE)])
        # tf.config.experimental.set_virtual_device_configuration(gpus[use_gpu_idx], [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=2)])

        # Limit RAM usage to two thirds of the detected total memory, using
        # the platform-specific helpers if they are installed.
        try:
            from lib.memory_limit_windows import create_job, limit_memory, assign_job
            import psutil

            ram_limit = psutil.virtual_memory().total * 2 // 3
            print('Limiting RAM usage to {0:,} Bytes.'.format(ram_limit))
            assign_job(create_job())
            limit_memory(ram_limit)
        except ModuleNotFoundError:
            try:
                from lib.memory_limit_linux import limit_memory, get_memory

                ram_limit = get_memory() * 2 // 3
                print('Limiting RAM usage to {0:,} Bytes.'.format(ram_limit))
                limit_memory(ram_limit)
            except ModuleNotFoundError:
                print('WARNING: Setting memory limit failed. '
                      'This can happen if you are neither on Windows nor on Linux, '
                      'or if you have forgotten to install some dependencies.')

        cls.limited = True


MemoryLimiter.limit_memory_usage()
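
# The call above runs once when this module is first imported, so importing it
# is enough to pick a free GPU (via CUDA_VISIBLE_DEVICES) and install the RAM
# cap before TensorFlow is used; later calls are no-ops because `limited` is set.
# Illustrative usage from another script (assumed caller, not part of this file):
#     import memory_control  # imported for its side effects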


def tf_memory_leak_cleanup():
    # Work around TensorFlow/Keras objects that keep references alive after a
    # run: clear the py_func registry held by each graph, stop generator
    # enqueuers, then force a garbage collection pass.
    import tensorflow
    for obj in gc.get_objects():
        if isinstance(obj, tensorflow.Graph):
            if hasattr(obj, '_py_funcs_used_in_graph'):
                del obj._py_funcs_used_in_graph[:]
        if isinstance(obj, tensorflow.keras.utils.GeneratorEnqueuer):
            obj.stop()
    gc.collect()
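

# tf_memory_leak_cleanup() is intended to be called between successive model
# builds or training runs (e.g. in a hyperparameter search loop) so that stale
# graphs do not accumulate. Illustrative sketch with hypothetical helpers, not
# something this module provides:
#     for params in search_space:
#         train_and_evaluate(params)
#         tf_memory_leak_cleanup()

if __name__ == '__main__':
    # Minimal smoke test (an illustrative addition, not part of the original
    # behaviour): print the parsed nvidia-smi tables when this file is run
    # directly. Requires nvidia-smi on PATH.
    try:
        print(NVLog().as_table())
    except FileNotFoundError:
        print('nvidia-smi not found; nothing to display.')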