import datetime
import gc
import logging
import os
import re
import subprocess
from collections import OrderedDict
from typing import Union
from time import sleep

import psutil

try:
    from config import USE_MEMORY_CONTROL
except ImportError:
    USE_MEMORY_CONTROL = False
try:
    from config import GPU_MEMORY_USAGE
except ImportError:
    GPU_MEMORY_USAGE: Union[float, str] = 'growth'
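# The `config` module is project-specific and not part of this file; a minimal
# sketch of the two optional settings read above (names taken from the imports,
# example values assumed):
#
#     # config.py
#     USE_MEMORY_CONTROL = True      # enable the GPU/RAM limiting logic below
#     GPU_MEMORY_USAGE = 'growth'    # or a per-process GPU limit in MiB, e.g. 3.5 * 1024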

class NVLog(dict):
    """Parse the output of `nvidia-smi -q` into a nested dictionary."""

    __indent_re__ = re.compile('^ *')
    __version_re__ = re.compile(r'v([0-9.]+)$')

    def __init__(self):
        super().__init__()

        lines = run_cmd(['nvidia-smi', '-q'])
        lines = lines.splitlines()
        while '' in lines:
            lines.remove('')

        # Walk the indented key/value report and rebuild it as nested dicts,
        # keeping a stack (`path`) of the containers currently being filled.
        path = [self]
        self['version'] = self.__version__()
        for line in lines[1:]:
            indent = NVLog.__get_indent__(line)
            line = NVLog.__parse_key_value_pair__(line)
            while indent < len(path) * 4 - 4:
                path.pop()
            cursor = path[-1]
            if len(line) == 1:
                # A key without a value opens a new section ('Processes' becomes a list).
                if line[0] == 'Processes':
                    cursor[line[0]] = []
                else:
                    cursor[line[0]] = {}
                cursor = cursor[line[0]]
                path.append(cursor)
            elif len(line) == 2:
                if line[0] in ['GPU instance ID', 'Compute instance ID']:
                    continue
                if line[0] == 'Process ID':
                    # Each 'Process ID' starts a new entry in the process list.
                    cursor.append({})
                    cursor = cursor[-1]
                    path.append(cursor)
                cursor[line[0]] = line[1]

        # Collect the per-GPU sections under a single 'Attached GPUs' mapping.
        self['Attached GPUs'] = OrderedDict()
        keys = list(self.keys())
        for i in keys:
            if i.startswith('GPU '):
                self['Attached GPUs'][i] = self[i]
                del self[i]

    @staticmethod
    def __get_indent__(line):
        return len(NVLog.__indent_re__.match(line).group())

    @staticmethod
    def __parse_key_value_pair__(line):
        result = line.split(' : ')
        result[0] = result[0].strip()
        if len(result) > 1:
            try:
                result[1] = int(result[1])
            except ValueError:
                pass
            if result[1] in ['N/A', 'None']:
                result[1] = None
            if result[1] in ['Disabled', 'No']:
                result[1] = False
        return result

    def __get_processes__(self):
        processes = []
        for i, gpu in enumerate(self['Attached GPUs']):
            gpu = self['Attached GPUs'][gpu]
            if gpu['Processes']:
                for j in gpu['Processes']:
                    processes.append((i, j))
        return processes

    @staticmethod
    def __version__():
        lines = run_cmd(['nvidia-smi', '-h'])
        lines = lines.splitlines()
        result = NVLog.__version_re__.search(lines[0]).group(1)
        return result

    def gpu_table(self):
        output = []
        output.append(self['Timestamp'])
        output.append('+-----------------------------------------------------------------------------+')
        values = []
        values.append(self['version'])
        values.append(self['Driver Version'])
        if 'CUDA Version' in self:
            values.append(self['CUDA Version'])
        else:
            values.append('N/A')
        output.append('| NVIDIA-SMI %s       Driver Version: %s       CUDA Version: %-5s    |' % tuple(values))
        output.append('|-------------------------------+----------------------+----------------------+')
        output.append('| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |')
        output.append('| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |')
        output.append('|===============================+======================+======================|')
        for i, gpu in enumerate(self['Attached GPUs']):
            gpu = self['Attached GPUs'][gpu]
            values = []
            values.append(i)
            values.append(gpu['Product Name'])
            values.append('On' if gpu['Persistence Mode'] else 'Off')
            values.append(gpu['PCI']['Bus Id'])
            values.append('On' if gpu['Display Active'] else 'Off')
            output.append('|   %d  %-19s %3s  | %s %3s |                  N/A |' % tuple(values))
            values = []
            values.append(gpu['Fan Speed'].replace(' ', ''))
            values.append(gpu['Temperature']['GPU Current Temp'].replace(' ', ''))
            values.append(gpu['Performance State'])
            values.append(int(float(gpu['Power Readings']['Power Draw'][:-2])))
            values.append(int(float(gpu['Power Readings']['Power Limit'][:-2])))
            values.append(gpu['FB Memory Usage']['Used'].replace(' ', ''))
            values.append(gpu['FB Memory Usage']['Total'].replace(' ', ''))
            values.append(gpu['Utilization']['Gpu'].replace(' ', ''))
            values.append(gpu['Compute Mode'])
            output.append('| %3s  %3s   %s     %3dW / %3dW |  %8s / %8s |      %4s  %8s  |' % tuple(values))
        output.append('+-----------------------------------------------------------------------------+')
        return '\n'.join(output)

    def processes_table(self):
        output = []
        output.append('+-----------------------------------------------------------------------------+')
        output.append('| Processes:                                                       GPU Memory |')
        output.append('|  GPU    PID     Type   Process name                               Usage     |')
        output.append('|=============================================================================|')
        processes = self.__get_processes__()
        if len(processes) == 0:
            output.append('|  No running processes found                                                 |')
        for i, process in processes:
            values = []
            values.append(i)
            values.append(process['Process ID'])
            values.append(process['Type'])
            if len(process['Name']) > 42:
                values.append(process['Name'][:39] + '...')
            else:
                values.append(process['Name'])
            values.append(process['Used GPU Memory'].replace(' ', ''))
            output.append('|   %2d   %5d   %6s   %-42s %8s |' % tuple(values))
        output.append('+-----------------------------------------------------------------------------+')
        return '\n'.join(output)

    def as_table(self):
        output = []
        output.append(self.gpu_table())
        output.append('')
        output.append(self.processes_table())
        return '\n'.join(output)
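
# A minimal usage sketch for NVLog (assumes `nvidia-smi` is on the PATH; the
# keys mirror the section names of `nvidia-smi -q`):
#
#     log = NVLog()
#     print(log.as_table())                 # nvidia-smi style summary table
#     for name, gpu in log['Attached GPUs'].items():
#         print(name, gpu['FB Memory Usage']['Used'])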

class NVLogPlus(NVLog):
    """NVLog variant that also shows the owning user and command line per process."""

    def processes_table(self):
        output = ['+-----------------------------------------------------------------------------+',
                  '| Processes:                                                       GPU Memory |',
                  '|  GPU    PID     User       Process name                           Usage     |',
                  '|=============================================================================|']
        processes = self.__get_processes__()
        if len(processes) == 0:
            output.append('|  No running processes found                                                 |')
        for i, process in processes:
            values = []
            values.append(i)
            values.append(process['Process ID'])
            # Resolve the owning user and full command line of the process via psutil.
            p = psutil.Process(process['Process ID'])
            with p.oneshot():
                values.append(p.username()[:8].center(8))
                command = p.cmdline()
                command[0] = os.path.basename(command[0])
                command = ' '.join(command)
                if len(command) > 42:
                    values.append(command[:39] + '...')
                else:
                    values.append(command)
            values.append(process['Used GPU Memory'].replace(' ', ''))
            output.append('|  %2d   %5d  %8s   %-42s %8s |' % tuple(values))
        output.append('+-----------------------------------------------------------------------------+')
        return '\n'.join(output)

def run_cmd(cmd):
    return subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE).stdout.decode('utf-8')
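
# NVLogPlus additionally resolves the owning user and full command line of each
# GPU process through psutil; a usage sketch (assumes permission to inspect
# those processes):
#
#     print(NVLogPlus().processes_table())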

class MemoryLimiter:
    limited = False

    @classmethod
    def limit_memory_usage(cls, verbose=1):
        if not USE_MEMORY_CONTROL:
            print('memory_control module disabled.')
            return
        if cls.limited:
            if verbose:
                print('Already limited memory usage. Skipping...')
            return
        use_gpu_idx = None
        printed_compute_mode = False
        while True:  # until a GPU is free
            try:
                data = NVLog()
            except FileNotFoundError:
                print('WARNING: nvidia-smi is not available')
                break
            if 'use_specific_gpu' in os.environ:
                print(f'use_specific_gpu = {os.environ["use_specific_gpu"]}')
                use_gpu_idx = int(os.environ['use_specific_gpu'])
            else:
                # Pick a free GPU, scanning from the last one; the criteria depend on the compute mode.
                for idx, gpu_data in reversed(list(enumerate(data['Attached GPUs'].values()))):
                    any_processes = (
                        gpu_data['Processes'] is not None
                        or gpu_data['Utilization']['Memory'] != '0 %'
                        or gpu_data['FB Memory Usage']['Used'] != '0 MiB'
                    )
                    compute_mode = gpu_data['Compute Mode']
                    if not printed_compute_mode:
                        print('GPU Compute Mode:', compute_mode)
                        printed_compute_mode = True
                    if compute_mode in ['Exclusive_Process', 'Exclusive_Thread'] or os.environ.get('use_empty_gpu'):
                        if not any_processes:
                            use_gpu_idx = idx
                            break
                    elif compute_mode == 'Default':
                        if GPU_MEMORY_USAGE != 'growth':
                            free_memory = int(re.search(r'(\d+) MiB', gpu_data['FB Memory Usage']['Free']).group(1))
                            if free_memory > 2.5 * GPU_MEMORY_USAGE:
                                use_gpu_idx = idx
                                break
                        else:
                            use_gpu_idx = idx
                            break
                    elif compute_mode == 'Prohibited':
                        continue
                    else:
                        raise NotImplementedError(f'Unknown compute mode: {compute_mode}.')
                else:
                    # No suitable GPU found: wait five minutes and poll again.
                    print(datetime.datetime.now().strftime("%H:%M") + ': All GPUs are currently in use.')
                    sleep(300)
                    continue
            os.environ["CUDA_VISIBLE_DEVICES"] = str(use_gpu_idx)
            print('Using GPU', f'{use_gpu_idx}:', list(data['Attached GPUs'].values())[use_gpu_idx]['Product Name'])
            break

        import tensorflow as tf

        # limit GPU memory usage
        # gpu_memory_limit = 3.5 * 1024
        for gpu in tf.config.experimental.list_physical_devices('GPU'):
            if GPU_MEMORY_USAGE == 'growth':
                tf.config.experimental.set_memory_growth(gpu, True)
            else:
                tf.config.experimental.set_virtual_device_configuration(
                    gpu,
                    [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=GPU_MEMORY_USAGE)])
        # tf.config.experimental.set_virtual_device_configuration(gpus[use_gpu_idx], [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=2)])

        # limit RAM usage
        try:
            from lib.memory_limit_windows import create_job, limit_memory, assign_job
            import psutil

            ram_limit = psutil.virtual_memory().total * 2 // 3
            print('Limiting RAM usage to {0:,} Bytes.'.format(ram_limit))
            assign_job(create_job())
            limit_memory(ram_limit)
        except ModuleNotFoundError:
            try:
                from lib.memory_limit_linux import limit_memory, get_memory

                ram_limit = get_memory() * 2 // 3
                print('Limiting RAM usage to {0:,} Bytes.'.format(ram_limit))
                limit_memory(ram_limit)
            except ModuleNotFoundError:
                print('WARNING: Setting memory limit failed. '
                      'This can happen if you are neither on Windows nor on Linux, '
                      'or if you have forgotten to install some dependencies.')
        cls.limited = True

# Apply the GPU selection and memory limits once, when this module is imported.
MemoryLimiter.limit_memory_usage()
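
# A specific GPU can also be pinned from the outside via the `use_specific_gpu`
# environment variable read by limit_memory_usage(); a sketch (the importing
# module's name is assumed):
#
#     import os
#     os.environ['use_specific_gpu'] = '0'   # force GPU index 0
#     import memory_control                  # importing this module applies the limits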

def tf_memory_leak_cleanup():
    import tensorflow
    for obj in gc.get_objects():
        # TensorFlow graphs can keep py_func closures alive; drop those references.
        if isinstance(obj, tensorflow.Graph):
            if hasattr(obj, '_py_funcs_used_in_graph'):
                del obj._py_funcs_used_in_graph[:]
        # Stop generator enqueuers that are still holding on to worker threads.
        if isinstance(obj, tensorflow.keras.utils.GeneratorEnqueuer):
            obj.stop()
    gc.collect()
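

if __name__ == '__main__':
    # Minimal manual check (a sketch, assuming `nvidia-smi` is available on this
    # machine): print the same summary tables that gpu_table() and
    # processes_table() build above.
    print(NVLog().as_table())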
|