statistics.py 8.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277
  1. import math
  2. import os
  3. from typing import List, Union
  4. import numpy
  5. import pandas
  6. from pandas import DataFrame
  7. from tool_lib.tuned_pandasql import TunedPandaSQL
  8. from tool_lib.util import my_tabulate, EBC, EBE
  9. class Category(EBE):
  10. UNKNOWN = 'UNKNOWN'
  11. OPERATIVE = 'OPERATIVE'
  12. MEETING = 'MEETING'
  13. PLANNING = 'PLANNING'
  14. STRATEGY = 'STRATEGY'
  15. CONTROLLING = 'CONTROLLING'
  16. AUTOMATION = 'AUTOMATION'
  17. WASTED_TIME = 'WASTED_TIME'
  18. NOT_AVAILABLE = 'NOT_AVAILABLE'
  19. @classmethod
  20. def determine_category(cls, task: Union[str, float]) -> 'Category':
  21. task = str(task)
  22. if task == 'nan':
  23. return cls.NOT_AVAILABLE
  24. elif task == 'root':
  25. return cls.NOT_AVAILABLE
  26. elif 'meeting' in task.lower():
  27. return cls.MEETING
  28. elif 'eckard' in task.lower() or 'eckhard' in task.lower():
  29. return cls.MEETING
  30. elif 'unternehmensberatung' in task.lower():
  31. return cls.MEETING
  32. elif 'Notar' in task:
  33. return cls.MEETING
  34. elif 'wirschaftsbüro' in task.lower() or 'wirtschaftsbüro' in task.lower():
  35. return cls.MEETING
  36. elif 'controlling' in task.lower():
  37. return cls.CONTROLLING
  38. elif 'strateg' in task.lower():
  39. return cls.STRATEGY
  40. elif 'arbeitserfassungstool' in task.lower():
  41. return cls.AUTOMATION
  42. elif 'worktime table' in task.lower():
  43. return cls.AUTOMATION
  44. elif 'automat' in task.lower():
  45. return cls.AUTOMATION
  46. elif 'template' in task.lower():
  47. return cls.AUTOMATION
  48. elif 'rechnung' in task.lower():
  49. return cls.AUTOMATION
  50. elif 'operativ' in task.lower():
  51. return cls.OPERATIVE
  52. elif 'website' in task.lower():
  53. return cls.OPERATIVE
  54. elif 'bunq' in task.lower():
  55. return cls.WASTED_TIME
  56. elif task == 'root/Master-Arbeitsstunden ':
  57. return cls.OPERATIVE
  58. elif 'planungsassistent' in task.lower():
  59. return cls.OPERATIVE
  60. elif 'telefonat' in task.lower():
  61. return cls.OPERATIVE
  62. elif 'umfrage' in task.lower():
  63. return cls.OPERATIVE
  64. elif 'code cleaning' in task.lower():
  65. return cls.OPERATIVE
  66. elif 'economy_calendar' in task.lower() or 'economy calendar' in task.lower():
  67. return cls.OPERATIVE
  68. elif 'tradingbot' in task.lower() or 'trading bot' in task.lower():
  69. return cls.OPERATIVE
  70. elif 'optiwae' in task.lower():
  71. return cls.OPERATIVE
  72. elif 'twilio' in task.lower():
  73. return cls.OPERATIVE
  74. elif 'linkguide' in task.lower():
  75. return cls.OPERATIVE
  76. elif 'plan' in task.lower():
  77. return cls.PLANNING
  78. elif 'tools' in task.lower():
  79. return cls.AUTOMATION
  80. else:
  81. return cls.UNKNOWN
  82. unknown_categories = set()
  83. def category_from_row(row):
  84. result = Category.determine_category(row['Task']).value
  85. if result == Category.UNKNOWN.value:
  86. unknown_categories.add(row['Task'])
  87. return result
  88. def load_from_disk():
  89. dataframes = []
  90. this_dir = os.path.dirname(__file__)
  91. for xlsx_path in os.listdir(os.path.join(this_dir, 'time_recorded_tables')):
  92. if not xlsx_path.endswith('.xlsx'):
  93. continue
  94. if xlsx_path.startswith('~$'):
  95. continue
  96. if 'example' in xlsx_path:
  97. continue
  98. dataframes.append(pandas.read_excel(os.path.join(this_dir, 'time_recorded_tables', xlsx_path)))
  99. result = pandas.concat(dataframes)
  100. result['Datum'] = pandas.to_datetime(result['Datum'])
  101. result['Kalenderwoche'] = result['Datum'].dt.strftime('%V')
  102. result.sort_values(by=['Datum'], inplace=True)
  103. return result
  104. def get_date(row):
  105. if pandas.isnull(row['Datum']):
  106. return 'NULL'
  107. return row['Datum'].strftime('%d.%m.%Y')
  108. def get_name(row):
  109. return row['Name']
  110. def get_year(row):
  111. if pandas.isnull(row['Datum']):
  112. return 'NULL'
  113. return row['Datum'].strftime('%Y')
  114. def get_month(row):
  115. if pandas.isnull(row['Datum']):
  116. return 'NULL'
  117. return row['Datum'].strftime('%m.%Y')
  118. def get_task(row):
  119. return row['Task']
  120. def get_work_time(row):
  121. return row['Arbeitszeit']
  122. def get_work_time_minutes(row):
  123. return row['Arbeitszeit in Min']
  124. def get_income(row):
  125. return row['Einkommen']
  126. def get_overtime(row):
  127. return row['Überstunden']
  128. def raw_df():
  129. MAX_TASK_PARTS = 4
  130. def get_calendar_week(row):
  131. return row['Kalenderwoche']
  132. columns = {
  133. 'Name': get_name,
  134. 'Datum': get_date,
  135. 'Monat': get_month,
  136. 'Jahr': get_year,
  137. 'Task': get_task,
  138. 'Kalenderwoche': get_calendar_week,
  139. 'Arbeitszeit': get_work_time,
  140. 'Arbeitszeit in Minuten': get_work_time_minutes,
  141. 'Einkommen': get_income,
  142. 'Überstunden': get_overtime,
  143. 'Kategorie': category_from_row,
  144. **{
  145. f'Task Präfix {i}': lambda row, i=i: get_task_prefix(row, i)
  146. for i in range(1, MAX_TASK_PARTS)
  147. }
  148. }
  149. combinations: List[tuple] = [
  150. ('Name', 'Jahr'),
  151. # ('Name', 'Task'),
  152. # ('Name', 'Task Präfix 1'),
  153. ('Name', 'Task Präfix 2'),
  154. # ('Name', 'Task Präfix 3'),
  155. ]
  156. for combination in combinations:
  157. new_column_name = '_by_'.join(combination)
  158. columns[new_column_name] = lambda *args, combination=combination, **kwargs: ', '.join(str(columns[column](*args, **kwargs)) for column in combination)
  159. df = load_from_disk()
  160. table = [
  161. [columns[column](row) for column in columns]
  162. for row in df.to_dict('records')
  163. ]
  164. return DataFrame(table, columns=list(columns))
  165. def get_task_prefix(row, prefix_length):
  166. return "/".join(str(row['Task']).split('/')[:prefix_length])
  167. def preprocess_for_sql(x):
  168. if isinstance(x, int):
  169. return str(x)
  170. elif isinstance(x, str):
  171. x = x.replace("'", r"\'")
  172. return f"'{x}'"
  173. elif isinstance(x, float):
  174. return str(x)
  175. elif numpy.isscalar(x):
  176. return str(x)
  177. elif x is None:
  178. return 'NULL'
  179. else:
  180. raise NotImplementedError(type(x))
  181. def first_column_values(values):
  182. return list(values[values.columns[0]])
  183. def compute_rows(max_distinct_values=120, ):
  184. df = raw_df()
  185. result = []
  186. if df.shape[0] == 0:
  187. return result
  188. relevant_columns = [column for column in df.columns
  189. if not column.startswith('_')]
  190. distinct_column_values = TunedPandaSQL().multiquery(
  191. {column: f'SELECT DISTINCT "{column}" FROM df' for column in df.columns},
  192. {'df': df[relevant_columns]}
  193. )
  194. resultss = TunedPandaSQL().multiquery(
  195. {
  196. (column, v): f'SELECT Arbeitszeit, "Arbeitszeit in Minuten", Einkommen, Überstunden '
  197. f'FROM df WHERE "{column}" IS {v}'
  198. for column in relevant_columns
  199. for column_value in first_column_values(distinct_column_values[column])
  200. if len(distinct_column_values[column]) <= max_distinct_values
  201. for v in [preprocess_for_sql(column_value)]
  202. if v != 'nan'
  203. },
  204. {'df': df[relevant_columns]}
  205. )
  206. for column in relevant_columns:
  207. values: DataFrame = distinct_column_values[column]
  208. if len(values) > max_distinct_values:
  209. print(f'{len(values)} distinct values in column {column}')
  210. for (column, v), results in resultss.items():
  211. if column in ['Arbeitszeit', 'Arbeitszeit in Minuten', 'Einkommen', 'Überstunden']:
  212. continue
  213. new_row = [
  214. column,
  215. v,
  216. results['Arbeitszeit'].sum(),
  217. results['Arbeitszeit in Minuten'].sum(),
  218. results['Einkommen'].sum(),
  219. results['Überstunden'].sum(),
  220. ]
  221. result.append(new_row)
  222. return result, ['Key', 'Value', 'Arbeitszeit', 'Arbeitszeit in Minuten', 'Einkommen', 'Überstunden']
  223. rows, columns = compute_rows()
  224. rows = sorted(rows, key=lambda row: (row[0], row[1]))
  225. print(my_tabulate(data=rows, headers=columns))
  226. print()
  227. print('Unspecified categories:')
  228. for c in unknown_categories:
  229. print(' -', repr(c))