statistics.py 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185
  1. import os
  2. from typing import List
  3. import numpy
  4. import pandas
  5. from pandas import DataFrame
  6. from tool_lib.tuned_pandasql import TunedPandaSQL
  7. from tool_lib.util import my_tabulate
  8. def load_from_disk():
  9. dataframes = []
  10. this_dir = os.path.dirname(__file__)
  11. for xlsx_path in os.listdir(os.path.join(this_dir, 'time_recorded_tables')):
  12. if not xlsx_path.endswith('.xlsx'):
  13. continue
  14. if xlsx_path.startswith('~$'):
  15. continue
  16. if 'example' in xlsx_path:
  17. continue
  18. dataframes.append(pandas.read_excel(os.path.join(this_dir, 'time_recorded_tables', xlsx_path)))
  19. result = pandas.concat(dataframes)
  20. result['Datum'] = pandas.to_datetime(result['Datum'])
  21. result['Kalenderwoche'] = result['Datum'].dt.strftime('%V')
  22. result.sort_values(by=['Datum'], inplace=True)
  23. return result
  24. def get_date(row):
  25. if pandas.isnull(row['Datum']):
  26. return 'NULL'
  27. return row['Datum'].strftime('%d.%m.%Y')
  28. def get_name(row):
  29. return row['Name']
  30. def get_year(row):
  31. if pandas.isnull(row['Datum']):
  32. return 'NULL'
  33. return row['Datum'].strftime('%Y')
  34. def get_month(row):
  35. if pandas.isnull(row['Datum']):
  36. return 'NULL'
  37. return row['Datum'].strftime('%m.%Y')
  38. def get_task(row):
  39. return row['Task']
  40. def get_work_time(row):
  41. return row['Arbeitszeit']
  42. def get_work_time_minutes(row):
  43. return row['Arbeitszeit in Min']
  44. def get_income(row):
  45. return row['Einkommen']
  46. def get_overtime(row):
  47. return row['Überstunden']
  48. def raw_df():
  49. MAX_TASK_PARTS = 4
  50. def get_calendar_week(row):
  51. return row['Kalenderwoche']
  52. columns = {
  53. 'Name': get_name,
  54. 'Datum': get_date,
  55. 'Monat': get_month,
  56. 'Jahr': get_year,
  57. 'Task': get_task,
  58. 'Kalenderwoche': get_calendar_week,
  59. 'Arbeitszeit': get_work_time,
  60. 'Arbeitszeit in Minuten': get_work_time_minutes,
  61. 'Einkommen': get_income,
  62. 'Überstunden': get_overtime,
  63. **{
  64. f'Task Präfix {i}': lambda row, i=i: get_task_prefix(row, i)
  65. for i in range(1, MAX_TASK_PARTS)
  66. }
  67. }
  68. combinations: List[tuple] = [
  69. ('Name', 'Jahr'),
  70. ('Name', 'Task'),
  71. ('Name', 'Task Präfix 1'),
  72. ('Name', 'Task Präfix 2'),
  73. ('Name', 'Task Präfix 3'),
  74. ]
  75. for combination in combinations:
  76. new_column_name = '_by_'.join(combination)
  77. columns[new_column_name] = lambda *args, combination=combination, **kwargs: ', '.join(str(columns[column](*args, **kwargs)) for column in combination)
  78. df = load_from_disk()
  79. table = [
  80. [columns[column](row) for column in columns]
  81. for row in df.to_dict('records')
  82. ]
  83. return DataFrame(table, columns=list(columns))
  84. def get_task_prefix(row, prefix_length):
  85. return "/".join(str(row['Task']).split('/')[:prefix_length])
  86. def preprocess_for_sql(x):
  87. if isinstance(x, int):
  88. return str(x)
  89. elif isinstance(x, str):
  90. x = x.replace("'", r"\'")
  91. return f"'{x}'"
  92. elif isinstance(x, float):
  93. return str(x)
  94. elif numpy.isscalar(x):
  95. return str(x)
  96. elif x is None:
  97. return 'NULL'
  98. else:
  99. raise NotImplementedError(type(x))
  100. def first_column_values(values):
  101. return list(values[values.columns[0]])
  102. def compute_rows(max_distinct_values=120, ):
  103. df = raw_df()
  104. result = []
  105. if df.shape[0] == 0:
  106. return result
  107. relevant_columns = [column for column in df.columns
  108. if not column.startswith('_')]
  109. distinct_column_values = TunedPandaSQL().multiquery(
  110. {column: f'SELECT DISTINCT "{column}" FROM df' for column in df.columns},
  111. {'df': df[relevant_columns]}
  112. )
  113. resultss = TunedPandaSQL().multiquery(
  114. {
  115. (column, v): f'SELECT Arbeitszeit, "Arbeitszeit in Minuten", Einkommen, Überstunden '
  116. f'FROM df WHERE "{column}" IS {v}'
  117. for column in relevant_columns
  118. for column_value in first_column_values(distinct_column_values[column])
  119. if len(distinct_column_values[column]) <= max_distinct_values
  120. for v in [preprocess_for_sql(column_value)]
  121. if v != 'nan'
  122. },
  123. {'df': df[relevant_columns]}
  124. )
  125. for column in relevant_columns:
  126. values: DataFrame = distinct_column_values[column]
  127. if len(values) > max_distinct_values:
  128. print(f'{len(values)} distinct values in column {column}')
  129. for (column, v), results in resultss.items():
  130. if column in ['Arbeitszeit', 'Arbeitszeit in Minuten', 'Einkommen', 'Überstunden']:
  131. continue
  132. new_row = [
  133. column,
  134. v,
  135. results['Arbeitszeit'].sum(),
  136. results['Arbeitszeit in Minuten'].sum(),
  137. results['Einkommen'].sum(),
  138. results['Überstunden'].sum(),
  139. ]
  140. result.append(new_row)
  141. return result, ['Key', 'Value', 'Arbeitszeit', 'Arbeitszeit in Minuten', 'Einkommen', 'Überstunden']
  142. rows, columns = compute_rows()
  143. rows = sorted(rows, key=lambda row: (row[0], row[1]))
  144. print(my_tabulate(data=rows, headers=columns))