-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathcategory_ratio.py
38 lines (32 loc) · 1.57 KB
/
category_ratio.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import pandas as pd
import numpy as np
# Per-category ratios.
def prepare_data(data):
    """Build the table of per-category percentages used for ranking.

    The share of each distinct value in the '카테고리' (category) column is
    computed as a percentage of all non-null rows.  The categories
    "간편 결제" (easy payment), "이체" (transfer) and "기타" (other) are left
    out of the returned table, but remain in the returned ratio mapping.

    Args:
        data: DataFrame that contains a '카테고리' column.

    Returns:
        A 3-tuple ``(df, original_ratios, exclude_categories)``:
        ``df`` has the columns '카테고리', '원래 비율' (original ratio, in
        percent) and '가중치' (weight, always 1.0) for the kept categories;
        ``original_ratios`` maps every category, excluded ones included, to
        its percentage; ``exclude_categories`` is the list of excluded names.
    """
    excluded_names = ["간편 결제", "이체", "기타"]

    # value_counts(normalize=True) yields fractions; scale them to percent.
    percent_by_category = (data['카테고리'].value_counts(normalize=True) * 100).to_dict()

    kept_pairs = [
        (name, percent)
        for name, percent in percent_by_category.items()
        if name not in excluded_names
    ]

    table = pd.DataFrame({
        '카테고리': [name for name, _ in kept_pairs],
        '원래 비율': [percent for _, percent in kept_pairs],
        '가중치': [1.0] * len(kept_pairs),
    })
    return table, percent_by_category, excluded_names
# Spread the share of the excluded categories over the largest remaining ones.
def redistribute_excluded_categories(df, original_ratios, exclude_categories):
    """Add the excluded categories' combined ratio to the top categories.

    The ratios of ``exclude_categories`` are summed (a category missing from
    ``original_ratios`` counts as 0) and that total is split evenly among the
    rows of ``df`` with the largest '원래 비율' (original ratio) values, at
    most three of them.

    Args:
        df: DataFrame with a numeric '원래 비율' column.  It is modified in
            place and also returned.
        original_ratios: Mapping of category name to ratio.
        exclude_categories: Iterable of category names whose ratios are
            redistributed.

    Returns:
        The same ``df`` object with the updated '원래 비율' column.
    """
    # Nothing to redistribute onto; also avoids calling nlargest on an empty
    # (possibly object-dtype) column.
    if df.empty:
        return df

    excluded_total_ratio = sum(original_ratios.get(cat, 0) for cat in exclude_categories)

    top_indices = df.nlargest(3, '원래 비율').index
    if len(top_indices) == 0:
        return df

    # Divide by the number of rows actually selected rather than a fixed 3,
    # so that no part of the excluded share is lost when fewer than three
    # categories remain.
    df.loc[top_indices, '원래 비율'] += excluded_total_ratio / len(top_indices)
    return df
# Category with the highest ratio.
def get_top_category(data):
    """Return the name of the category with the largest adjusted ratio.

    The per-category table is built with ``prepare_data``, passed through
    ``redistribute_excluded_categories``, and the '카테고리' (category) value
    of the row whose '원래 비율' (original ratio) is the maximum is returned.

    Args:
        data: Input forwarded unchanged to ``prepare_data``.

    Returns:
        The '카테고리' value of the row with the highest '원래 비율'.
    """
    # prepare_data returns (table, ratios, excluded) in exactly the order the
    # redistribution step takes its arguments.
    prepared = prepare_data(data)
    adjusted = redistribute_excluded_categories(*prepared)

    ratio_column = adjusted['원래 비율']
    return adjusted.loc[ratio_column.idxmax(), '카테고리']
# Data loading helper.
def load_data(filepath):
    """Read an Excel file and return its contents as a DataFrame.

    Args:
        filepath: Path of the file, passed to ``pandas.read_excel``.

    Returns:
        The loaded DataFrame, or ``None`` if reading raised any exception
        (the error is printed to standard output in that case).
    """
    try:
        loaded = pd.read_excel(filepath)
    except Exception as e:
        # Best-effort loader: report the problem and signal failure with None.
        print(f"Error loading file: {e}")
        return None
    else:
        return loaded