Spaces:
Runtime error
Runtime error
| import streamlit as st | |
| import pandas as pd | |
| import linktransformer as lt | |
# Serialize a DataFrame into bytes suitable for a download widget.
def convert_df_to_csv(df):
    """Return *df* rendered as CSV text and encoded as UTF-8 bytes.

    The CSV is produced with the ``to_csv`` defaults, so the index is
    written as the first column.
    """
    csv_text = df.to_csv()
    return csv_text.encode('utf-8')
# Page header: the app title followed by a short blurb describing the demo.
page_title = 'Merge Dataframes using LinkTransformer'
page_blurb = (
    'LinkTransformer supports several AI-powered data wrangling operations - '
    'here is an example that allows you to use LLMs to merge data.'
)
st.title(page_title)
st.write(page_blurb)
# Function to load DataFrame
def load_dataframe(upload):
    """Load an uploaded file into a DataFrame.

    Args:
        upload: A file-like object exposing a ``name`` attribute (such as
            the value returned by ``st.file_uploader``), or ``None`` when
            nothing has been uploaded.

    Returns:
        The parsed DataFrame. Files whose name ends in ``.csv`` (in any
        letter case) are read with ``pd.read_csv``; everything else is
        read with ``pd.read_excel``. An empty DataFrame is returned when
        ``upload`` is ``None``.
    """
    if upload is None:
        return pd.DataFrame()
    # Compare case-insensitively and include the dot, so "DATA.CSV" is
    # parsed as CSV instead of being handed to the Excel reader.
    if upload.name.lower().endswith('.csv'):
        return pd.read_csv(upload)
    return pd.read_excel(upload)
# Options for DataFrame 1
df1_upload = st.file_uploader("Upload DataFrame 1 (CSV)", type=['csv'], key='df1_upload')
# Options for DataFrame 2
df2_upload = st.file_uploader("Upload DataFrame 2 (CSV)", type=['csv'], key='df2_upload')

# Load the DataFrames. load_dataframe returns an empty DataFrame (never None)
# when nothing has been uploaded.
df1 = load_dataframe(df1_upload)
df2 = load_dataframe(df2_upload)

# Show a preview only for files that were actually uploaded. Testing the
# DataFrame itself against None would always pass and render an empty table.
if df1_upload is not None:
    st.write("DataFrame 1 Preview:")
    st.dataframe(df1.head())
if df2_upload is not None:
    st.write("DataFrame 2 Preview:")
    st.dataframe(df2.head())

# Model selection
model_path = st.text_input("Model path (HuggingFace)", value="all-MiniLM-L6-v2")
st.write("We have trained several record linkage models! Just copy the Hugging Face model path from the [model zoo](https://linktransformer.github.io/).")
# More on model selection available on https://linktransformer.github.io/

if df1_upload is not None and df2_upload is not None:
    if not df1.empty and not df2.empty:
        # Columns to match on; the first column of each table is pre-selected.
        columns_df1 = df1.columns.tolist()
        columns_df2 = df2.columns.tolist()
        selected_columns_df1 = st.multiselect("Select columns from DataFrame 1 to match on:", columns_df1, default=columns_df1[:1])
        selected_columns_df2 = st.multiselect("Select columns from DataFrame 2 to match on:", columns_df2, default=columns_df2[:1])

        # Perform merge
        if st.button("Merge DataFrames"):
            if not selected_columns_df1 or not selected_columns_df2:
                # The user can clear a multiselect; do not attempt a merge
                # with no key columns on one side.
                st.warning("Please select at least one column from each DataFrame to match on.")
            else:
                model = lt.LinkTransformer(model_path)
                df_lm_matched = lt.merge(
                    df1,
                    df2,
                    merge_type='1:m',
                    on=None,
                    model=model,
                    left_on=selected_columns_df1,
                    right_on=selected_columns_df2,
                )
                st.write("Merged DataFrame Preview:")
                st.dataframe(df_lm_matched.head())

                # Download button for merged DataFrame
                csv = convert_df_to_csv(df_lm_matched)
                st.download_button(
                    label="Download as CSV",
                    data=csv,
                    file_name='merged_dataframe.csv',
                    mime='text/csv',
                )
    else:
        st.write("It appears that your dataframes are empty. Please upload valid dataframes.")
else:
    # Only file upload is offered by this app, so the prompt asks for uploads.
    st.write("Please upload both DataFrames.")

# Add website and citation
st.write("Note that this space only supports CPU usage and is only recommended on small datasets. If you have access to the GPU, check out our python [package](https://github.com/dell-research-harvard/linktransformer/)!")
st.write("For more information and advanced usage, please visit the [LinkTransformer website](https://linktransformer.github.io/).")
st.write("If you use LinkTransformer in your research, please cite the following paper: [LinkTransformer: A Unified Package for Record Linkage with Transformer Language Models](https://arxiv.org/abs/2309.00789)")