change to simple_salesforce for data load
@@ -3,12 +3,12 @@
     "useSeparatedCSVFiles": true,
     "pollingQueryTimeoutMs": 1000000,
     "bulkApiVersion": "2.0",
     "parallelRestJobs": 2,
     "queryBulkApiThreshold": 100,
     "objectSets": [
         {
             "objects": [
                 {
-                    "query": "SELECT Id, City__c, Country__c, GeoY__c, GeoX__c, PostalCode__c, Street__c, Extension__c, HouseNo__c, FlatNo__c, Floor__c FROM SCInstalledBaseLocation__c WHERE Country__c = 'NL'",
+                    "query": "SELECT Id, City__c, Country__c, GeoY__c, GeoX__c, PostalCode__c, Street__c, Extension__c, HouseNo__c, FlatNo__c, Floor__c FROM SCInstalledBaseLocation__c WHERE Country__c = 'NL' LIMIT 1000",
                     "externalId": "Name",
                     "operation": "Readonly"
                 }
@@ -19,7 +19,18 @@
                     "query": "SELECT Id, Name, CommissioningDate__c,InstallationDate__c,ProductEnergy__c, ProductUnitClass__c,ArticleNo__c,SerialNo__c, SerialNoException__c, ProductUnitType__c, InstalledBaseLocation__c FROM SCInstalledBase__c WHERE Country__c = 'NL'",
                     "externalId": "Name",
                     "operation": "Readonly",
-                    "excludedFromUpdateFields": ["InstalledBaseLocation__c"]
+                    "master": true,
+                    "excludedFromUpdateFields": ["InstalledBaseLocation__c"],
+                    "skipRecordsComparison": true,
+                    "parallelRestJobs": 4,
+                    "restApiBatchSize": 9500,
+                    "fieldMapping": [
+                        {
+                            "sourceField": "InstalledBaseLocation__c",
+                            "targetField": "Id",
+                            "targetObject": "SCInstalledBaseLocation__c"
+                        }
+                    ]
                 }
             ]
         },{
@@ -33,10 +44,10 @@
         },{
             "objects": [
                 {
-                    "query": "SELECT Id, Country, CountryCode, Street, City, ParentId PostalCode FROM Address WHERE CountryCode = 'NL'",
+                    "query": "SELECT Id, Country, CountryCode, Street, City, ParentId, PostalCode FROM Address WHERE CountryCode = 'NL'",
                     "externalId": "Name",
                     "operation": "Readonly",
-                    "excludedFields": ["ParentId"]
+                    "excludedFromUpdateFields": ["ParentId"]
                 }
             ]

@@ -48,6 +59,14 @@
                     "operation": "Readonly"
                 }
             ]
+        },{
+            "objects": [
+                {
+                    "query": "SELECT Id, Main_Product_Group__c, Family, MaterialType__c, Name, Product_Code__c, ProductCode, EAN_Product_Code__c FROM Product2",
+                    "externalId": "Name",
+                    "operation": "Readonly"
+                }
+            ]
         }
     ]
 }
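Assuming this export.json drives SFDMU (the Salesforce Data Move Utility), the added fieldMapping entry points the source lookup InstalledBaseLocation__c at the Id of the already-migrated SCInstalledBaseLocation__c target records. Conceptually it is an Id substitution during load; a minimal pandas sketch of the same idea, with invented column names and data:

    import pandas as pd

    # Invented data: substitute source lookup Ids with the matching target Ids,
    # the way the "fieldMapping" entry redirects InstalledBaseLocation__c -> Id
    source = pd.DataFrame({'Id': ['s1'], 'InstalledBaseLocation__c': ['locA']})
    target_locations = pd.DataFrame({'Id': ['t9'], 'SourceId': ['locA']})

    mapped = source.merge(target_locations, left_on='InstalledBaseLocation__c',
                          right_on='SourceId', how='left', suffixes=('', '_target'))
    source['InstalledBaseLocation__c'] = mapped['Id_target']
    print(source)  # the lookup now carries the target-org Id 't9'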
prepared_steps/1_extract_data/extract_via_simple_salesforce.py (new file, 137 lines)
@@ -0,0 +1,137 @@
# python extract_via_simple_salesforce.py \
#     --context qa2 \
#     --object_id Account \
#     --output_path extracted_data

import os

import pandas as pd
from dotenv import load_dotenv, find_dotenv
from simple_salesforce import Salesforce

def get_credentials(context):
    """
    Get credentials for a given context from the .env file

    Args:
        context (str): Context name (e.g., 'qa2', 'prod')

    Returns:
        dict: Credentials with username, password, security token, and domain
    """
    context = context.upper()

    # Initialize the credentials dictionary; 'test' targets sandbox logins
    credentials = {
        'USERNAME': None,
        'PASSWORD': None,
        'SECURITY_TOKEN': None,
        'DOMAIN': 'test'
    }

    # Load the .env file explicitly from one directory above
    env_file = find_dotenv("../.env")
    load_dotenv(env_file)

    # Pick up every environment variable named like <CONTEXT>_SF_<KEY>
    for key, value in os.environ.items():
        if f'{context}_SF_' in key:
            credential_key = key.split(f'{context}_SF_')[-1].upper()
            credentials[credential_key] = value

    return credentials

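# Note: get_credentials() reads ../.env entries named <CONTEXT>_SF_<KEY>.
# A hypothetical example for the 'qa2' context (values are placeholders):
#
#   QA2_SF_USERNAME=user@example.com.qa2
#   QA2_SF_PASSWORD=changeme
#   QA2_SF_SECURITY_TOKEN=changeme
#   QA2_SF_DOMAIN=test
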
def extract_data(object_id, output_path='output', context='qa2'):
    """
    Extract data using the Bulk API 2.0 and save it as CSV

    Args:
        object_id (str): Salesforce object API name
        output_path (str): Directory to save the output to (default 'output')
        context (str): Context name for credentials (e.g., 'qa2', 'prod')
    """
    try:
        # Get credentials based on context
        credentials = get_credentials(context)
        if not all(credentials.values()):
            raise ValueError(f"Missing credentials for context: {context}")

        # Initialize the Salesforce connection
        sf = Salesforce(
            username=credentials['USERNAME'],
            password=credentials['PASSWORD'],
            security_token=credentials['SECURITY_TOKEN'],
            domain=credentials['DOMAIN']
        )

        # Create a simple query for the desired object
        # NOTE: the query and the bulk2 handle are still hardcoded to
        # SCInstalledBase__c; object_id is not yet wired into the SOQL
        soql_query = """
            SELECT Id, Name
            FROM SCInstalledBase__c
            WHERE Country__c = 'NL' LIMIT 1000
        """

        # Download the query result as CSV into the output directory
        os.makedirs(output_path, exist_ok=True)
        sf.bulk2.SCInstalledBase__c.download(
            soql_query, path=output_path, max_records=200000
        )
        return output_path
"""
|
||||
# Execute the Bulk query job
|
||||
job = sf.bulk2.__getattr__("SCInstalledBase__c").query(soql_query)
|
||||
|
||||
# Polling for job completion (might take a moment)
|
||||
job_id = job['id']
|
||||
while True:
|
||||
status = sf.bulk.job(job_id).get()['status']
|
||||
if status == 'Complete' or status == 'Closed' :
|
||||
break
|
||||
if status == 'Aborted':
|
||||
exit(1)
|
||||
if status == 'Failed':
|
||||
raise ValueError(f'Job failed: {job_id}')
|
||||
|
||||
|
||||
# Get the results
|
||||
result = sf.bulk.result(job_id)
|
||||
df = pd.DataFrame(result.records)
|
||||
|
||||
# Create output directory if it doesn't exist
|
||||
os.makedirs(output_path, exist_ok=True)
|
||||
|
||||
# Save to CSV file
|
||||
csv_file = os.path.join(output_path, f'{object_id}_data.csv')
|
||||
df.to_csv(csv_file, index=False)
|
||||
|
||||
print(f'Successfully extracted {len(df)} records from {object_id}')
|
||||
return csv_file
|
||||
"""
|
||||
except Exception as e:
|
||||
raise ValueError(f'Error extracting data: {str(e)}')
|
||||
|
||||
if __name__ == '__main__':
    import argparse

    # Parse command-line arguments
    parser = argparse.ArgumentParser(description='Extract Salesforce data via the Bulk API')
    parser.add_argument('--context', type=str, required=True,
                        help='Context name (e.g., "qa2", "prod")')
    parser.add_argument('--object_id', type=str, required=True,
                        help='Object API name: Account, SCInstalledBaseLocation__c, SCInstalledBase__c or Product2')
    parser.add_argument('--output_path', type=str, default='extracted_data',
                        help='Directory to write the extracted CSV to (default: extracted_data)')

    args = parser.parse_args()

    # Extract data using the parsed parameters
    output_file = extract_data(
        object_id=args.object_id,
        output_path=args.output_path,
        context=args.context
    )

    print(f'Data saved under: {output_file}')
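One caveat on the new script: bulk2's download() writes the query result as one or more CSV files under path, with names generated by the client rather than chosen by the caller, so downstream steps that expect a fixed file name may need a small post-step. A hypothetical sketch (the combined file name is an assumption, not part of this commit):

    import glob
    import os

    def combine_parts(output_path, object_id):
        # Collect whatever CSV parts bulk2 wrote and merge them,
        # keeping only the first file's header row
        parts = sorted(glob.glob(os.path.join(output_path, '*.csv')))
        combined = os.path.join(output_path, f'{object_id}_data.csv')
        with open(combined, 'w') as out:
            for i, part in enumerate(parts):
                with open(part) as f:
                    lines = f.readlines()
                out.writelines(lines if i == 0 else lines[1:])
        return combined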
@@ -5,8 +5,10 @@ country_mapping = {
 }
 
 # Read the input CSV files; the first row is the header
-read_df = pd.read_csv('../1_extract_data/SCInstalledBaseLocation__c.csv', header=0, keep_default_na=False, dtype=str)
-read_df_ib = pd.read_csv('../1_extract_data/SCInstalledBase__c.csv', header=0, keep_default_na=False, dtype=str)
+read_df = pd.read_csv('../1_extract_data/target/SCInstalledBaseLocation__c_upsert_target.csv', header=0, keep_default_na=False, dtype=str)
+read_df_ib = pd.read_csv('../1_extract_data/target/object-set-2/SCInstalledBase__c_upsert_target.csv', header=0, keep_default_na=False, dtype=str)
+read_df_product2 = pd.read_csv('../1_extract_data/target/object-set-6/Product2_upsert_target.csv', header=0, keep_default_na=False, dtype=str)
 
 for row in read_df.to_dict('records'):
     try:
         # Your processing logic here
@@ -18,10 +20,13 @@ for row in read_df.to_dict('records'):
 reindex_columns = ['City__c','Country__c','Extension__c','FlatNo__c','Floor__c','GeoX__c','GeoY__c','HouseNo__c','Id','PostalCode__c','Street__c']
 # ArticleNo__c,CommissioningDate__c,Id,InstallationDate__c,InstalledBaseLocation__c,InstalledBaseLocation__r.Id,Name,ProductEnergy__c,ProductUnitClass__c,ProductUnitType__c,SerialNo__c,SerialNoException__c
 reindex_columns_ib = ['ArticleNo__c','CommissioningDate__c','Id','InstallationDate__c','InstalledBaseLocation__c','InstalledBaseLocation__r.Id','Name','ProductEnergy__c','ProductUnitClass__c','ProductUnitType__c','SerialNo__c','SerialNoException__c']
+# EAN_Product_Code__c,Family,Id,Main_Product_Group__c,MaterialType__c,Name,Product_Code__c,ProductCode
+reindex_columns_product2 = ['EAN_Product_Code__c','Family','Id','Main_Product_Group__c','MaterialType__c','Name','Product_Code__c','ProductCode']
 
 # Reindex the columns to match the desired format
 df = read_df.reindex(reindex_columns, axis=1)
 df_ib = read_df_ib.reindex(reindex_columns_ib, axis=1)
+df_product2 = read_df_product2.reindex(reindex_columns_product2, axis=1)
 
 df['Street'] = (
     df['Street__c'].astype(str) + ' ' +
@@ -151,7 +156,20 @@ merged_df_ib = merged_df_ib.drop('InstalledBaseLocation__c', axis=1)
 merged_df_ib = merged_df_ib.drop('InstalledBaseLocation__r.Id', axis=1)
 merged_df_ib = merged_df_ib.drop('Id_y', axis=1)
 print(merged_df_ib.columns)
-merged_df_ib.columns = ['Product2.EAN_Product_Code__c', 'FSL_1st_Ignition_Date__c', 'Id', 'InstallDate', 'Name', 'Kind_of_Energy__c', 'Kind_of_Installation__c', 'Main_Product_Group__c', 'SerialNumber', 'Serialnumber_Exception__c', 'Location.ExternalReference']
+merged_df_ib.columns = ['Product2.Product_Code__c', 'FSL_1st_Ignition_Date__c', 'Id', 'InstallDate', 'Name', 'Kind_of_Energy__c', 'Kind_of_Installation__c', 'Main_Product_Group__c', 'SerialNumber', 'Serialnumber_Exception__c', 'Location.ExternalReference']
+merged_df_ib = merged_df_ib.drop('Main_Product_Group__c', axis=1)
+
+# Assign Main_Product_Group__c based on the Product2 records
+merged_df_ib = pd.merge(merged_df_ib,
+                        df_product2[['Product_Code__c', 'Main_Product_Group__c']],
+                        left_on='Product2.Product_Code__c',
+                        right_on='Product_Code__c',
+                        how='left')
+
+merged_df_ib = merged_df_ib.drop('Product_Code__c', axis=1)
+
+merged_df_ib = merged_df_ib.drop_duplicates(subset=['Id'], keep='first')
 
 # Write each DataFrame to a separate CSV file
 address_df.to_csv('../3_upsert_address_and_parent_location/Address.csv', index=False)
@@ -159,4 +177,6 @@ parent_df.to_csv('../3_upsert_address_and_parent_location/Location.csv', index=F
 child_df.to_csv('../5_upsert_child_location/Location.csv', index=False)
 merged_df_ib.to_csv('../7_upsert_assets/Asset.csv', index=False)
+
+## end mapping
 
 print('Data has been successfully split into Address.csv, Location.csv, and Asset.csv files with duplicate checks applied.')
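The drop_duplicates(subset=['Id'], keep='first') added after the Product2 merge matters because a left merge fans out rows whenever the right-hand key is duplicated; a minimal sketch with made-up data:

    import pandas as pd

    # One asset, two Product2 rows sharing the same product code
    left = pd.DataFrame({'Id': ['a1'], 'Product2.Product_Code__c': ['P100']})
    right = pd.DataFrame({'Product_Code__c': ['P100', 'P100'],
                          'Main_Product_Group__c': ['Boiler', 'Boiler']})

    merged = left.merge(right, left_on='Product2.Product_Code__c',
                        right_on='Product_Code__c', how='left')
    print(len(merged))  # 2: one asset row per matching Product2 row
    print(len(merged.drop_duplicates(subset=['Id'], keep='first')))  # 1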
@@ -1,2 +1,3 @@
 ObjectName,FieldName,RawValue,Value
-Asset,Kind_of_Energy__c,2,
+Asset,Kind_of_Energy__c,4,3
+Asset,Kind_of_Energy__c,5,3
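For context, this table appears to follow SFDMU's ValueMapping.csv shape: occurrences of RawValue in the named field are replaced by Value during the load. A rough pandas illustration with made-up data:

    import pandas as pd

    # Apply the RawValue -> Value substitutions from the table above
    mapping = {'4': '3', '5': '3'}
    assets = pd.DataFrame({'Kind_of_Energy__c': ['4', '5', '1']})
    assets['Kind_of_Energy__c'] = assets['Kind_of_Energy__c'].replace(mapping)
    print(assets['Kind_of_Energy__c'].tolist())  # ['3', '3', '1']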
@@ -7,9 +7,9 @@
     "operation": "Readonly",
     "externalId": "ExternalReference"
 },{
-    "query": "SELECT EAN_Product_Code__c FROM Product2 WHERE EAN_Product_Code__c != null",
+    "query": "SELECT Product_Code__c FROM Product2 WHERE Product_Code__c != null",
     "operation": "Readonly",
-    "externalId": "EAN_Product_Code__c"
+    "externalId": "Product_Code__c"
 },{
     "query": "SELECT Product2Id,Id,InstallDate,Name,Kind_of_Energy__c,Kind_of_Installation__c,Main_Product_Group__c,SerialNumber,Serialnumber_Exception__c,LocationId FROM Asset",
     "operation": "Insert"