CAMS aerosol reanalysis and hindcast discrepancy

Hello,

I have built a dust and organic matter aerosol seasonal climatology (mean+std) database for my research using the CAMS EAC4 daily reanalysis. I want to use aerosol indices as input in a larger-scale forecasting/early warning algorithm that has learnt certain relationships between aerosol indices and precipitation. However, when testing my input data preparation for the algorithm, I found out that there is a large discrepancy between the reanalysis index and the hindcast index. For example in 2024 FMA the values just don’t align.

Can someone help me find out what goes wrong? Is it just that the two datasets have such large differences that I cannot compare them?

For reference, with the hindcast I use daily forecasts with the forecast hour +0 (closest, so most certain) and then averaged that over the season.

Here is my code for the reanalysis index calculation:

data_ae = '.../AE_daily_2003-2024.nc'
ds_ae = xr.open_dataset(data_ae, chunks=None)
vars_ae = ['duaod550', 'omaod550']
vars_ae_names = ['dust', 'organic matter']
def clean_dirty_years(ds, var, domain, months, get_diags=False, get_vals=False):
    minlat, maxlat, minlon, maxlon = domain
    ae_domain = ds[var].sel(
        latitude=slice(maxlat, minlat),
        longitude=slice(minlon, maxlon)
        )

    weights = np.cos(np.deg2rad(ae_domain["latitude"]))
    weights = weights / weights.mean()
    ae_mean = (ae_domain * weights).mean(dim=("latitude", "longitude"))

    ae_seasonal_mean = ae_mean.sel(valid_time=ae_mean.valid_time.dt.month.isin(months)
        ).groupby('valid_time.year').mean('valid_time')

    ae_idx = (ae_seasonal_mean - ae_seasonal_mean.mean("year")) / ae_seasonal_mean.std("year")
    ae_idx = ae_idx.rename("ae_idx")

    q75 = ae_idx.quantile(0.75, dim='year').values.item()
    q25 = ae_idx.quantile(0.25, dim='year').values.item()
    
    high_yrs = ae_idx['year'].values[ae_idx.values > q75]
    low_yrs  = ae_idx['year'].values[ae_idx.values < q25]
    
    if get_diags:
        seasonal_mean = ae_seasonal_mean.mean('year').values
        seasonal_std = ae_seasonal_mean.std('year').values
        return seasonal_mean, seasonal_std, q25, q75
    elif get_vals:
        neutral_yrs = np.setdiff1d(np.unique(ae_idx.year.values), np.concatenate([high_yrs, low_yrs]))
        return ae_idx.values, high_yrs, low_yrs, neutral_yrs
    else:
        return high_yrs, low_yrs

seasons = {
    'DJF': [12, 1, 2],  'JFM': [1, 2, 3],
    'FMA': [2, 3, 4],   'MAM': [3, 4, 5],
    'AMJ': [4, 5, 6],   'MJJ': [5, 6, 7],
    'JJ': [6, 7],       'JJA': [6, 7, 8],
    'JAS': [7, 8, 9],   'ASO': [8, 9, 10],
    }

def get_dust_domain(season):
    if season in ['DJF', 'JFM']:
        return [5, 20, -20, 20]
    elif season in ['FMA', 'MAM', 'AMJ', 'JAS', 'ASO']:
        return [10, 20, -20, 20]
    elif season in ['MJJ', 'JJ', 'JJA']:
        return [15, 20, -20, 20]

def get_om_n_domain(season):
    if season in ['DJF', 'JFM', 'FMA', 'MAM', 'AMJ', 'JAS', 'ASO']:
        return [10, 20, -20, 15]
    elif season in ['MJJ', 'JJ', 'JJA']:
        return [12, 20, -20, 15] 
    
def get_om_s_domain(season):
    if season in ['DJF', 'JFM', 'FMA', 'MAM', 'AMJ', 'JAS', 'ASO']:
        return [4, 10, -20, 15]
    elif season in ['MJJ', 'JJ', 'JJA']:
        return [4, 12, -20, 15] 

seasonal_diags = {}
raw_data = {}
category_data = {}

for season, season_months in seasons.items():
    DU_domain = get_dust_domain(season)
    OMN_domain = get_om_n_domain(season)
    OMS_domain = get_om_s_domain(season)
    
    DU_diags = clean_dirty_years(ds_ae, 'duaod550', DU_domain, season_months, get_diags=True, get_vals=False)
    OMN_diags = clean_dirty_years(ds_ae, 'omaod550', OMN_domain, season_months, get_diags=True, get_vals=False)
    OMS_diags = clean_dirty_years(ds_ae, 'omaod550', OMS_domain, season_months, get_diags=True, get_vals=False)

And the code for the hindcast:

def get_CAMS_forecast(date1, date2, directory):
    for folder in ['zipfiles', 'data_CAMS-forecast']:
        os.makedirs(f'{directory}/{folder}', exist_ok=True)

    dataset = "cams-global-atmospheric-composition-forecasts"
    request = {
        "date": [f"{date1}/{date2}"],
        "time": ["00:00"],
        "leadtime_hour": ["0"],
        "type": ["forecast"],
        "data_format": "netcdf_zip",
        "variable": [
            "dust_aerosol_optical_depth_550nm",
            "organic_matter_aerosol_optical_depth_550nm"
        ],
        "area": [30, -20, 0, 30]
    }

    client = cdsapi.Client()
    target = f'{directory}/zipfiles/CAMS_aerosol-forecast_{date1}_{date2}.zip'
    client.retrieve(dataset, request, target)
    

def extract_CAMS(year, month_start, month_end, season, directory='CAMS_forecasts'):
    if month_end == 2:
        daysinmonth = 28
    elif month_end in [4, 6, 9, 11]:
        daysinmonth = 30
    else:
        daysinmonth = 31
        
    date1 = f'{year}-{month_start}-01'
    date2 = f'{year}-{month_end}-{daysinmonth}'
    
    forecast = get_CAMS_forecast(date1, date2, directory)
    
    with zipfile.ZipFile(f"{directory}/zipfiles/CAMS_aerosol-forecast_{date1}_{date2}.zip", "r") as zip_ref:
        zip_ref.extractall(f'{directory}/data_CAMS-forecast/forecast_{date1}_{date2}')
            
    def calc_ae_idx(directory, date1, date2, var, season, northsouth=None):
        ds = xr.open_dataset(f'{directory}/data_CAMS-forecast/forecast_{date1}_{date2}/data_sfc.nc')
        clim = pd.read_csv('.../idxvsRS_ae-clim.csv')

        def get_dust_domain(season):
            if season in ['DJF', 'JFM']:
                return [5, 20, -20, 20]
            elif season in ['FMA', 'MAM', 'AMJ', 'JAS', 'ASO']:
                return [10, 20, -20, 20]
            elif season in ['MJJ', 'JJ', 'JJA']:
                return [15, 20, -20, 20]

        def get_om_n_domain(season):
            if season in ['DJF', 'JFM', 'FMA', 'MAM', 'AMJ', 'JAS', 'ASO']:
                return [10, 20, -20, 15]
            elif season in ['MJJ', 'JJ', 'JJA']:
                return [12, 20, -20, 15] 
            
        def get_om_s_domain(season):
            if season in ['DJF', 'JFM', 'FMA', 'MAM', 'AMJ', 'JAS', 'ASO']:
                return [4, 10, -20, 15]
            elif season in ['MJJ', 'JJ', 'JJA']:
                return [4, 12, -20, 15]
            
        ae_sel = ds[var].isel(forecast_period=0)
        ae_monthly = ae_sel.resample(forecast_reference_time='ME').mean('forecast_reference_time')
        ae_seasonal = ae_monthly.mean('forecast_reference_time')

        if var == 'duaod550':
            latmin, latmax, lonmin, lonmax = get_dust_domain(season)
            clim_mean, clim_std = clim[f'DUST_{season}'][0], clim[f'DUST_{season}'][1]
        elif var == 'omaod550' and northsouth == 'N':
            latmin, latmax, lonmin, lonmax = get_om_n_domain(season)
            clim_mean, clim_std = clim[f'OM_N_{season}'][0], clim[f'OM_N_{season}'][1]
        elif var == 'omaod550' and northsouth == 'S':
            latmin, latmax, lonmin, lonmax = get_om_s_domain(season)
            clim_mean, clim_std = clim[f'OM_S_{season}'][0], clim[f'OM_S_{season}'][1]
        
        ae_seas_domain = ae_seasonal.sel(latitude=slice(latmax, latmin), longitude=slice(lonmin, lonmax))
        weights = np.cos(np.deg2rad(ae_seas_domain["latitude"]))
        weights = weights / weights.mean()
        ae_seas_domain_mean = (ae_seas_domain * weights).mean(dim=("latitude", "longitude"))
        ae_fc_idx = (ae_seas_domain_mean -  clim_mean) / clim_std

        return ae_fc_idx.values
    
    du_idx = calc_ae_idx(directory, date1, date2, 'duaod550', season, northsouth=None)
    om_n_idx = calc_ae_idx(directory, date1, date2, 'omaod550', season, northsouth='N')
    om_s_idx = calc_ae_idx(directory, date1, date2, 'omaod550', season, northsouth='S')
            
    return du_idx, om_n_idx, om_s_idx

du_idx, om_n_idx, om_s_idx = extract_CAMS(2024, 2, 4, 'FMA')

The FMA indices from the reanalysis are [dust, om_n, om_s]: -0.28, -0.6, -0.54, but from the hindcast they are: 0.42, -3.73, -6.23.

I have no clue what’s going on. Please help me out!