Seg fault WRF-Chem with cu_physics = 3 (Grell-Freitas ensemble)

allen6510w

I am getting a segmentation fault almost immediately when I run v4.3 of WRF-Chem with KPP using cu_physics = 3 and chem_opt = 112. With debug_level = 3000, the last output I see is:

d01 2012-05-19_18:00:00 call cumulus_driver
d01 2012-05-19_18:00:00 calling inc/HALO_CUP_G3_IN_inline.inc
d01 2012-05-19_18:00:00 in grelldrv
[borgu024:17293:0:17293] Caught signal 11 (Segmentation fault: address not mapped to object at address 0x8)

The code runs with cu_physics = 5 or 0, and it also runs with cu_physics = 3 when chem_opt = 0.

Can you see any issues with my namelist or job script that might be responsible for the error? Do you have any suggestions for debugging?

Thanks, Dale


&time_control
run_days = 0,
run_hours = 6,
run_minutes = 0,
run_seconds = 0,
start_year = 2012, 2012,
start_month = 05, 05,
start_day = 19, 19,
start_hour = 18, 18,
start_minute = 00, 00,
start_second = 00, 00,
end_year = 2012, 2012,
end_month = 05, 05,
end_day = 20, 20,
end_hour = 00, 00,
end_minute = 00, 00,
end_second = 00, 00,
interval_seconds = 21600
input_from_file = .true.,.true.,
history_interval = 10, 10,
frames_per_outfile = 1000, 1000,
restart = .false.,
restart_interval = 720,
io_form_history = 2,
io_form_restart = 2,
io_form_input = 2,
io_form_boundary = 2,
debug_level = 3000,
io_form_auxinput5 = 2,
frames_per_auxinput5 = 1, 1,
auxinput5_inname = 'wrfchemi_d<domain>_<date>',
auxinput5_interval_m = 60, 60,
io_form_auxinput6 = 2,
frames_per_auxinput6 = 1, 1,
auxinput6_inname = 'wrfbiochemi_d<domain>',
auxinput6_interval_m = 2160, 720, 43200,
io_form_auxinput7 = 2,
frames_per_auxinput7 = 1, 1,
auxinput7_inname = 'wrffirechemi_d<domain>_<date>',
auxinput7_interval_m = 60, 60,
io_form_auxinput14 = 2,
frames_per_auxinput14 = 1, 1,
auxinput14_inname = 'wrfaircraftchemi_d<domain>_<date>',
auxinput14_interval_m = 60, 60,
ignore_iofields_warning = .true.,
force_use_old_data = .true.,
!io_form_auxinput? = 2,
!frames_per_auxinput? = 1, 1, 1,
!auxinput?_inname = 'wrflda_d<domain>_<date>',
!auxinput?_interval_m = 10, 10, 60,
!auxinput?_begin_h = 4,4,
!auxinput?_end_h = 8,8,
nocolons = .true.,
/

&domains
time_step = 30,
time_step_fract_num = 0,
time_step_fract_den = 1,
max_dom = 1,
e_we = 200, 331,
e_sn = 160, 256,
e_vert = 90, 90,
p_top_requested = 5000,
num_metgrid_levels = 40,
num_metgrid_soil_levels = 4,
dx = 6000, 2000,
dy = 6000, 2000,
grid_id = 1, 2,
parent_id = 0, 1,
i_parent_start = 1, 43,
j_parent_start = 1, 44,
parent_grid_ratio = 1, 3,
parent_time_step_ratio = 1, 3,
feedback = 1,
smooth_option = 0,
track_loc_in = 4,
eta_levels = 1.000000, 0.995800, 0.991700, 0.983400, 0.975000, 0.956500, 0.937100, 0.917600, 0.897900, 0.878000, 0.857700, 0.837500, 0.816800,
0.796200, 0.775400, 0.754500, 0.733500, 0.712500, 0.691500, 0.670600, 0.649800, 0.629300, 0.608900, 0.588800, 0.569100, 0.549800, 0.530900,
0.512700, 0.494900, 0.477500, 0.460600, 0.444000, 0.427900, 0.412100, 0.396700, 0.381700, 0.367100, 0.353000, 0.339300, 0.325800, 0.312800,
0.300100, 0.287800, 0.275800, 0.264200, 0.252900, 0.242000, 0.231400, 0.221100, 0.211100, 0.201400, 0.192100, 0.183000, 0.174300, 0.165800,
0.157600, 0.149700, 0.142100, 0.134700, 0.127500, 0.120700, 0.113900, 0.107500, 0.101200, 0.095200, 0.089400, 0.083900, 0.078500, 0.073400,
0.068400, 0.063500, 0.058900, 0.054300, 0.049800, 0.045600, 0.041500, 0.037600, 0.033900, 0.030400, 0.026900, 0.023700, 0.020500, 0.017500,
0.014600, 0.011900, 0.009300, 0.006800, 0.004400, 0.002200, 0.000000
/

&physics
mp_physics = 17, 17,
ra_lw_physics = 4, 4,
ra_sw_physics = 4, 4,
radt = 10, 10,
do_radar_ref = 1,
sf_sfclay_physics = 1, 1,
sf_surface_physics = 2, 2,
num_soil_layers = 4,
bl_pbl_physics = 1, 1,
bldt = 0, 0,
cudt = 0, 0,
cu_rad_feedback = .true.,.true.,
cu_diag = 1, 1,
cu_physics = 3, 0,
cugd_avedx = 1,
isfflx = 1,
ifsnow = 0,
icloud = 1,
ishallow = 1,
surface_input_source = 1,
num_land_cat = 24,
sf_urban_physics = 0, 0,
sf_ocean_physics = 0,
topo_wind = 0, 0,
mp_zero_out = 2,
hail_opt = 1,
lightning_option = 1, 1,
lightning_dt = 30, 10,
lightning_start_seconds = 600,600,
flashrate_factor = 17,2.496,
cldtop_adjustment = 0, 2,
iccg_method = 2, 2,
!ltng_temp_upper = -45., -45.,
!ltng_temp_lower = -40., -40.,
!lda_opt = 1,
!lda_start_h = 4,
!lda_start_min = 0,
!lda_end_h = 8,
!lda_end_min = 0,
!ldaa = 0.30,
!ldab = 0.2,
!ldac = 0.02,
!ldad = 0.25,
!ldarhmax = 1.00,
!ldatmin = 263.15,
!ldatmax = 285.15,
!ldarhtd = 0.95,
!ldarhtd_damp = 0.75,
!lda_flash_min = 800,
/

&fdda
/

&dynamics
tracer_opt = 0, 0, 2, 2,
w_damping = 1,
diff_opt = 1, 1, 1,
km_opt = 4, 4, 4,
diff_6th_opt = 0, 0, 0,
diff_6th_factor = 0.12, 0.12, 0.12,
base_temp = 290.
damp_opt = 1,
zdamp = 5000., 5000., 5000.,
dampcoef = 0.2, 0.2, 0.2
khdif = 0, 0, 0,
kvdif = 0, 0, 0,
non_hydrostatic = .true., .true., .true.,
moist_adv_opt = 2, 2, 2, 2,
scalar_adv_opt = 2, 2, 2, 2,
tracer_adv_opt = 2, 2, 2, 2,
chem_adv_opt = 2, 2, 2, 2,
tke_adv_opt = 2, 2, 2, 2,
moist_adv_opt = 2, 2, 1,
scalar_adv_opt = 2, 2, 1,
gwd_opt = 1,
use_baseparam_fr_nml = .true.,
emdiv = 0.01
smdiv = 0.1
epssm = 0.1
iso_temp = 0,
/

&bdy_control
spec_bdy_width = 5,
spec_zone = 1,
relax_zone = 4,
specified = .true., .false.,.false.,
nested = .false., .true., .true.,
/

&grib2
/

&chem
kemit = 11,
kemit_aircraft = 90,
chem_opt = 112, 112,
!chem_opt = 0, 0,
track_chem_num = 4,
track_chem_name = 'o3','co','no','no2',
bioemdt = 60, 60,
photdt = 60, 60,
chemdt = 2., 2.,
io_style_emissions = 2,
emiss_inpt_opt = 111, 111,
emiss_opt = 8, 8,
emiss_opt_vol = 0,
aircraft_emiss_opt = 1, 1,
chem_in_opt = 1, 1,
phot_opt = 3, 3,
gas_drydep_opt = 1, 1,
aer_drydep_opt = 1, 1,
bio_emiss_opt = 3, 3,
ne_area = 118,
gas_bc_opt = 112, 112,
gas_ic_opt = 112, 112,
aer_bc_opt = 112, 112,
aer_ic_opt = 112, 112,
gaschem_onoff = 1, 1,
aerchem_onoff = 0, 0,
wetscav_onoff = 1, 1,
!wetscav_onoff = 0, 0,
cldchem_onoff = 0, 0,
vertmix_onoff = 1, 1,
!chem_conv_tr = 0, 0,
chem_conv_tr = 1, 1,
conv_tr_wetscav = 1, 1,
conv_tr_aqchem = 1, 1,
seas_opt = 1,
dust_opt = 1,
dmsemis_opt = 1,
biomass_burn_opt = 1, 1,
plumerisefire_frq = 60, 30,
have_bcs_chem = .true., .true.,
aer_ra_feedback = 0, 0,
opt_pars_out = 0,
chemdiag = 1, 1,
! lnox_opt = 0, 0,
! N_IC = 82, 300,82,
! N_CG = 82, 300,82,
! lnox_passive = .false.,.false.,
! ltg_temp_upper = -45., -45.,
! ltg_temp_lower = -40., -40.,
/


&namelist_quilt
nio_tasks_per_group = 0,
nio_groups = 1,
/


Job Script:
#!/usr/bin/csh
#SBATCH -J e0519_a0
#SBATCH --nodes=4 --ntasks=24 --cpus-per-task=2 --ntasks-per-node=6
#SBATCH --constraint=hasw
#SBATCH --time=1:00:00
#SBATCH -o output.%j
#SBATCH --account=????
source /usr/share/modules/init/csh
module purge
module load comp/gcc/9.2.0 mpi/impi/2021.4.0

setenv HOME /discover/nobackup/USRID/wrfchem

setenv DIR $HOME/WRFCHEM/Libs
setenv LDFLAGS -L$DIR/grib2/lib
setenv CPPFLAGS -I$DIR/grib2/include
setenv LD_LIBRARY_PATH $DIR/grib2/lib:$LD_LIBRARY_PATH
setenv JASPERLIB $DIR/grib2/lib
setenv JASPERINC $DIR/grib2/include
setenv HDF5 $DIR/grib2
setenv CPPFLAGS -I$DIR/grib2/include
setenv LDFLAGS -L$DIR/grib2/lib
setenv PATH $DIR/NETCDF/bin:$PATH
setenv CPPFLAGS -I$DIR/NETCDF/include
setenv LDFLAGS -L$DIR/NETCDF/lib
setenv LD_LIBRARY_PATH $DIR/NETCDF/lib:$LD_LIBRARY_PATH
setenv JASPER_INC $DIR/grib2/include
setenv PNG_INC $DIR/grib2/include
setenv NCEPLIBS_DIR $DIR/nceplibs
setenv NETCDF $DIR/NETCDF

setenv PATH $HOME/WRFCHEM/GrADS/Contents:$PATH
setenv WRF_EM_CORE 1
setenv WRF_NMM_CORE 0
setenv WRF_CHEM 1
setenv WRF_KPP 1
setenv YACC '/usr/bin/yacc -d'
setenv KPP_HOME $HOME/WRFCHEM/WRF-4.3/chem/KPP/kpp/kpp-2.1
setenv WRF_SRC_ROOT_DIR $HOME/WRFCHEM/WRF-4.3
setenv PATH $KPP_HOME/bin:$PATH
setenv SED /usr/bin/sed
setenv WRFIO_NCD_LARGE_FILE_SUPPORT 1
setenv FLEX /usr/local/other/Flex/2.5.35/bin/flex
setenv FLEX_LIB_DIR /usr/local/other/Flex/2.5.35/lib

#setenv OMP_NUM_THREADS 2
#setenv OMP_STACKSIZE 1G
#setenv KMP_AFFINITY compact
#setenv I_MPI_PIN_DOMAIN auto
mpirun -perhost 6 -np 24 ./wrf.exe
echo "Hello World"
 
Hi Dale,

This could be an issue with a recently added feature of the GF scheme; see: https://github.com/wrf-model/WRF/pull/1354

Another PR has addressed a bug, but it has not yet been merged into the model source code: https://github.com/wrf-model/WRF/pull/1680

Could you please try a few things for me?

1. Update the source code following PR 1680 listed above, recompile the model, and run it again.
2. If that doesn't work, turn off the following options in the namelist one at a time (see the snippet after this list):
- conv_tr_aqchem
- conv_tr_wetscav
- chem_conv_tr
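
For reference, a minimal sketch of what one such test could look like in the &chem section of namelist.input, reusing the values from the namelist above and zeroing a single option per run (0 disables the option for each domain):

 ! first test: disable aqueous chemistry in convective transport only
 chem_conv_tr = 1, 1,
 conv_tr_wetscav = 1, 1,
 conv_tr_aqchem = 0, 0,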

Please let me know what you discover.

Jordan
 
Hi Jordan,

Thanks for the tips.

I changed
num_tracer to num_chem in line 780 of module_cu_gf_wrfdrv.F,
num_chem to num_tracer in line 791 of module_cu_gf_wrfdrv.F,
and recompiled.

Upon execution, I got a segmentation fault again.

I then set conv_tr_aqchem = 0,0, in the namelist, ran the model and got the same error message.

Keeping conv_tr_aqchem = 0,0, and changing conv_tr_wetscav = 0,0, I re-ran the model and it is executing successfully.

Next steps?

Thanks, Dale
 
Hi Dale,

Thanks for checking these options.

Please try this:

Add some print statements in the "wetscav" subroutine in phys/module_cu_gf_ctrans.F. Specifically, write out
HLndx = HLC_ndx(nv) (L372) as well as the 6 different constants HLCNst1..6 and the value of "heff". (I'm wondering whether the HLC table is not being read in correctly, causing a memory-mapping error. Just to make sure: HLC.TBL is in your run directory, correct?)
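
A minimal sketch of the kind of prints meant here, following the wrf_err_message/wrf_debug pattern already used in that subroutine (the message text is arbitrary, and the allocated() check is an extra diagnostic I am assuming would be useful; the HLCNst constants and heff can be written out the same way). Note that a bare write into wrf_err_message prints nothing on its own; it has to be followed by a call to wrf_debug or wrf_message:

 ! sketch only: fill WRF's message buffer, then emit it explicitly
 write(wrf_err_message,*) 'wetscav debug: nv = ', nv, &
      ', HLC_ndx allocated = ', allocated(HLC_ndx)
 call wrf_debug(100, trim(wrf_err_message))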

You can also try deleting module_cu_gf_ctrans.mod and module_cu_gf_ctrans.o, setting some DEBUG flags (e.g., -g $(FCNOOPT) -traceback) in your configure.wrf, and recompiling to see if it will tell you the specific line it is failing on.
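
In the stock configure.wrf the debug flags usually sit on a commented-out FCDEBUG line; a rough sketch of enabling them is below (the exact template line varies by compiler and WRF version, so treat it as an assumption; with gfortran, -fbacktrace is the analogue of Intel's -traceback):

 # before (typical template, flags commented out)
 # FCDEBUG = # -g $(FCNOOPT) # -traceback
 # after: flags actually passed to the Fortran compiler
 FCDEBUG = -g $(FCNOOPT) -traceback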

Jordan
 
Hi Jordan,

It is aborting at the HLndx = HLC_ndx(nv) line. See below.

I'm not sure I know how to write out the value of variables to the error/output stream. I tried using write(wrf_err_message) but it did not produce any output - at least in the error file.



Dale

Snippet of output follows:

d01 2012-05-19_18:00:00 Enter wetscav2
d01 2012-05-19_18:00:00 Enter wetscav3
d01 2012-05-19_18:00:00 Enter chem_moz loop
[borgv139:18690:0:18690] Caught signal 11 (Segmentation fault: address not mapped to object at address 0x8)

Snippet of code follows:

call wrf_debug(100,'Enter wetscav2')
c0 = 0.004
if (t1d < 273.15) c0 = c0*exp(0.07*(t1d-273.15))
call wrf_debug(100,'Enter wetscav3')
!--
!chem moz
if( chemopt == MOZCART_KPP .or. chemopt == T1_MOZCART_KPP .or. &
    chemopt == MOZART_MOSAIC_4BIN_KPP .or. &
    chemopt == MOZART_MOSAIC_4BIN_AQ_KPP ) then
  call wrf_debug(100,'Enter chem_moz loop')
  write(wrf_err_message,*) 'nv',nv
  HLndx = HLC_ndx(nv)
  call wrf_debug(100,'HLNdx')
 
I figured out the output issue.

FYI, nv = 2 just before the segmentation fault, and I do have HLC.TBL in my run directory.
 
Hi,

Sorry for the multiple requests; could you please tar/zip up your input files (wrfinput, wrfbdy, namelist.input) so I can also test your setup?

Thanks.

Jordan
 
Hi Jordan,

#SBATCH --nodes=8 --ntasks=24 --cpus-per-task=2 --ntasks-per-node=6
#SBATCH --constraint=cas

I tried upping the number of nodes to 8 (from 4) and changing the constraint from hasw to cas. All 4 permutations resulted in the same segmentation fault. Is this what you wanted me to try or should I change ntasks, cpus-per-task, and/or ntasks-per-node instead?

The Haswell (hasw) nodes have 4.5 GB per CPU and 126 GB per node.
The Cascade Lake (cas) nodes have 4.0 GB per CPU and 190 GB per node.

Dale
 
Hi Jordan,

I just sent you an e-mail with a link to a gzipped tar file containing the input data sets.
Hopefully the e-mail arrives, the link works, and it contains the data sets you need. Please follow up if you need more data sets and/or information.

Thanks, Dale
 
Hi Dale,

I think I have figured out the issue and have a fix for you. There may be a slightly cleaner fix for the official version of the model, but it should be all the same to you, with negligible performance differences. Essentially, what was happening is that the physics side of the code was not seeing the allocated HLC_ndx array, even though it IS allocated in /chem. My current solution is to duplicate the initialization subroutine on the physics side, so that a version of the array lives in both chem and phys. It adds another subroutine to the model, but until I understand better how to avoid breaking the other routines that use the HLC code, this should work.

Here are the code changes you need to make (a small standalone sketch of the pattern follows the list):

1. In phys/module_cu_gf_ctrans.F:
a. Move the "INTEGER, allocatable :: HLC_ndx(:)" statement to the top of the module, directly below the "g" parameter declaration.
b. Add num_chem to the argument list in the call to "wetscav"
c. Directly above where the model was failing before, add this IF statement:

if ( .not. allocated(HLC_ndx) ) then
  call conv_tr_wetscav_init_phys( numgas, num_chem )
endif

d. Copy in the entire conv_tr_wetscav_init subroutine from chem/module_ctrans_grell.F and name the new subroutine conv_tr_wetscav_init_phys.

2. In main/depend.common:
a. Add the following lines above module_cumulus_driver.o:

module_cu_gf_deep.o: \
module_cu_gf_ctrans.o
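
To illustrate the shape of the fix without reproducing WRF internals, here is a small standalone Fortran sketch of the pattern described above: a module-level allocatable index array plus an init routine, guarded by an allocated() check at the point of first use. All names, argument lists, and the index mapping are placeholders, not the real WRF code:

module hlc_demo
  implicit none
  integer, allocatable :: HLC_ndx(:)          ! module-level array, as in step 1a
contains
  subroutine conv_tr_wetscav_init_phys(numgas, num_chem)
    ! stand-in for the copied init routine (step 1d); the real one builds
    ! the index table for species with Henry's law constants (HLC.TBL)
    integer, intent(in) :: numgas, num_chem
    integer :: nv
    allocate(HLC_ndx(num_chem))
    HLC_ndx = 0
    do nv = numgas+1, num_chem
      HLC_ndx(nv) = nv - numgas               ! placeholder mapping only
    end do
  end subroutine conv_tr_wetscav_init_phys

  subroutine wetscav(nv, numgas, num_chem)    ! num_chem passed in, as in step 1b
    integer, intent(in) :: nv, numgas, num_chem
    integer :: HLndx
    if (.not. allocated(HLC_ndx)) then        ! the guard added in step 1c
      call conv_tr_wetscav_init_phys(numgas, num_chem)
    end if
    HLndx = HLC_ndx(nv)                       ! the line that previously segfaulted
    print *, 'nv =', nv, ' HLndx =', HLndx
  end subroutine wetscav
end module hlc_demo

program demo
  use hlc_demo
  implicit none
  call wetscav(2, 10, 20)
end program demo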

For completeness, I would perform a ./clean -a and then recompile, though a full clean is probably not necessary.
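
For example, a standard full rebuild would look like this (bash-style redirection shown; adjust for csh):

 ./clean -a                              # also removes configure.wrf, so configure must be re-run
 ./configure                             # re-select the same compiler option as before
 ./compile em_real > compile.log 2>&1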

Let me know if this works for you, and we can iterate if not. Thanks again for bringing this to our attention.

Jordan
 
Hi jordanschnell,

I have tried to modify the modules as you suggested, but I could not get the code to compile.

Could you share the modified modules? I think I am missing something.

Thanks
 
Hi PHDWRF,

If you are still having issues compiling, please try using this command to compile:

./compile em_real > log_comp.txt 2> errors_comp.txt

And then attach both log_comp.txt and errors_comp.txt.

Thanks,

Jordan
 
Hi Jordan,

My compilation was unsuccessful, with these error messages:

Fatal Error: Cannot open module file ‘module_chem_utilities.mod’ for reading at (1): No such file or directory
Fatal Error: Cannot open module file ‘module_cu_gf_ctrans.mod’ for reading at (1): No such file or directory
Fatal Error: Cannot open module file ‘module_cu_gf_ctrans.mod’ for reading at (1): No such file or directory
Fatal Error: Cannot open module file ‘module_cu_gf_deep.mod’ for reading at (1): No such file or directory
Fatal Error: Cannot open module file ‘module_cu_gf_wrfdrv.mod’ for reading at (1): No such file or directory

I've attached a tar file containing the output from the compilation, the code I changed, and the script I used to test it.

Dale
 

Attachments: wrfchem_debug.tar (1.8 MB)

Hi Dale,

I'm not sure why module_chem_utilities.F was in the phys/ directory. The 4.3.3 release branch has it in the chem/ directory. Is this something you changed?

Jordan
 
Hi Jordan and Dale,
Did Dale's issue get resolved? And if so, is there a bug fix available somewhere? We have a student collaborating with us who has encountered the same error in v4.4.2. Since this discussion was last spring, I wondered whether an official fix has been released.
Thanks!
Mary
 