<?xml version="1.0" encoding="utf-8"?>
<journal>
<title>Signal and Data Processing</title>
<title_fa>پردازش علائم و داده‌ها</title_fa>
<short_title>JSDP</short_title>
<subject>Engineering &amp; Technology</subject>
<web_url>http://jsdp.rcisp.ac.ir</web_url>
<journal_hbi_system_id>1</journal_hbi_system_id>
<journal_hbi_system_user>admin</journal_hbi_system_user>
<journal_id_issn>2538-4201</journal_id_issn>
<journal_id_issn_online>2538-421X</journal_id_issn_online>
<journal_id_pii></journal_id_pii>
<journal_id_doi>10.66224/jsdp</journal_id_doi>
<journal_id_iranmedex></journal_id_iranmedex>
<journal_id_magiran></journal_id_magiran>
<journal_id_sid>1</journal_id_sid>
<journal_id_nlai>8888</journal_id_nlai>
<journal_id_science></journal_id_science>
<language>fa</language>
<pubdate>
	<type>jalali</type>
	<year>1398</year>
	<month>2</month>
	<day>1</day>
</pubdate>
<pubdate>
	<type>gregorian</type>
	<year>2019</year>
	<month>5</month>
	<day>1</day>
</pubdate>
<volume>16</volume>
<number>1</number>
<publish_type>online</publish_type>
<publish_edition>1</publish_edition>
<article_type>fulltext</article_type>
<articleset>
	<article>


	<language>fa</language>
	<article_id_doi></article_id_doi>
	<title_fa>یک روش جدید انتخاب ویژگی یک‌طرفه در دسته‌بندی داده‌های متنی نامتوازن</title_fa>
	<title>A Novel One Sided Feature Selection Method for Imbalanced Text Classification</title>
	<subject_fa>مقالات پردازش متن </subject_fa>
	<subject>Paper</subject>
	<content_type_fa>پژوهشي</content_type_fa>
	<content_type>Research</content_type>
	<abstract_fa>&lt;div style=&quot;text-align: justify;&quot;&gt;&lt;span dir=&quot;RTL&quot;&gt;&lt;span style=&quot;font-family:b nazanin+ regular;&quot;&gt;&lt;span style=&quot;font-size:12.0pt;&quot;&gt;توزیع نامتوازن داده&#8204;ها باعث افت کارایی دسته&#8204;بندها می&#8204;شود. راه&#8204;حل&#8204;های پیشنهاد&#8204;شده برای &lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span dir=&quot;RTL&quot;&gt;&lt;span style=&quot;font-family:b nazanin+ regular;&quot;&gt;&lt;span style=&quot;font-size:12.0pt;&quot;&gt;حل این &lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span dir=&quot;RTL&quot;&gt;&lt;span style=&quot;font-family:b nazanin+ regular;&quot;&gt;&lt;span style=&quot;font-size:12.0pt;&quot;&gt;مشکل به چند دسته تقسیم می&#8204;شوند، که روش&#8204;های مبتنی بر نمونه&#8204;گیری و روش&#8204;های مبتنی بر الگوریتم از مهم&#8204;ترین روش&#8204;ها هستند. انتخاب ویژگی نیز به&#8204;&#8204;عنوان یکی از راه&#8204;حل&#8204;های افزایش کارایی دسته&#8204;بندی داده&#8204;های نامتوازن مورد توجه قرار گرفته است. در این مقاله یک روش جدید انتخاب ویژگی یک&#8204;طرفه برای دسته&#8204;بندی متون نامتوازن ارائه شده است. &lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span dir=&quot;RTL&quot;&gt;&lt;span style=&quot;font-family:b nazanin+ regular;&quot;&gt;&lt;span style=&quot;font-size:12.0pt;&quot;&gt;روش پیشنهادی&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span dir=&quot;RTL&quot;&gt;&lt;span style=&quot;font-family:b nazanin+ regular;&quot;&gt;&lt;span style=&quot;font-size:12.0pt;&quot;&gt; با استفاده از توزیع ویژگی&#8204;ها میزان نشان&#8204;گر&#8204;بودن ویژگی را محاسبه می&#8204;کند. 
به&#8204;منظور مقایسه عملکرد روش پیشنهادی، روش&#8204;های انتخاب ویژگی &lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span dir=&quot;RTL&quot;&gt;&lt;span style=&quot;font-family:b nazanin+ regular;&quot;&gt;&lt;span style=&quot;font-size:12.0pt;&quot;&gt;مختلفی &lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span dir=&quot;RTL&quot;&gt;&lt;span style=&quot;font-family:b nazanin+ regular;&quot;&gt;&lt;span style=&quot;font-size:12.0pt;&quot;&gt;پیاده&#8204;سازی و برای ارزیابی روش پیشنهادی از درخت تصمیم &lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span style=&quot;font-family:times new roman,serif;&quot;&gt;&lt;span style=&quot;font-size:10.0pt;&quot;&gt;C4.5&lt;/span&gt;&lt;/span&gt;&lt;span dir=&quot;RTL&quot;&gt;&lt;span style=&quot;font-family:b nazanin+ regular;&quot;&gt;&lt;span style=&quot;font-size:12.0pt;&quot;&gt; و نایوبیز استفاده شد. نتایج آزمایش&#8204;ها بر روی پیکره&#8204;های &lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span style=&quot;font-family:times new roman,serif;&quot;&gt;&lt;span style=&quot;font-size:10.0pt;&quot;&gt;Reuters-21578&lt;/span&gt;&lt;/span&gt;&lt;span dir=&quot;RTL&quot;&gt;&lt;span style=&quot;font-family:b nazanin+ regular;&quot;&gt;&lt;span style=&quot;font-size:12.0pt;&quot;&gt; و &lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span style=&quot;font-family:times new roman,serif;&quot;&gt;&lt;span style=&quot;font-size:10.0pt;&quot;&gt;WebKB&lt;/span&gt;&lt;/span&gt;&lt;span dir=&quot;RTL&quot;&gt;&lt;span style=&quot;font-family:b nazanin+ regular;&quot;&gt;&lt;span style=&quot;font-size:12.0pt;&quot;&gt; برحسب معیار &lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span style=&quot;font-family:times new roman,serif;&quot;&gt;&lt;span style=&quot;font-size:10.0pt;&quot;&gt;Micro F&lt;/span&gt;&lt;/span&gt;&lt;span dir=&quot;RTL&quot;&gt;&lt;span style=&quot;font-family:b nazanin+ regular;&quot;&gt;&lt;span style=&quot;font-size:12.0pt;&quot;&gt; ، &lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span style=&quot;font-family:times new 
roman,serif;&quot;&gt;&lt;span style=&quot;font-size:10.0pt;&quot;&gt;Macro F&lt;/span&gt;&lt;/span&gt; &lt;span dir=&quot;RTL&quot;&gt;&lt;span style=&quot;font-family:b nazanin+ regular;&quot;&gt;&lt;span style=&quot;font-size:12.0pt;&quot;&gt;و &lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span style=&quot;font-family:times new roman,serif;&quot;&gt;&lt;span style=&quot;font-size:10.0pt;&quot;&gt;G-mean&lt;/span&gt;&lt;/span&gt; &lt;span dir=&quot;RTL&quot;&gt;&lt;span style=&quot;font-family:b nazanin+ regular;&quot;&gt;&lt;span style=&quot;font-size:12.0pt;&quot;&gt;نشان می&#8204;دهد که روش پیشنهادی نسبت به روش&#8204;های دیگر، کارایی دسته&#8204;بندها را به &#8204;اندازه قابل توجهی بهبود بخشیده است.&amp;nbsp;&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/div&gt;
</abstract_fa>
	<abstract>&lt;p class=&quot;AbstractText&quot; style=&quot;text-align: justify;&quot;&gt;&lt;strong&gt;Imbalanced data can be seen in various areas such as text classification, credit card fraud detection, risk management, web page classification, image classification, medical diagnosis/monitoring, and biological data analysis. &lt;/strong&gt;&lt;br&gt;
&lt;strong&gt;Classification algorithms tend to favor the majority class and may even treat minority-class data as outliers. Text data is one of the areas where imbalance occurs. The amount of text information is rapidly increasing in the form of books, reports, and papers. The fast and precise processing of this amount of information requires efficient automatic methods. One of the key processing tools is text classification. Also, one of the problems with text classification is high-dimensional data, which can make learning algorithms impractical. The problem becomes larger when the text data are also imbalanced. An imbalanced data distribution reduces the performance of classifiers. The various solutions proposed for this problem are divided into several categories, among which sampling-based methods and algorithm-based methods are the most important. Feature selection is also considered as one of the solutions to the imbalance problem. In this research, a new one-sided feature selection method is presented for imbalanced data classification. The proposed method calculates the indicator rate of the feature using the feature distribution. &lt;/strong&gt;&lt;br&gt;
&lt;strong&gt;In the proposed method, the documents of a corpus are divided into different groups, based on whether they contain a feature and whether they belong to the positive class. According to this classification, a new method is suggested for feature selection. In the proposed method, the following points are used. &lt;/strong&gt;&lt;/p&gt;

&lt;ol style=&quot;list-style-type:lower-alpha;&quot;&gt;
	&lt;li&gt;&lt;strong&gt;If a feature occurs in most positive-class documents, this feature is a good indicator of the positive class; therefore, this feature should have a high score for this class. This can be expressed as the proportion of positive-class documents that contain this feature. Besides, if most of the documents containing this feature belong to the positive class, the feature should receive a high score as an indicator of the class. This can be expressed as the proportion of documents containing the feature that belong to the positive class. &lt;/strong&gt;&lt;/li&gt;
	&lt;li&gt;&lt;strong&gt;If most of the documents that do not contain a feature are not in the positive-class, a high score should be considered for this feature as the representative of this class. Moreover, if most of the documents that are not in the positive class do not contain this feature, a high score should be considered for this feature. &lt;/strong&gt;&lt;/li&gt;
&lt;/ol&gt;

&lt;p class=&quot;AbstractText&quot; style=&quot;text-align: justify;&quot;&gt;&lt;strong&gt;Using the proposed method, a score is assigned to each feature. Finally, the features are sorted in descending order of score, and the required number of features is selected from the top of the feature list. &lt;/strong&gt;&lt;br&gt;
&lt;strong&gt;In order to evaluate the performance of the proposed method, different feature selection methods such as Gini, DFS, MI and FAST were implemented. To assess the proposed method, the C4.5 decision tree and Naive Bayes classifiers were used. The results of tests on the Reuters-21578 and WebKB corpora, measured by the Micro F, Macro F and G-mean criteria, show that the proposed method improves the efficiency of the classifiers considerably compared with the other methods.&lt;/strong&gt;&lt;br&gt;
&amp;nbsp;&lt;/p&gt;
</abstract>
	<keyword_fa>انتخاب ویژگی, روش پالایه, داده‌های نامتوازن, دسته‌بندی متون</keyword_fa>
	<keyword>Feature selection, Imbalanced class, High dimensionality, Text classification</keyword>
	<start_page>21</start_page>
	<end_page>40</end_page>
	<web_url>http://jsdp.rcisp.ac.ir/browse.php?a_code=A-10-652-2&amp;slc_lang=fa&amp;sid=1</web_url>


<author_list>
	<author>
	<first_name>Jafar</first_name>
	<middle_name></middle_name>
	<last_name>Pouramini</last_name>
	<suffix></suffix>
	<first_name_fa>جعفر</first_name_fa>
	<middle_name_fa></middle_name_fa>
	<last_name_fa>پورامینی</last_name_fa>
	<suffix_fa></suffix_fa>
	<email>j_pouramini@pnu.ac.ir</email>
	<code>10031947532846007496</code>
	<orcid>10031947532846007496</orcid>
	<coreauthor>Yes</coreauthor>
	<affiliation>Department of Computer &amp; Information Technology Engineering, Faculty of Engineering, University of Qom, Qom, Iran</affiliation>
	<affiliation_fa>گروه مهندسی فناوری اطلاعات، دانشکده فنی و مهندسی، دانشگاه پیام نور تهران</affiliation_fa>
	 </author>


	<author>
	<first_name>Behrouze</first_name>
	<middle_name></middle_name>
	<last_name>Minaei-Bidgoli</last_name>
	<suffix></suffix>
	<first_name_fa>بهروز</first_name_fa>
	<middle_name_fa></middle_name_fa>
	<last_name_fa>مینایی بیدگلی</last_name_fa>
	<suffix_fa></suffix_fa>
	<email>b_minaei@iust.ac.ir</email>
	<code>10031947532846007497</code>
	<orcid>10031947532846007497</orcid>
	<coreauthor>No</coreauthor>
	<affiliation>Faculty of Computer Engineering, Iran University of Science and Technology</affiliation>
	<affiliation_fa>دانشکده مهندسی کامپیوتر، دانشگاه علم و صنعت ایران</affiliation_fa>
	 </author>


	<author>
	<first_name>Mahdi</first_name>
	<middle_name></middle_name>
	<last_name>Esmaeili</last_name>
	<suffix></suffix>
	<first_name_fa>مهدی</first_name_fa>
	<middle_name_fa></middle_name_fa>
	<last_name_fa>اسماعیلی</last_name_fa>
	<suffix_fa></suffix_fa>
	<email>m.esmaeili@iaukashan.ac.ir</email>
	<code>10031947532846007498</code>
	<orcid>10031947532846007498</orcid>
	<coreauthor>No</coreauthor>
	<affiliation>Faculty of Computer Engineering, Kashan Islamic Azad University</affiliation>
	<affiliation_fa>دانشکده مهندسی کامپیوتر، دانشگاه آزاد اسلامی واحد کاشان</affiliation_fa>
	 </author>


</author_list>


	</article>
</articleset>
</journal>
